# Preprocess the sentence data and extract conversational data

In [1]:
import pandas as pd
import xlrd as xl
from pandas import ExcelWriter
from pandas import ExcelFile
import pickle
import re
import json
import os
import datetime, time

import pprint
# Shared pretty-printer (4-space indent) used to display nested structures such as the sheet-name list.
pp = pprint.PrettyPrinter(indent=4)

In [2]:
from string import punctuation
from nltk.corpus import stopwords

### Tokens ignored when counting words: every ASCII punctuation character plus
### a handful of contraction fragments. NLTK's English stopword list is not added here.
stop_words = [
    *punctuation,
    "'s",
    "'m",
    "n't",
    "'re",
    "-",
    "'ll",
    '...',
]

#### Code to preprocess the data in the text:

Preprocessing techniques:
1. Replace code snippets (enclosed in '\`' or '\`\`\`') with the token 'CODE'
2. Replace URLs that are enclosed in brackets or parentheses with the token 'URL'
3. Replace quoted references (i.e. lines beginning with '>' or e-mail reply headers matching the defined regexes) with the token 'REFERENCE'

In [3]:
def preprocess(line):
    """Replace code, quoted references and bracketed URLs with placeholder tokens.

    Args:
        line: Raw text of one annotated quotation (may span several lines).

    Returns:
        The text with
          * fenced (```...```) and inline (`...`) code replaced by ``CODE``,
          * lines starting with ``>`` and e-mail style "On ... wrote:" headers
            replaced by ``REFERENCE`` (consecutive references are merged into
            a single ``\\nREFERENCE``),
          * URLs enclosed in ``[...]`` or ``(...)`` replaced by `` URL ``.
    """
    ## Replace all code blocks with the token CODE.
    ## A fenced block runs up to the closing fence, or to the end of the text
    ## when it is never closed. DOTALL lets the block span several lines and
    ## contain single backticks; MULTILINE is deliberately not used because it
    ## would let the block end at the first line break.
    line = re.sub(r'```.*?(?:```|\Z)', 'CODE', line, flags=re.DOTALL)
    ## Inline code: the negated class also matches newlines, so no flags are needed.
    line = re.sub(r'`[^`]*?`', 'CODE', line)

    ## Quoted lines: '>' at the very start of the text or right after a newline.
    line = re.sub(r'(^|\n)>.*', '\nREFERENCE', line)
    ## E-mail reply headers, e.g. "On 12 March 2017 at 10:15, First Last <mail> wrote:"
    line = re.sub(r'On [\d]+ \w+ [\d]+ at [\d]+:[\d]+, \w+ \w+ <.*?> wrote:', 'REFERENCE', line)
    ## ... and "On Mon, Mar 12, 2017, 10:15 AM First Last ... wrote:"
    line = re.sub(r'On \w+, \w+ \d+, \d+, \d+:\d+ (A|P)M \w+ \w+ .*? wrote:', 'REFERENCE', line)
    ## Collapse runs of REFERENCE tokens (and the whitespace before them) into one.
    line = re.sub(r'([\n\t\s]*?REFERENCE)+', '\nREFERENCE', line)

    ## Replace URLs which are surrounded by brackets
    line = re.sub(
        r'[\[\(]http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+[\]\)]',
        ' URL ',
        line,
    )
    return line

#### Read from Annotated Data File:

The file is an excel workbook with each sheet containing the sentences in the discussion thread of one issue. It was generated from the Atlas annotation tool.

In order to capture conversational features, it was necessary to be able to link the comment sentences back to the information present in the GitHub API response. To do so, the time and author of each comment were annotated as METADATA.

In [4]:
# Open the annotated workbook; each sheet holds the quotations of one issue thread.
xl_file = pd.ExcelFile("../data/annotated_data_with_metadata.xlsx")
document_count = len(xl_file.sheet_names)
print("Total number of documents: " + str(document_count))
pp.pprint(xl_file.sheet_names)

Total number of documents: 15
[   '1 37_tensorflow.doc',
    '2 1122_tensorflow.doc',
    '3 197_spaCy.doc',
    '4 285_spaCy.doc',
    '5 429_spaCy.doc',
    '6 1585_scikit-learn.doc',
    '7 2889_scikit-learn.doc',
    '8 10521_scikit-learn.doc',
    '9 15_spaCy.doc',
    '10 7951_tensorflow.doc',
    '11 9393_scikit-learn.doc',
    '12 8191_tensorflow.doc',
    '13 125_spaCy.doc',
    '14 6665_scikit-learn.doc',
    '15 15604_tensorflow.doc']


#### Function helpers to get some of the conversational features:

In [5]:
import spacy
from spacy.lang.en import English

# Loaded spaCy model registered under the name 'en'.
eng = spacy.load('en')
# English language object; called on a sentence to split it into tokens for word counting.
parser = English()

def str_to_datetime(date_string):
    """Parse a timestamp of the form ``YYYY-MM-DDTHH:MM:SSZ`` into a naive datetime.

    The trailing ``Z`` is matched literally. ``datetime.strptime`` raises
    ``ValueError`` when the string does not follow this layout.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
    parsed = datetime.datetime.strptime(date_string, timestamp_format)
    return parsed

def time_between(date1, date2):
    """Return the span from ``date1`` to ``date2``, i.e. ``date2 - date1``.

    Works for any pair of operands that support subtraction (datetimes give a
    timedelta); the result is negative when ``date2`` precedes ``date1``.
    """
    elapsed = date2 - date1
    return elapsed

def word_count(sentence):
    """Count the tokens of ``sentence`` that are not listed in ``stop_words``.

    The sentence is tokenised with the module-level ``parser``; each token is
    compared by its string form.
    """
    token_texts = (str(token) for token in parser(sentence))
    return sum(1 for text in token_texts if text not in stop_words)

def longest_sentence_length(sentence_list):
    """Return the largest ``word_count`` among the given sentences.

    Raises ``ValueError`` (from ``max``) when ``sentence_list`` is empty.
    """
    lengths = [word_count(text) for text in sentence_list]
    longest = max(lengths)
    return longest

#### Preprocess sentence content and extract conversational data:

This cell also reads the file containing the comments obtained in JSON format from the GitHub API for the issue. This is to get meta-information such as the created_at time of the comment, the author, etc. Matching of the annotated sentence to the JSON response from which it was extracted was done using the timestamp in the METADATA content as mentioned above. Matching text was not recommended due to formatting issues as well as duplicate sentences. Matching authors was also not possible due to the occasional change in usernames of a single user. Hence, the created_at timestamp of the comment is used.

The comment data is stored in a pandas dataframe with each row containing the Text Content and Conversational features.

It stores the dictionary of format `{filename: pandas comments dataframe}` in the file *data_by_document.pkl*.

It stores all the data in the file *all_data.pkl*.

In [6]:
### Build, for every annotated issue thread, a dataframe with one row per
### singly-labelled sentence holding the preprocessed text, its label and the
### conversational features, then pickle the per-document dict and the
### concatenation of all documents.

### Column layout shared by every per-document frame and by the combined frame
feature_columns = ['Document','Text Content','Code','Full Length','len','tloc','cloc','tpos1','tpos2','clen','tlen','ppau','npau','aa','begauth','has_code','first_turn','last_turn']

data_by_document = {}
metadata_by_document = {}
all_data = pd.DataFrame(columns=feature_columns)
document_frames = []
total_length = 0

print("Number of single-coded quotations per document: ")
process_start_time = time.time()
for sheet_name in xl_file.sheet_names:
    sheet_start_time = time.time()
    print('-----'+sheet_name+'-----')
    ### Sheet names look like '<n> <issue number>_<repo name>.doc'
    issue_number, repo_name = sheet_name.split(' ')[1].split('_')
    repo_name = repo_name.split('.')[0]
    folder_name = repo_name+'_'+repo_name
    if repo_name == 'spaCy':
        folder_name = 'explosion_spaCy'
    folder_path = os.path.join('../data/chosen_issues',folder_name)

    ### Load the json file containing json api responses for each issue
    with open(os.path.join(folder_path,'issue'+issue_number+'.txt')) as f:
        json_data = json.load(f)

    quotation_data = xl_file.parse(sheet_name)
    document_quotations_and_codes = pd.DataFrame(columns=feature_columns)
    metadata_document_quotations_and_codes = pd.DataFrame(columns=['Text Content','Code','Full Length','len','location'])

    ### Extract and preprocess all sentences
    for index, row in quotation_data.iterrows():
        ### Currently ignoring multi-labelled sentences and labels with < 50 data points
        if int(row['Number of Codes']) == 1 and row['Codes'] not in ['Future Plan','Content Management','Testing-Related']:
            pp_text = preprocess(row['Text Content'])
            ### 'Location' looks like 'location=<int> length=<int>'; only the location is needed
            location, _orig_length = row['Location'].split(' ')
            location = int(location.replace('location=',''))
            metadata_document_quotations_and_codes.loc[len(metadata_document_quotations_and_codes)] = [pp_text,row['Codes'],len(row['Text Content']),len(pp_text), location]
    metadata_by_document[sheet_name] = metadata_document_quotations_and_codes

    ### Sort all sentences by location in file because atlas.ti does not do this
    metadata_document_quotations_and_codes.sort_values(by=['location'],inplace=True)

    ### If no sentences found between two metadata tags - IGNORE the previous metadata
    ### After sorting, the index labels still reflect insertion order, so the
    ### previous row must be remembered by its label rather than assumed to be index-1.
    prev_code = ''
    prev_index = None
    for index, row in metadata_document_quotations_and_codes.iterrows():
        if prev_code == 'METADATA' and row['Code'] == 'METADATA':
            metadata_document_quotations_and_codes.loc[prev_index,'Code'] = 'N/A - METADATA'
        prev_code = row['Code']
        prev_index = index

    first_post_flag = 0
    cloc = 0        # running sentence count over the whole conversation
    tloc = 0        # running sentence count inside the current comment
    prev_tloc = 0
    new_comment = 0

    ### Calculate all conversational features
    curr_post_time = ''
    beg_comment = 0         # 1 until the first sentence of the current comment is stored
    comment_location = 0    # position of the current comment in the json api response
    for index, row in metadata_document_quotations_and_codes.iterrows():
        if row['Code'] == 'METADATA':
            beg_comment = 1
            ### Rows of the comment that just ended: the last `tloc` rows stored so far
            prev_comment_count = tloc
            prev_comm_start = len(document_quotations_and_codes) - (prev_comment_count)
            prev_comm_end = len(document_quotations_and_codes) - 1
            if cloc!=0:
                ### Turn the raw per-comment counters of the finished comment into relative values
                document_quotations_and_codes.loc[prev_comm_start:prev_comm_end,'tloc'] /= prev_comment_count
                longest_sent_length_in_prev_comment = longest_sentence_length(document_quotations_and_codes.loc[prev_comm_start:prev_comm_end]['Text Content'].values)
                document_quotations_and_codes.loc[prev_comm_start:prev_comm_end,'clen'] /= longest_sent_length_in_prev_comment
            prev_tloc = tloc
            tloc = 0
            most_recent_metadata = row['Text Content']
            ### METADATA text is '<created_at timestamp> <author login>'
            prev_post_time = curr_post_time if curr_post_time else str_to_datetime(row['Text Content'].split(' ')[0])
            curr_post_time, curr_poster = row['Text Content'].split(' ')
            curr_post_time = str_to_datetime(curr_post_time)
            if first_post_flag == 0:
                first_post_time, first_poster = curr_post_time, curr_poster
            first_post_flag += 1
            json_comment = json_data[comment_location]
            ## Sanity Check to match metadata from atlas.ti with github API response
            if str_to_datetime(json_comment['created_at']) != curr_post_time:
                print('Incorrect: '+str(comment_location)+' '+json_comment['created_at']+' '+str(curr_post_time))
            elif json_comment['user']['login'] != curr_poster:
                print('Different Name: '+str(comment_location)+' '+json_comment['user']['login']+' '+str(curr_poster))
            comment_location+=1

        elif row['Code'] == 'N/A - METADATA':
            ### Comment without usable sentences: only keep the json position in step
            comment_location+=1
        else:
            if beg_comment == 1:
                tloc = 1
            else:
                tloc += 1
            first_turn = True if first_post_flag == 1 else False
            cloc += 1
            ### Minutes since the first post; divided by the thread duration after the loop
            tpos1 = time_between(first_post_time,curr_post_time).total_seconds()/60
            ### Stored as a timestamp for now; converted to minutes before the last post after the loop
            tpos2 = curr_post_time
            ### Minutes since the previous comment
            ppau = time_between(prev_post_time, curr_post_time).total_seconds()/60
            begauth = True if curr_poster == first_poster else False
            aa = json_comment['author_association']
            num_words = word_count(row['Text Content'])
            clen = num_words
            tlen = num_words
            ### 'Text Content' is already preprocessed here, so code shows up as the CODE token
            has_code = True if 'CODE' in row['Text Content'] else False
            document_quotations_and_codes.loc[len(document_quotations_and_codes),['Text Content','Code','Full Length','len','tloc','cloc','tpos1','tpos2','clen','tlen','ppau','aa','begauth','has_code','first_turn','last_turn']] = [row['Text Content'], row['Code'], row['Full Length'], row['len'],tloc,cloc,tpos1,tpos2,clen,tlen, ppau, aa,begauth,has_code,first_turn,False]
            ### The pause before this comment is the pause after the previous one
            if cloc != 1 and beg_comment == 1: document_quotations_and_codes.loc[prev_comm_start:prev_comm_end,'npau'] = ppau
            if beg_comment == 1: beg_comment = 0

    ### The loop only normalises a comment once the NEXT metadata row is seen, so the
    ### final comment still holds raw counters: normalise its tloc and clen here.
    last_comm_start = prev_comm_end + 1
    last_comm_end = len(document_quotations_and_codes) - 1
    if tloc != 0:
        document_quotations_and_codes.loc[last_comm_start:last_comm_end,'tloc'] /= tloc
        longest_sent_length_in_last_comment = longest_sentence_length(document_quotations_and_codes.loc[last_comm_start:last_comm_end]['Text Content'].values)
        document_quotations_and_codes.loc[last_comm_start:last_comm_end,'clen'] /= longest_sent_length_in_last_comment

    ### Rows after the last processed METADATA belong to the final comment
    document_quotations_and_codes.loc[prev_comm_end+1:len(document_quotations_and_codes)+1,'last_turn'] = True
    document_quotations_and_codes.loc[prev_comm_end+1:len(document_quotations_and_codes)+1,'npau'] = time_between(curr_post_time, curr_post_time).total_seconds()/60
    last_post_time = curr_post_time
    total_time = time_between(first_post_time, last_post_time).total_seconds()/60
    num_sentences = len(document_quotations_and_codes)
    longest_sent_length = longest_sentence_length(document_quotations_and_codes['Text Content'].values)
    ### Scale the conversation-level features by their per-document totals / maxima
    document_quotations_and_codes.tlen/=longest_sent_length
    document_quotations_and_codes.tpos2=time_between(document_quotations_and_codes.tpos2,last_post_time).dt.total_seconds()/60
    document_quotations_and_codes.tpos1/=total_time
    document_quotations_and_codes.tpos2/=total_time
    document_quotations_and_codes.cloc/=num_sentences
    document_quotations_and_codes.ppau/=document_quotations_and_codes.ppau.max()
    document_quotations_and_codes.npau/=document_quotations_and_codes.npau.max()
    document_quotations_and_codes.Document = sheet_name
    data_by_document[sheet_name] = document_quotations_and_codes

    print("Total number of sentences: "+str(num_sentences))
    print("Time taken: "+str(datetime.timedelta(seconds=(time.time()-sheet_start_time))))

    total_length += len(document_quotations_and_codes)
    document_frames.append(document_quotations_and_codes)

### DataFrame.append was removed in pandas 2.0: concatenate all documents once instead.
### Row labels restart at 0 for every document, exactly as with repeated append.
if document_frames:
    all_data = pd.concat(document_frames)

print('\nTotal size of singly-labelled data: '+str(total_length))
print('Total size of singly-labelled data: '+str(len(all_data)))
print("Total Time taken: "+str(datetime.timedelta(seconds=(time.time()-process_start_time))))

# Save preprocessed text in pickle file
with open('../data/data_by_document.pkl', 'wb') as handle:
    pickle.dump(data_by_document, handle)

all_data.to_pickle('../data/all_data.pkl')

Number of single-coded quotations per document: 
-----1 37_tensorflow.doc-----
Different Name: 54 andrewjaykeller aj-ptw
Different Name: 126 aakreidler Tim15
Different Name: 216 madnai DecafManiac
Total number of sentences: 436
Time taken: 0:00:03.836733
-----2 1122_tensorflow.doc-----
Total number of sentences: 431
Time taken: 0:06:37.870672
-----3 197_spaCy.doc-----
Total number of sentences: 136
Time taken: 0:00:00.893212
-----4 285_spaCy.doc-----
Total number of sentences: 220
Time taken: 0:00:01.335591
-----5 429_spaCy.doc-----
Total number of sentences: 86
Time taken: 0:00:25.572686
-----6 1585_scikit-learn.doc-----
Total number of sentences: 271
Time taken: 0:00:02.121975
-----7 2889_scikit-learn.doc-----
Total number of sentences: 329
Time taken: 0:00:02.599940
-----8 10521_scikit-learn.doc-----
Total number of sentences: 250
Time taken: 0:00:01.941520
-----9 15_spaCy.doc-----
Total number of sentences: 108
Time taken: 0:00:00.906870
-----10 7951_tensorflow.doc-----
Different N

### Sample Values:

In [7]:
# Show the first feature rows and the first six sentences of one spaCy thread.
spacy_429_frame = data_by_document['5 429_spaCy.doc']
print(spacy_429_frame.head(3))
print("-----------------------------------------------")
print(spacy_429_frame['Text Content'].head(6).values)

          Document                                       Text Content  \
0  5 429_spaCy.doc               pipe(): ValueError Error parsing doc   
1  5 429_spaCy.doc  I found strange behaviour using the CODE metho...   
2  5 429_spaCy.doc  If you parse a document using CODE you can get...   

                     Code Full Length  len      tloc       cloc tpos1  tpos2  \
0  Observed Bug Behaviour          36   36  0.111111  0.0116279     0    1.0   
1  Observed Bug Behaviour          86   82  0.222222  0.0232558     0    1.0   
2  Observed Bug Behaviour         111  100  0.333333  0.0348837     0    1.0   

   clen       tlen ppau        npau    aa begauth has_code first_turn  \
0  0.25  0.0526316    0  0.00859827  NONE    True    False       True   
1  0.65   0.136842    0  0.00859827  NONE    True     True       True   
2     1   0.210526    0  0.00859827  NONE    True     True       True   

  last_turn  
0     False  
1     False  
2     False  
-------------------------------------

In [8]:
# First six preprocessed sentences of scikit-learn issue 10521.
print(data_by_document['8 10521_scikit-learn.doc']['Text Content'].head(6).values)

['Rethinking the CategoricalEncoder API ?'
 'Based on some discussions we are having here and issues that are opened, we are having some doubts that CODE  URL  was the good choice of name (and since it is not released yet we have some room for change).'
 'So summary of how it is now:'
 '-         The class name CODE says what type of data it accepts (categorical data)'
 '-         The keyword argument CODE specifies *how* to encode those data'
 'But what to do in the following cases:']


In [9]:
# First six preprocessed sentences of tensorflow issue 7951.
print(data_by_document['10 7951_tensorflow.doc']['Text Content'].head(6).values)

["[Enhancement] Redesigning TensorFlow's input pipelines"
 "[**TL;DR:** We're designing a new input pipeline API for TensorFlow, and we'd like to collect your feature requests on this issue.]"
 "We've noticed that one of the biggest challenges in getting started with TensorFlow is how to load your own data into your programs."
 'While TensorFlow has several methods that can be used to build complex input pipelines (such as [CODE] URL , [CODE] URL , etc.), they were designed for a particular use case (processing a static set of files repeatedly), and the average user experience with these methods is not great.'
 'For example:'
 '*         Once you reach the end of a pipeline, it becomes closed and you can never use it again in the same session.']


In [10]:
# First six preprocessed sentences of scikit-learn issue 9393.
print(data_by_document['11 9393_scikit-learn.doc']['Text Content'].head(6).values)

['Debian test failures (was test_preserve_trustworthiness_approximately fails on 32bit: AssertionError: 0.89166666666666661 not greater than 0.9)'
 'building 0.19b2 on debian/ubuntus ...'
 'still ongoing but I see consistent failure on Debian stretch (nd90, current stable) and testing (nd100), 32bit only (ok on amd64 build):CODE'
 'in both cases python-numpy is CODE (i.e. 1.12.1 numpy) and passed ok with numpy 1.8.2 in Debian jessie.'
 'ping @ogrisel?'
 "Interesting, it's a only on a combo of numpy 1.12.1 and 32 bit python..."]
