# Test file to check if the environment is properly configured

In [1]:
import numpy as np
import pandas as pd
import xlrd as xl
from pandas import ExcelWriter
from pandas import ExcelFile
import pickle, re, json, os, datetime, time

import pprint
# Shared pretty-printer for later cells; wrapped structures are indented by 4 spaces.
pp = pprint.PrettyPrinter(4)

print('All libraries needed for data preprocessing successfully imported.')

All libraries needed for data preprocessing successfully imported.


In [2]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from operator import itemgetter
from functools import partial, update_wrapper
from openpyxl import load_workbook
from copy import deepcopy

from sklearn.externals.joblib import Parallel, delayed
from sklearn.pipeline import FeatureUnion, _fit_transform_one, _transform_one
from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from imblearn.over_sampling import SMOTE

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from imblearn.pipeline import Pipeline as Imb_Pipeline

from sklearn.preprocessing import FunctionTransformer

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score, make_scorer, confusion_matrix

## Ignore warnings
import warnings
def warn(*args, **kwargs):
    pass
warnings.warn = warn
np.warnings.filterwarnings('ignore')

print('All libraries needed for knowledge type prediction successfully imported.')

All libraries needed for knowledge type prediction successfully imported.


### If the outputs of the top 2 cells say:

`All libraries needed for data preprocessing successfully imported.`

### and

`All libraries needed for knowledge type prediction successfully imported.`

### Congratulations! Your environment is set up and sufficient to perform the experiments in this work!

# Let's take a look at the data provided in this repository

*all_data.pkl* contains the result of executing the cells in *preprocess_data.ipynb*. For each sentence in the corpus it holds the text content, the annotated knowledge type (the *Code* column), and metadata covering the *Participants*, *Length*, *Structure*, *Temporal* and *Code* aspects described in the paper accompanying this work.

In [3]:
## Load the preprocessed corpus from its pickle file and report how many entries it holds
all_data = pd.read_pickle('../data/all_data.pkl')
print("Size of corpus: ", len(all_data), sep="")

Size of corpus: 4330


In [4]:
## Collect the distinct knowledge types (values of the "Code" column) in sorted order

knowledge_types = sorted(set(all_data['Code']))
print("Number of unique knowledge types: ", len(knowledge_types), sep="")

pp.pprint(knowledge_types)

Number of unique knowledge types: 13
[   'Action on Issue',
    'Bug Reproduction',
    'Contribution and Commitment',
    'Expected Behaviour',
    'Investigation and Exploration',
    'Motivation',
    'Observed Bug Behaviour',
    'Potential New Issues and Requests',
    'Social Conversation',
    'Solution Discussion',
    'Task Progress',
    'Usage',
    'Workarounds']


In [5]:
# Count the sentences that come from each project. A sentence is attributed to
# a project when the project name occurs in its "Document" value.
projects = ['tensorflow','scikit-learn','spaCy']
for proj in projects:
    # regex=False: treat the project name as a literal substring, not a pattern.
    matches = all_data[all_data.Document.str.contains(proj, regex=False)]
    # The space before "issues" keeps the project name and the word apart.
    print("Number of sentences from "+proj+" issues: "+str(len(matches)))

Number of sentences from tensorflowissues: 2100
Number of sentences from scikit-learnissues: 1401
Number of sentences from spaCyissues: 829


In [6]:
# Preview the first three rows of the corpus
all_data.head(3)

Unnamed: 0,Document,Text Content,Code,Full Length,len,tloc,cloc,tpos1,tpos2,clen,tlen,ppau,npau,aa,begauth,has_code,first_turn,last_turn
0,1 37_tensorflow.doc,Node.js (JavaScript) Wrapper API,Expected Behaviour,32,32,0.5,0.00229358,0.0,1.0,1,0.0555556,0.0,0.000464767,NONE,True,False,True,False
1,1 37_tensorflow.doc,Because JavaScript is Awesome,Motivation,29,29,1.0,0.00458716,0.0,1.0,1,0.0555556,0.0,0.000464767,NONE,True,False,True,False
2,1 37_tensorflow.doc,+1!,Social Conversation,3,3,1.0,0.00688073,2.58297e-05,0.999974,1,0.0138889,0.000464767,0.000916033,NONE,False,False,False,False


### Let's now do a transformation on the data:

1. Drop *Full Length*
2. Convert *begauth*, which contains the values `True` and `False`, to a one-hot encoding
3. Convert the time-based feature *tpos2* to a numeric field.

We'll store this converted data into a file called *just_testing.pkl*

In [7]:
## Drop "Full Length" by selecting only the columns to keep
just_testing_data = all_data[['Document','Text Content','Code','len','tloc','cloc','tpos1','tpos2','clen','tlen','ppau','npau','aa','begauth','has_code','first_turn','last_turn']]

# Convert "begauth" which contains values `True` and `False` to One Hot Encoding
just_testing_data = pd.get_dummies(just_testing_data,columns = ['begauth'])

# Convert the time-based feature "tpos2" to a numeric field.
# The conversion returns a new Series, so it has to be assigned back to take
# effect. pd.to_numeric is used instead of an int cast because "tpos2" holds
# fractional values (e.g. 0.999974) that an int cast would truncate to 0.
just_testing_data['tpos2'] = pd.to_numeric(just_testing_data['tpos2'])

print('Done')

Done


In [8]:
# Preview the first three rows of the transformed data
just_testing_data.head(3)

Unnamed: 0,Document,Text Content,Code,len,tloc,cloc,tpos1,tpos2,clen,tlen,ppau,npau,aa,has_code,first_turn,last_turn,begauth_False,begauth_True
0,1 37_tensorflow.doc,Node.js (JavaScript) Wrapper API,Expected Behaviour,32,0.5,0.00229358,0.0,1.0,1,0.0555556,0.0,0.000464767,NONE,False,True,False,0,1
1,1 37_tensorflow.doc,Because JavaScript is Awesome,Motivation,29,1.0,0.00458716,0.0,1.0,1,0.0555556,0.0,0.000464767,NONE,False,True,False,0,1
2,1 37_tensorflow.doc,+1!,Social Conversation,3,1.0,0.00688073,2.58297e-05,0.999974,1,0.0138889,0.000464767,0.000916033,NONE,False,False,False,1,0


Notice that the field *Full Length* no longer exists and the field *begauth* has now been changed to *begauth_False* and *begauth_True*.

In [9]:
# Persist the transformed frame as a pickle file in the data directory
just_testing_data.to_pickle(path='../data/just_testing.pkl')

# Predicting knowledge type of a sentence - a basic method

#### Building a Logistic Regression model to predict the knowledge type of a sentence based on textual content:
- The set of sentences is split into 2 parts - 80% is used for training and 20% is used for testing the prediction model.
- The text content of the sentences is transformed into a vector format of the frequencies of words.
- Logistic Regression is used to train the model on the training data.
- The model then predicts on the test data, and the precision, recall and f1-score of the predictions with respect to the actual annotated knowledge types are displayed in a table.

In [10]:
# Inputs are the raw sentence texts; targets are their annotated knowledge types.
y = all_data['Code']
X = all_data['Text Content']

# Shuffled 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=10,
)

# Word-count features: the vocabulary is fitted on the training sentences only
# and then reused to encode the test sentences.
vectorizer = CountVectorizer()
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(X_test)

# fit() returns the estimator it was called on, so `model` and `clf` refer to
# the same fitted classifier.
clf = LogisticRegression()
model = clf.fit(X_train_tf, y_train)
y_pred = model.predict(X_test_tf)

# Per-class precision / recall / f1 on the held-out sentences.
print(classification_report(y_true=y_test, y_pred=y_pred))

                                   precision    recall  f1-score   support

                  Action on Issue       0.67      0.29      0.40         7
                 Bug Reproduction       0.46      0.33      0.38        55
      Contribution and Commitment       0.50      0.24      0.32        17
               Expected Behaviour       0.33      0.06      0.10        18
    Investigation and Exploration       0.34      0.23      0.28        65
                       Motivation       0.42      0.18      0.25        61
           Observed Bug Behaviour       0.42      0.16      0.23        31
Potential New Issues and Requests       0.40      0.20      0.27        49
              Social Conversation       0.56      0.82      0.67       172
              Solution Discussion       0.56      0.74      0.64       291
                    Task Progress       0.33      0.19      0.24        21
                            Usage       0.48      0.45      0.47        64
                      Wo

These results may seem okay, but this is just one scenario of a train-test split. There is no guarantee that the classifier would perform the same way if the split were made differently, producing a different set of training and testing sentences. But that's okay. This is just to make sure your environment works.

### You can now continue to the main experiments in this paper. The _**README**_ will give you a better understanding of this repository's structure and how to proceed.