# File for extracting only pull requests of active users

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt 

In [2]:
import scipy.stats as stats
from collections import defaultdict
from scipy.stats.stats import pearsonr

In [3]:
import csv

# Pre-processing

### Pull requests comments dataset

In [4]:
pr_comments_original = pd.read_csv('pull_request_comments.csv', header=None, on_bad_lines='skip', nrows=1000, usecols = [1,4]) 

In [8]:
# columns name as indicated in the GHTorrent dataset
pr_comments_original.columns =['user_id' , 'body']


In [9]:
pr_comments_original.head() 

Unnamed: 0,user_id,body
0,66821,I think auto-submitting to cause a full page r...
1,67072,I agree with Brad; either the *complete* exper...
2,2065,Added test to ensure single class is passed. M...
3,3195768,
4,10578,"<meta charset=\utf-8\"" />"""


In [10]:

pr_comments_original.sort_values(by=['user_id']).head()

Unnamed: 0,user_id,body
72,but you could say,
895,"level2\""",
977,please use the singular form. Only one note p...,
423,stroke: BLUE }`\,
529,1000343,


In [11]:
pr_comments = pr_comments_original.copy()
#all the changes are done on this copy of the dataset

In [12]:
pr_comments.isna().sum()

user_id    43
body       65
dtype: int64

In [13]:
pr_comments.sort_values(by=['user_id']).head()

Unnamed: 0,user_id,body
72,but you could say,
895,"level2\""",
977,please use the singular form. Only one note p...,
423,stroke: BLUE }`\,
529,1000343,


In [14]:
pr_comments.shape

(1000, 2)

In [15]:
pr_comments.dtypes

user_id    object
body       object
dtype: object

In [16]:
#Convert user_id from object to float and then to integer
pr_comments['user_id']=pd.to_numeric(pr_comments['user_id'], errors='coerce', downcast='integer')

In [17]:
pr_comments.dtypes

user_id    float64
body        object
dtype: object

In [18]:
# Drop missing values because either if user or body is missing, the record can't be used
pr_comments.dropna(inplace=True)

In [20]:
pr_comments['user_id'] = pr_comments['user_id'].astype(int)

In [21]:
pr_comments.dtypes

user_id     int32
body       object
dtype: object

In [22]:
pr_comments.shape

(935, 2)

In [25]:
# check for only string values in the body (the user_id information can be shifted)
pr_comments = pr_comments[pr_comments['body'].str.contains('[A-Za-z]')]

In [26]:
pr_comments.shape

(930, 2)

### Merging dataset: we are only interested in analysing pull requests of active users

In [37]:
# we use the information of user_id to match with the pr comments
active_user_login = pd.read_csv('active_users_login.csv', on_bad_lines='skip') 

In [38]:
active_user_login.head()

Unnamed: 0.1,Unnamed: 0,user_id,login
0,0,2,jmettraux
1,1,5,weppos
2,2,6,anb
3,3,10,mcollina
4,4,24,jswhit


In [39]:
active_user_login.drop(columns = 'Unnamed: 0', axis=1, inplace=True)

In [40]:
active_user_login.sort_values(by='user_id').head()

Unnamed: 0,user_id,login
0,2,jmettraux
1,5,weppos
2,6,anb
3,10,mcollina
4,24,jswhit


In [36]:
pr_comments.sort_values(by=['user_id']).head(14)

Unnamed: 0,user_id,body
189,217,only thing that's needed here is passthrough(x...
209,617,This line doesn't do anything?!1
206,617,This assumes `navigator.id.getVerifiedEmail` w...
222,617,"You don't actually use `i`, or `arr` here, I d..."
208,617,Nit: should use `jQuery.Deferred` instead of `...
211,617,SuperNit: space after `)`:\\n\\n```js\\nvar Ba...
217,617,Not sure this should throw. That means nothing...
622,707,Also don't know how this happened.
626,707,./tinyformat.h:557:31: warning: implicit conve...
627,707,C++11 allows you to specify the underlying typ...


In [41]:
# merge between pr_comments and active user login
inner_merged = pd.merge(pr_comments, active_user_login, on=['user_id'])

In [42]:
inner_merged.sort_values(by=['user_id']).head(16)

Unnamed: 0,user_id,body,login
121,217,only thing that's needed here is passthrough(x...,lloyd
126,617,Not sure this should throw. That means nothing...,brianloveswords
125,617,SuperNit: space after `)`:\\n\\n```js\\nvar Ba...,brianloveswords
124,617,This line doesn't do anything?!1,brianloveswords
123,617,Nit: should use `jQuery.Deferred` instead of `...,brianloveswords
122,617,This assumes `navigator.id.getVerifiedEmail` w...,brianloveswords
127,617,"You don't actually use `i`, or `arr` here, I d...",brianloveswords
248,733,Might be useful to pass the `{asList: true}` o...,ericf
246,733,"I think prefixing the modules with \mvc-\"" and...",ericf
247,733,This could become an Attribute of the app (tha...,ericf


In [43]:
res = inner_merged.groupby(['user_id'], as_index=False)[['body', 'login']]\
      .agg(lambda x: ', '.join(map(str, set(x))))

In [44]:
res.head(20)

Unnamed: 0,user_id,body,login
0,217,only thing that's needed here is passthrough(x...,lloyd
1,617,"You don't actually use `i`, or `arr` here, I d...",brianloveswords
2,733,This could become an Attribute of the app (tha...,ericf
3,858,could you rename this variable to `writable_di...,willdurand
4,859,"no need to extend the other class, you reverte...",stof
5,866,"ditto - and the next one, s/~~are requiring/re...",garyrussell
6,884,I like 'adapter' and we already have (with qui...,olegz
7,1561,Do you want to try moving the unicode characte...,brianmario
8,1586,"@Freeaqingme This is standard getopt usage, no...",weierophinney
9,1590,Probably ZendService should have his own index...,Maks3w


## Extract new file

In [57]:
res.to_csv('C:/Users/costa/Onedrive/Desktop/GITHUB/Processed/active_user_pr_comments.csv') 

In [63]:
res.to_excel('C:/Users/costa/Onedrive/Desktop/GITHUB/Processed/active_user_pr_comments.xlsx')