# Feature Engineering Github Data

In [31]:
# import libraries
import pandas as pd
import numpy as np

In [32]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [33]:
# read the data
df = pd.read_csv("../data/total.csv")
#df = pd.read_csv("../data/raw_data.csv")

In [34]:
# checking the data shape
df.name.value_counts()

homebrew                               46737
rails                                  46135
xbmc                                   45667
mono                                   44282
mongo                                  23904
TrinityCore                            23898
zf2                                    22578
symfony                                21154
django                                 20477
scala                                  19180
diaspora                               15625
cakephp                                15571
node                                   13257
akka                                   12069
CodeIgniter                             8755
jquery                                  8521
openFrameworks                          8367
gitlabhq                                7869
ravendb                                 7796
netty                                   7633
MaNGOS                                  7252
three.js                                7180
elasticsea

### One hot-encoding

In [35]:
# import preprocessing from sklearn
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [36]:
df.language.value_counts()

Ruby            131266
PHP              77864
C                73122
C#               70584
C++              67798
Python           52618
Scala            44034
JavaScript       41450
Java             29195
TypeScript        4904
R                 4313
CSS               2910
Go                 905
CoffeeScript        76
Perl                32
Shell                2
Name: language, dtype: int64

In [37]:
# encoding these columns that are 0/1
df['private'] = le.fit_transform(df.private.values)
df['fork'] = le.fit_transform(df.fork.values)
df['has_issues'] = le.fit_transform(df.has_issues.values)
df['has_downloads'] = le.fit_transform(df.has_downloads.values)
df['has_wiki'] = le.fit_transform(df.has_wiki.values)
df['admin'] = le.fit_transform(df.admin.values)
df['push'] = le.fit_transform(df.push.values)
df['pull'] = le.fit_transform(df.pull.values)

In [38]:
# encoding the language categorical feature
#df = pd.get_dummies(df, columns = ["language", "label", "type"])
df = pd.get_dummies(df, columns = ["language", "type"])

In [39]:
df.shape

(601077, 51)

### Total Files

In [40]:
# create a new column called total_files that sums up the filenames column
df['total_files'] = df.apply(lambda row: row.filenames.count(',') + 1, axis=1)

### Y Creation

In [41]:
# use that paper to classify the commits as being defective of not
df['results'] = np.where(df['message'].str.lower().str.contains("bug|fix|error|wrong|fail|problem|patch"), 
                         1, 
                         0)

#### Delete null date column and date that is equal to 1

In [42]:
df = df[pd.notnull(df['date'])]
df = df.drop(df[(df.date == '1')].index)

### Order by the name and date

In [43]:
df = df.sort_values('name')
df = df.sort_values('date')

#### Reset the Index

In [44]:
df = df.reset_index(drop=True)

### Find the Weekend

In [45]:
import datetime
import re

def transformStringDate(strDate):
    match = re.search(r'\d{4}-\d{2}-\d{2}', strDate)
    date = datetime.datetime.strptime(match.group(), '%Y-%m-%d').date()
    return date.strftime('%A')

In [46]:
df['date_only'] = df['date'].apply(transformStringDate)

In [47]:
def weekendcheck(s):
    if s == 'Saturday' or s == 'Sunday': 
        return 1
    else:
        return 0

df['is_weekend'] = df['date_only'].apply(weekendcheck)

### Find the night

In [48]:
def isnight(datetimeobject):
    match = re.search('\d{2}:\d{2}:\d{2}', datetimeobject).group(0)
    first_two_letters = match[:2]
    if int(first_two_letters) > 20 or int(first_two_letters) < 6:
        return(1)
    else:
        return(0)

In [49]:
df['is_night'] = df['date'].apply(isnight)

### Extract the Owner

In [50]:
def checkOwner(fullname):
    data = fullname.split("/")
    
    if(data[0] == data[1]):
        return(0)
    else:
        return(1)

In [51]:
df['outside_contribution'] = df['fullname'].apply(checkOwner)
#df['outside_contribution'] = df.apply(lambda x: checkOwner(x.fullname, x.owner), axis=1)

### Check for Tests

In [52]:
def checkTests(filesCommitted): 
    #for names in filesCommitted:
    if (filesCommitted.find('test') != -1 or filesCommitted.find('Test') != -1): 
        return 1 

    return 0

In [53]:
df['tests_included'] = df['filenames'].apply(checkTests)

In [54]:
df.tests_included.value_counts()

0    454950
1    146127
Name: tests_included, dtype: int64

### Check the Changes

In [55]:
from collections import Counter

def checkChanges(fileStatus): 
    list_files = fileStatus.split(",")
    occurence = Counter(list_files).most_common(1)
    tupleOccurences = occurence[0]
    if (tupleOccurences[0].find('modified') != -1): 
        return 'modified'
    elif (tupleOccurences[0].find('added') != -1): 
        return 'added'
    elif (tupleOccurences[0].find('removed') != -1): 
        return 'removed'
    elif (tupleOccurences[0].find('renamed') != -1): 
        return 'renamed'
    else:
        return 'unknown'

In [56]:
df['most_changes'] = df['status'].apply(checkChanges)

In [57]:
df.most_changes.unique()

array(['added', 'modified', 'removed', 'renamed', 'unknown'], dtype=object)

In [58]:
# encoding the language categorical feature
df = pd.get_dummies(df, columns = ["most_changes"])

In [59]:
df.head(5)

Unnamed: 0,sha,message,comment_count,date,total_deletions,total_additions,total,comments,additions,deletions,changes,status,filenames,name,fullname,private,fork,size,watchers_count,has_issues,has_downloads,has_wiki,forks_count,open_issues_count,forks,open_issues,watchers,network_count,admin,push,pull,label,owner,language_C,language_C#,language_C++,language_CSS,language_CoffeeScript,language_Go,language_Java,language_JavaScript,language_PHP,language_Perl,language_Python,language_R,language_Ruby,language_Scala,language_Shell,language_TypeScript,type_Organization,type_User,total_files,results,date_only,is_weekend,is_night,outside_contribution,tests_included,most_changes_added,most_changes_modified,most_changes_removed,most_changes_renamed,most_changes_unknown
0,e566ca34a3fd246a4535c458ae47ff7ea62e2de7,New repository initialized by cvs2svn.,0,2003-02-13T13:38:33Z,0,173,173,[],"[1, 8, 8, 155, 1]","[0, 0, 0, 0, 0]","[1, 8, 8, 155, 1]","[u'added', u'added', u'added', u'added', u'add...","[u'.gitignore', u'pull-binary-libs.sh', u'push...",scala,scala/scala,0,0,100982,1610,1,1,1,506,15,506,15,1610,506,0,0,0,reviewed,scala,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,5,0,Thursday,0,0,0,0,1,0,0,0,0
1,33d6e170c97ca7b2f991896a0729941a7240b6d6,- Added .cvsignore,0,2003-02-13T13:38:33Z,0,4,4,[],[4],[0],[4],[u'added'],[u'.cvsignore'],scala,scala/scala,0,0,100982,1610,1,1,1,506,15,506,15,1610,506,0,0,0,reviewed,scala,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,Thursday,0,0,0,0,1,0,0,0,0
2,4177daab2f54bdb20c71f623296a8bb32616fd12,Initial version.,0,2003-02-13T14:41:36Z,0,23138,23138,[],"[87, 221, 354, 38, 45, 169, 96, 123, 30, 430, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[87, 221, 354, 38, 45, 169, 96, 123, 30, 430, ...","[u'added', u'added', u'added', u'added', u'add...","[u'sources/scalac/ApplicationError.java', u'so...",scala,scala/scala,0,0,100982,1610,1,1,1,506,15,506,15,1610,506,0,0,0,reviewed,scala,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,81,0,Thursday,0,0,0,0,1,0,0,0,0
3,073294fbbaf752ed4f9005eb9421b7bd4f475173,- Added list of compiler source files\n\n- Add...,0,2003-02-14T10:18:21Z,0,253,253,[],"[2, 157, 94]","[0, 0, 0]","[2, 157, 94]","[u'modified', u'added', u'added']","[u'.cvsignore', u'Makefile', u'config/list/com...",scala,scala/scala,0,0,100982,1610,1,1,1,506,15,506,15,1610,506,0,0,0,reviewed,scala,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,3,0,Friday,0,0,0,0,0,1,0,0,0
4,23d2bfbeb21f63d82ed46d5b1b0b85b1ed2f4355,Initial version.,0,2003-02-14T13:36:31Z,0,859,859,[],"[12, 3, 13, 9, 35, 345, 15, 13, 6, 21, 48, 6, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[12, 3, 13, 9, 35, 345, 15, 13, 6, 21, 48, 6, ...","[u'added', u'added', u'added', u'added', u'add...","[u'sources/scala/$colon$colon.scala', u'source...",scala,scala/scala,0,0,100982,1610,1,1,1,506,15,506,15,1610,506,0,0,0,reviewed,scala,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,22,0,Friday,0,0,0,0,1,0,0,0,0


In [32]:
df.columns

Index(['sha', 'message', 'comment_count', 'date', 'total_deletions', 'total_additions', 'total', 'comments', 'additions', 'deletions', 'changes', 'status', 'filenames', 'name', 'fullname', 'private', 'fork', 'size', 'watchers_count', 'has_issues', 'has_downloads', 'has_wiki', 'forks_count', 'open_issues_count', 'forks', 'open_issues', 'watchers', 'network_count', 'admin', 'push', 'pull', 'label', 'owner', 'language_C', 'language_C#', 'language_C++', 'language_CSS', 'language_CoffeeScript', 'language_Go', 'language_Java', 'language_JavaScript', 'language_PHP', 'language_Perl', 'language_Python', 'language_R', 'language_Ruby', 'language_Scala', 'language_Shell', 'language_TypeScript', 'type_Organization', 'type_User', 'total_files', 'results', 'date_only', 'is_weekend', 'is_night', 'outside_contribution', 'tests_included', 'most_changes_added', 'most_changes_modified', 'most_changes_removed', 'most_changes_renamed', 'most_changes_unknown'], dtype='object')

In [30]:
# save the csv
df.to_csv('../data/total_processed.csv')