# Feature Engineering GitHub Data

In [16]:
# import libraries
import pandas as pd
import numpy as np

In [17]:
# Raise pandas' display limits (max rows, max columns, total width) for the
# frames and value counts printed in this notebook.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [18]:
# read the data
# (the commented-out line is an alternative input file, kept for reference)
#df = pd.read_csv("../data/total.csv")
df = pd.read_csv("../data/raw_data.csv")

In [19]:
# checking the data shape
df.shape

(601077, 38)

### Label encoding and one-hot encoding

In [20]:
# import preprocessing from sklearn
from sklearn import preprocessing
# One shared LabelEncoder; each fit_transform call below re-fits it on the
# column it is given, so no state carries over between columns.
le = preprocessing.LabelEncoder()

In [21]:
df.language.value_counts()

Ruby            131266
PHP              77864
C                73122
C#               70584
C++              67798
Python           52618
Scala            44034
JavaScript       41450
Java             29195
TypeScript        4904
R                 4313
CSS               2910
Go                 905
CoffeeScript        76
Perl                32
Shell                2
Name: language, dtype: int64

In [22]:
# Label-encode the flag columns (described as 0/1 columns): each column is
# replaced by the integer codes LabelEncoder assigns to its distinct values.
flag_columns = ['private', 'fork', 'has_issues', 'has_downloads',
                'has_wiki', 'admin', 'push', 'pull']
for flag in flag_columns:
    df[flag] = le.fit_transform(df[flag].values)

In [23]:
# One-hot encode the categorical `language` and `type` columns: get_dummies
# replaces each listed column with one indicator column per distinct value.
# (Earlier variant that also encoded `label`, kept for reference:)
#df = pd.get_dummies(df, columns = ["language", "label", "type"])
df = pd.get_dummies(df, columns = ["language", "type"])

In [24]:
df.shape

(601077, 54)

### Total Files

In [25]:
# total_files = number of comma-separated entries in `filenames`
# (comma count + 1). Uses the vectorised .str accessor instead of a row-wise
# DataFrame.apply, which calls a Python lambda once per row.
# NOTE: an empty `filenames` string still counts as 1 file; a missing (NaN)
# value yields NaN here rather than raising.
df['total_files'] = df['filenames'].str.count(',') + 1

### Y Creation

In [26]:
# Target label: 1 when the lower-cased commit message contains any of the
# defect-related keywords below, else 0 (keyword heuristic taken from a paper;
# TODO: add the citation).
# na=False makes rows with a missing message evaluate to False. Without it,
# str.contains returns NaN for those rows and np.where treats NaN as truthy,
# so commits with no message would be labelled defective.
# NOTE(review): matching is by substring, so e.g. "prefix" matches "fix".
df['results'] = np.where(
    df['message'].str.lower().str.contains("bug|fix|error|wrong|fail|problem|patch", na=False),
    1,
    0)

#### Drop rows whose date is null or equal to '1'

In [27]:
# Keep only rows whose `date` is present and is not the literal string '1'.
df = df[df['date'].notna() & (df['date'] != '1')]

### Order by the name and date

In [28]:
# Order rows by `date`, breaking ties by `name`, in one multi-key sort.
# Sorting by `name` and then by `date` in two separate calls only gives this
# result if the second sort is stable, but sort_values defaults to
# kind='quicksort', which is not, so the `name` order was not guaranteed to
# survive the second call.
# NOTE(review): `date` is compared as a string; this assumes a uniform
# ISO-8601-like format so that lexical order equals chronological order.
df = df.sort_values(['date', 'name'])

#### Reset the Index

In [29]:
# Renumber rows 0..n-1 after the filtering and sorting above; drop=True
# discards the old index instead of keeping it as a column.
df = df.reset_index(drop=True)

### Find the Weekend

In [30]:
import datetime
import re

def transformStringDate(strDate):
    """Return the weekday name of the first ``YYYY-MM-DD`` date in ``strDate``.

    The name comes from ``strftime('%A')`` (e.g. ``'Saturday'``), so it
    follows the process locale.

    Raises:
        AttributeError: if ``strDate`` contains no ``YYYY-MM-DD`` substring.
        ValueError: if the matched digits are not a valid calendar date.
    """
    iso_day = re.search(r'\d{4}-\d{2}-\d{2}', strDate).group()
    parsed = datetime.datetime.strptime(iso_day, '%Y-%m-%d')
    return parsed.date().strftime('%A')

In [31]:
# NOTE: despite its name, `date_only` holds the weekday name (e.g. 'Monday')
# returned by transformStringDate, not a calendar date.
df['date_only'] = df['date'].apply(transformStringDate)

In [32]:
def weekendcheck(s):
    """Return 1 if ``s`` is exactly 'Saturday' or 'Sunday', otherwise 0."""
    return 1 if s in ('Saturday', 'Sunday') else 0

# Flag rows whose weekday name (stored in `date_only`) is Saturday or Sunday.
df['is_weekend'] = df['date_only'].apply(weekendcheck)

### Find the night

In [33]:
def isnight(datetimeobject):
    """Return 1 if the timestamp string falls in the night window, else 0.

    The hour is read from the first ``HH:MM:SS`` substring of
    ``datetimeobject`` (which, despite its name, must be a string). Night is
    any hour greater than 20 or less than 6, i.e. 21-23 and 00-05; 20:xx and
    06:xx count as daytime. The hour is used exactly as written in the
    string; no timezone conversion is applied.

    Raises:
        AttributeError: if the string contains no ``HH:MM:SS`` substring.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about.
    hour = int(re.search(r'\d{2}:\d{2}:\d{2}', datetimeobject).group(0)[:2])
    return 1 if hour > 20 or hour < 6 else 0

In [34]:
# 1 when the hour found in the `date` string is after 20 or before 6
# (see isnight), else 0.
df['is_night'] = df['date'].apply(isnight)

### Extract the Owner

In [35]:
def checkOwner(fullname):
    """Return 0 when the first two '/'-separated parts of ``fullname`` are equal, else 1.

    NOTE(review): if ``fullname`` has the form "<owner>/<repo>", this compares
    the owner with the repository name, not with the commit author. Confirm
    that this is the intended definition of an outside contribution.

    Raises:
        IndexError: if ``fullname`` contains no '/'.
    """
    segments = fullname.split("/")
    return 0 if segments[0] == segments[1] else 1

In [36]:
# 1 when the first two '/'-separated parts of `fullname` differ, else 0.
df['outside_contribution'] = df['fullname'].apply(checkOwner)
# Alternative two-argument call, kept for reference; the current one-argument
# checkOwner does not accept an `owner` argument.
#df['outside_contribution'] = df.apply(lambda x: checkOwner(x.fullname, x.owner), axis=1)

### Check for Tests

In [37]:
def checkTests(filesCommitted):
    """Return 1 if ``filesCommitted`` contains 'test' or 'Test' anywhere, else 0.

    The check is a plain substring search over the whole string, so names
    such as 'latest.txt' also count, while all-caps 'TEST' does not.
    """
    markers = ('test', 'Test')
    found = any(filesCommitted.find(marker) >= 0 for marker in markers)
    return 1 if found else 0

In [38]:
# 1 if the row's `filenames` string contains 'test' or 'Test', else 0.
df['tests_included'] = df['filenames'].apply(checkTests)

In [39]:
df.tests_included.value_counts()

0    454950
1    146127
Name: tests_included, dtype: int64

### Check the Changes

In [40]:
from collections import Counter

def checkChanges(fileStatus):
    """Classify a comma-separated status string by its most frequent entry.

    ``fileStatus`` is split on ','. The single most common entry is then
    matched, by substring, against 'modified', 'added', 'removed' and
    'renamed' in that order, and the first label found is returned. If none
    matches (including for an empty string), 'unknown' is returned.
    """
    (dominant, _count), = Counter(fileStatus.split(",")).most_common(1)
    for label in ('modified', 'added', 'removed', 'renamed'):
        if label in dominant:
            return label
    return 'unknown'

In [41]:
# Label each row with the most frequent file status found in its
# comma-separated `status` string (see checkChanges).
df['most_changes'] = df['status'].apply(checkChanges)

In [42]:
df.most_changes.unique()

array(['added', 'modified', 'removed', 'renamed', 'unknown'], dtype=object)

In [43]:
# One-hot encode the derived `most_changes` category into one
# `most_changes_<value>` indicator column per distinct value.
df = pd.get_dummies(df, columns = ["most_changes"])

In [44]:
#df.head(5)

In [45]:
df.columns

Index(['sha', 'message', 'comment_count', 'date', 'total_deletions', 'total_additions', 'total', 'comments', 'additions', 'deletions', 'changes', 'status', 'filenames', 'name', 'fullname', 'private', 'fork', 'size', 'watchers_count', 'has_issues', 'has_downloads', 'has_wiki', 'forks_count', 'open_issues_count', 'forks', 'open_issues', 'watchers', 'network_count', 'admin', 'push', 'pull', 'label', 'owner', 'followers', 'following', 'public_gists', 'language_C', 'language_C#', 'language_C++', 'language_CSS', 'language_CoffeeScript', 'language_Go', 'language_Java', 'language_JavaScript', 'language_PHP', 'language_Perl', 'language_Python', 'language_R', 'language_Ruby', 'language_Scala', 'language_Shell', 'language_TypeScript', 'type_Organization', 'type_User', 'total_files', 'results', 'date_only', 'is_weekend', 'is_night', 'outside_contribution', 'tests_included', 'most_changes_added', 'most_changes_modified', 'most_changes_removed', 'most_changes_renamed', 'most_changes_unknown'], dtype

In [30]:
# save the csv
# NOTE(review): to_csv writes the index by default, so the file gets an
# unnamed leading column holding the 0..n-1 row numbers. Pass index=False if
# downstream readers do not expect that column (confirm before changing).
df.to_csv('../data/total_processed.csv')