# Data Exploration: GitHub Data

In [4]:
# import libraries
import pandas as pd
import numpy as np

In [5]:
# read the data (the path is relative to the notebook's working directory)
df = pd.read_csv("../data/total.csv")

In [6]:
# checking the data shape, displayed as (rows, columns)
df.shape

(601077, 35)

### Label encoding and one-hot encoding

In [7]:
# import preprocessing from sklearn
from sklearn import preprocessing
# one shared encoder instance; the next cell calls fit_transform on it once per column
le = preprocessing.LabelEncoder()

In [8]:
# Label-encode, in place, the columns that are treated as 0/1 flags.
# The shared encoder is re-fitted on every column in turn, so after the loop
# it only holds the classes of the last column processed ('type').
flag_columns = ['private', 'fork', 'has_issues', 'has_downloads', 'has_wiki',
                'admin', 'push', 'pull', 'type']
for column_name in flag_columns:
    df[column_name] = le.fit_transform(df[column_name].values)

In [9]:
# encoding the language categorical feature: the `language` column is replaced
# by one indicator column per distinct language value
df = pd.get_dummies(df, columns = ["language"])

In [10]:
# shape after the encoding step, to see how many columns the dummies added
df.shape

(601077, 50)

### Feature Creation

In [11]:
# create a new column called total_files holding the number of files per row:
# `filenames` is treated as a comma-separated list, so the count is commas + 1.
# The vectorised .str accessor replaces a row-wise df.apply(axis=1), which
# built a Series object for every row just to read one field. A missing
# filenames value now yields NaN instead of raising an AttributeError.
df['total_files'] = df['filenames'].str.count(',') + 1

In [12]:
# use that paper to classify the commits as being defective or not:
# a commit is flagged (1) when its lower-cased message contains any keyword.
# na=False makes a missing message count as "no keyword"; without it
# str.contains returns NaN for that row, NaN is truthy inside np.where, and
# the commit would be labelled defective.
defect_pattern = "bug|fix|error|wrong|fail|problem|patch"
has_defect_keyword = df['message'].str.lower().str.contains(defect_pattern, na=False)
df['results'] = np.where(has_defect_keyword, 1, 0)

#### Drop rows whose date is null or equal to '1'

In [13]:
# keep only the rows that have a date at all ...
df = df[df['date'].notnull()]
# ... then remove, by index label, the rows whose date is the literal string '1'
placeholder_rows = df.index[df['date'] == '1']
df = df.drop(index=placeholder_rows)

#### Order by the date

In [14]:
# NOTE(review): `date` is compared with a string and parsed with regexes
# elsewhere in this notebook, so this looks like a lexicographic sort on text —
# presumably chronological for uniformly formatted timestamps; confirm
df = df.sort_values('date')

### Find the Weekend

In [15]:
import datetime
import re

def transformStringDate(strDate):
    """Return the English weekday name of the first YYYY-MM-DD date in ``strDate``.

    Parameters
    ----------
    strDate : str
        Text containing a date written as four digits, two digits and two
        digits separated by hyphens, e.g. ``'2015-01-01T10:30:00Z'``.

    Returns
    -------
    str
        One of ``'Monday'`` through ``'Sunday'``.

    Raises
    ------
    ValueError
        If no ``YYYY-MM-DD`` substring is present, or the digits found do not
        form a valid calendar date.
    """
    match = re.search(r'\d{4}-\d{2}-\d{2}', strDate)
    if match is None:
        # re.search returns None when nothing matches; fail with a readable
        # message instead of an AttributeError on None.group().
        raise ValueError('no YYYY-MM-DD date found in %r' % (strDate,))
    date = datetime.datetime.strptime(match.group(), '%Y-%m-%d').date()
    # Look the name up by weekday index rather than strftime('%A'): %A follows
    # the process locale, whereas the weekend check in this notebook compares
    # against the English names.
    weekday_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday')
    return weekday_names[date.weekday()]

In [16]:
# store the weekday name of every row's date in `dateonly`
df['dateonly'] = df['date'].map(transformStringDate)

In [17]:
# list the distinct weekday names produced, as a quick sanity check
df.dateonly.unique()

array(['Thursday', 'Friday', 'Saturday', 'Monday', 'Wednesday', 'Tuesday',
       'Sunday'], dtype=object)

In [18]:
def weekendcheck(s):
    """Return 1 when ``s`` is 'Saturday' or 'Sunday', otherwise 0."""
    weekend_days = ('Saturday', 'Sunday')
    return 1 if s in weekend_days else 0

# 1 for rows whose weekday name is Saturday or Sunday, 0 for every other row
df['isweekend'] = df['dateonly'].map(weekendcheck)

### Find the night

In [19]:
def isnight(datetimeobject):
    """Flag whether the time of day found in ``datetimeobject`` is at night.

    "Night" means the hour component is greater than 20 or less than 6,
    i.e. 21:00:00 through 05:59:59.

    Parameters
    ----------
    datetimeobject : str
        Despite the name, a string; it must contain a time written as
        ``HH:MM:SS`` (the first such occurrence is used).

    Returns
    -------
    int
        1 for a night-time hour, 0 otherwise.

    Raises
    ------
    ValueError
        If no ``HH:MM:SS`` substring is present.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    match = re.search(r'\d{2}:\d{2}:\d{2}', datetimeobject)
    if match is None:
        # re.search returns None when nothing matches; fail with a readable
        # message instead of an AttributeError on None.group().
        raise ValueError('no HH:MM:SS time found in %r' % (datetimeobject,))
    # The first two characters of the match are the hour.
    hour = int(match.group(0)[:2])
    if hour > 20 or hour < 6:
        return 1
    return 0

In [20]:
# 1 for rows whose time of day counts as night, 0 for every other row
df['isnight'] = df['date'].map(isnight)

In [21]:
# count how many rows fall into each class (1 = night, 0 = not night)
df.isnight.value_counts()

0    399121
1    201956
Name: isnight, dtype: int64

In [22]:
# save the csv
# NOTE(review): no index= argument is passed, so pandas' default applies and the
# DataFrame index should end up as the first column of the file — confirm that
# downstream readers expect it
df.to_csv('../data/total_processed.csv')