# Transform the GitHub Data

#### Read the data

In [8]:
# import the libraries
import pandas as pd
import numpy as np
import sys

In [9]:
# Raise pandas' display limits so wide/long frames are printed without truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [10]:
# load the processed dataset; the first CSV column is used as the index
source_csv = "../data/total_processed.csv"
df = pd.read_csv(source_csv, index_col=0)

In [11]:
# show the frame dimensions as (rows, columns)
df.shape

(601078, 66)

### Mean of the commit label

In [12]:
def mean(items):
    """Return the running percentage of entries equal to 1.

    For every position in ``items`` (in iteration order) the result holds
    the share, in percent, of the entries seen so far that compare equal
    to 1. The returned list has one value per entry; an empty input gives
    an empty list.
    """
    percentages = []
    ones_seen = 0

    for position, entry in enumerate(items, start=1):
        if entry == 1:
            ones_seen += 1
        # share of ones among the first `position` entries, as a percentage
        percentages.append((ones_seen / position) * 100)

    return percentages

In [13]:
# map every project name to the running percentage list of its `results` column
projects = df['name'].unique()
commit_mean_dict = {
    project: mean(df.loc[df['name'] == project, 'results'])
    for project in projects
}

In [14]:
# write each project's running percentages back onto its rows
# The column is created as float up front: the values written below are
# percentages (floats), and assigning floats into an integer column relies
# on an implicit upcast that pandas has deprecated (FutureWarning since 2.1).
df['mean_commits'] = 0.0

for project in projects:
    # the list in commit_mean_dict was built from the same row filter, so it
    # lines up positionally with the rows selected here
    df.loc[df['name'] == project, 'mean_commits'] = commit_mean_dict[project]

### Variance of the column results

In [15]:
# Row-wise population variance (ddof=0) of the `results` and `mean_commits`
# values. Each row's variance depends only on that row, so it is computed
# once for the whole frame instead of re-filtering the frame per project.
# NOTE(review): `results` is compared against 1 when building mean_commits
# while mean_commits is a percentage (0-100); confirm that mixing the two
# scales in one variance is intended.
row_variance = df[['results', 'mean_commits']].var(ddof=0, axis=1)

# Rows without a project name were never matched by the per-project filter
# and therefore stayed NaN; keep that behaviour.
df['variance_commits'] = row_variance.where(df['name'].notna())

### Set the phases of the project

In [16]:
# Remove the commit with this sha.
# A boolean mask removes exactly the matching rows. Dropping by index label
# would also remove unrelated rows that share the label if the index read
# from the CSV is not unique.
# .copy() makes the result an independent frame so later column assignments
# do not operate on a slice of the old one.
df = df[df['sha'] != 'f5d7eb5b623b625062cf0d3d8d552ee0ea9000dd'].copy()

In [17]:
def phases(entire_data):
    """Label every position of ``entire_data`` with a phase number.

    With ``third = int(len(entire_data) / 3)``, the first ``third``
    positions are labelled 1, the next ``third`` positions 2, and all
    remaining positions 3. Returns a list with one label per iterated
    entry; the entries' values themselves are not inspected.
    """
    total = len(entire_data)
    first_cut = int(total / 3)
    second_cut = int(first_cut + first_cut)

    labels = []
    for position, _ in enumerate(entire_data):
        if position < first_cut:
            label = 1
        elif position < second_cut:
            label = 2
        else:
            # whatever is left after two equal thirds belongs to phase 3
            label = 3
        labels.append(label)

    return labels

In [18]:
# map every project name to the list of phase labels for its rows
commit_phases_dict = {
    project: phases(df.loc[df['name'] == project, 'results'])
    for project in projects
}

In [19]:
# write each project's phase labels back onto its rows
df['phases_project'] = 0

for project_name, phase_labels in commit_phases_dict.items():
    belongs_to_project = df['name'] == project_name
    df.loc[belongs_to_project, 'phases_project'] = phase_labels

In [20]:
# replace the phase label column with one indicator column per phase value
phase_columns = ["phases_project"]
df = pd.get_dummies(df, columns=phase_columns)

In [21]:
#df[df['name']=='django']

#### Fill NaN values that can be filled with 0

In [22]:
# replace missing comment counts with 0
comment_counts = df['comment_count']
df['comment_count'] = comment_counts.fillna(0)

#### Move the mean_commits column to the end

In [23]:
# test the mean_commits as a regression problem:
# reorder the columns so that mean_commits becomes the last one
target_column = "mean_commits"
leading_columns = [column for column in df.columns if column != target_column]
df = df[leading_columns + [target_column]]

In [24]:
# write the transformed frame (index included) to disk
output_csv = '../data/total_transformed.csv'
df.to_csv(output_csv)