# Feature Engineering
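Builds the target variable from the two rating files and derives per-user features (repository counts, followers, languages, organisations, contributions) for the machine learning model.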

In [1]:
#imports
import numpy as np
import pandas as pd
import save as sv
import capture as cp

from tqdm import tqdm

## Target Variable 

In [2]:
# Path template for the rating files; "{}" is filled with a file name via path.format(...).
path = "../data/rating/{}"

In [3]:
def _read_rating(file_name, score_col):
    """Read one header-less rating CSV and coerce its score column to float.

    The file is resolved through the `path` template, its columns are named
    ('url', 'username', score_col), and the row count is printed.
    """
    frame = pd.read_csv(path.format(file_name), names=['url', 'username', score_col])
    frame[score_col] = frame[score_col].map(float)
    print(len(frame))
    return frame


rating_1 = _read_rating("rating_1.csv", 'r1')
rating_2 = _read_rating("rating_2.csv", 'r2')

# Inner merge: keep only (url, username) pairs present in both files.
rating = rating_1.merge(rating_2, how='inner', on=['url', 'username'])

# Target variable: rounded mean of the two scores.
mean_score = (rating['r1'] + rating['r2']) / 2
rating['y'] = mean_score.round()
print(len(rating))

rating[['r1', 'r2', 'y']].head()

231
232
230


Unnamed: 0,r1,r2,y
0,3.5,4.5,4.0
1,0.0,0.0,0.0
2,4.0,3.0,4.0
3,2.0,2.0,2.0
4,5.0,3.0,4.0


In [7]:
# Save the merged ratings alongside the input rating files. The location is
# spelled out here instead of going through `path`, because `path` is rebound
# further down to a string without a "{}" placeholder; re-running this cell
# after that point would make path.format(...) silently drop the file name.
rating.to_csv("../data/rating/rating.csv")

## Features 

In [5]:
# HTML data directory (same value as the default `path` argument of feat_engineer).
# NOTE(review): this rebinds `path`, which earlier held the "../data/rating/{}" template;
# cells that call path.format(...) must not be re-run after this one.
path = "../data/html/"

In [7]:
def feat_engineer(user, path="../data/html/", recent_days=90):
    """Build the feature record for a single user.

    Combines the results of the ``cp.get_counts``, ``cp.get_repos``,
    ``cp.get_orgs`` and ``cp.get_contributions`` helpers into one dictionary.

    Parameters
    ----------
    user : str
        Username forwarded to the ``cp.get_*`` helpers.
    path : str
        Location forwarded to the ``cp.get_*`` helpers.
    recent_days : int
        Number of trailing contribution entries counted as "recent" for
        ``stab_cont``. Assumes one entry per day, oldest first -- TODO confirm
        against ``cp.get_contributions``.

    Returns
    -------
    dict
        A copy of the entries returned by ``cp.get_counts`` plus:

        - ``foll_ratio``: rounded followers/following (0 when following is 0)
        - ``lang`` / ``n_lang``: set of repository languages and its size
        - ``org_flag``: 1 if ``cp.get_orgs`` returned anything, else 0
        - ``n_cont``: sum of all contribution values
        - ``last_cont``: number of trailing entries after the last non-zero one
        - ``stab_cont``: share of ``n_cont`` that falls in the recent window
        - ``cont_repo_ratio``: rounded contributions per repository
    """
    # counts -- copied so the object handed out by cp.get_counts is not mutated
    feat = dict(cp.get_counts(user, path))
    followers, following = feat['followers'], feat['following']
    feat['foll_ratio'] = 0 if following == 0 else round(followers / following)

    # languages
    repos = cp.get_repos(user, path)
    lang = set(repos['languages'])
    feat['lang'] = lang
    feat['n_lang'] = len(lang)

    # organisations
    orgs = cp.get_orgs(user, path)
    feat['org_flag'] = 0 if len(orgs) == 0 else 1

    # contributions -- each item is indexed at [1] for its value
    cont = cp.get_contributions(user, path)
    cont_values = [int(c[1]) for c in cont]

    n_cont = sum(cont_values)

    # Take the window from the end of the list rather than from a fixed index,
    # so it always covers `recent_days` entries whatever the history length.
    recent_start = max(len(cont_values) - recent_days, 0)
    n_cont_recent = sum(cont_values[recent_start:])

    # NOTE(review): a user with no contributions at all also gets last_cont == 0,
    # the same value as a user whose latest entry is non-zero.
    if n_cont == 0:
        last_cont = 0
    else:
        last_cont = next((i for i, x in enumerate(reversed(cont_values)) if x), None)

    feat['n_cont'] = n_cont
    feat['last_cont'] = last_cont
    feat['stab_cont'] = 0 if n_cont == 0 else round(n_cont_recent / n_cont, 2)

    # additional features
    feat['cont_repo_ratio'] = 0 if feat['repos'] == 0 else round(n_cont / feat['repos'])

    return feat

In [8]:
# Collect one feature record per rated user, in the row order of `rating`.
features = []

progress = tqdm(rating['username'])
for user in progress:
    feat = feat_engineer(user)
    feat['username'] = user  # tag the record with the user it belongs to
    features += [feat]

100%|██████████| 230/230 [01:03<00:00,  3.80it/s]


In [9]:
# Columns kept in the feature table, in display order. 'username' is not
# listed, so it is dropped when the frame is built.
columns = [
    'repos', 'stars', 'followers', 'following', 'foll_ratio',
    'lang', 'n_lang', 'org_flag', 'n_cont', 'last_cont', 'stab_cont', 'cont_repo_ratio',
]

# Append both ratings and the target; the values are matched on the row index.
data = pd.DataFrame(features, columns=columns).assign(
    r1=rating['r1'],
    r2=rating['r2'],
    y=rating['y'],
)

In [14]:
# Persist the feature table together with r1, r2 and y.
# NOTE(review): the 'lang' column holds Python sets, which end up in the CSV as
# plain text -- confirm that whatever reads this file parses them back.
data.to_csv('../data/gitrater.csv')

In [10]:
# Raise the display cap so that up to 300 rows of the table are rendered.
pd.options.display.max_rows = 300
data

Unnamed: 0,repos,stars,followers,following,foll_ratio,lang,n_lang,org_flag,n_cont,last_cont,stab_cont,cont_repo_ratio,r1,r2,y
0,71,306,106,9,12,"{C, PHP, CSS, HTML, Makefile, Shell, JavaScrip...",10,1,437,0,0.25,6,3.5,4.5,4.0
1,0,0,0,1,0,{},0,0,1,159,0.0,0,0.0,0.0,0.0
2,12,1,182,0,0,"{PHP, CSS, Shell, JavaScript, Ruby}",5,0,433,3,0.23,36,4.0,3.0,4.0
3,13,31,0,17,0,"{HTML, JavaScript, PHP}",3,0,20,0,0.5,2,2.0,2.0,2.0
4,99,344,370,16,23,"{C, Jupyter Notebook, Dockerfile, HTML, C++, S...",10,0,255,6,0.02,3,5.0,3.0,4.0
5,264,377,374,212,2,"{TeX, Jupyter Notebook, HTML, Python}",4,1,1194,2,0.25,5,4.0,4.5,4.0
6,21,110,18,5,4,"{PHP, TypeScript, Clojure, HTML, Dockerfile, J...",7,0,107,1,0.36,5,3.0,2.5,3.0
7,1,0,0,1,0,{},0,0,2,107,0.0,2,0.0,0.0,0.0
8,13,730,17,73,0,"{Java, CSS, ColdFusion, C++, JavaScript}",5,0,9,90,0.0,1,2.0,1.0,2.0
9,14,9,2400,1,2400,"{PHP, HTML, CSS}",3,0,0,0,0.0,0,3.0,2.0,2.0
