# Feature Engineering
Creating features for the ML models from scraped data

In [2]:
#imports
import numpy as np
import pandas as pd
from tqdm import tqdm


import save as sv
import capture as cp

## Target Variable 

In [3]:
# Template for the rating CSV locations; the file name is substituted for
# the "{}" placeholder with path.format(...).
path = "../data/rating/{}"

In [5]:
def _read_rating(file_name, score_col):
    """Read one rater's CSV via the `path` template and cast its scores to float.

    The file is read with explicit column names (url, username, <score_col>).
    """
    frame = pd.read_csv(path.format(file_name), names=['url', 'username', score_col])
    frame[score_col] = frame[score_col].map(float)
    return frame


# First rater's scores.
rating_1 = _read_rating("rating_1.csv", 'r1')
print(len(rating_1))

# Second rater's scores.
rating_2 = _read_rating("rating_2.csv", 'r2')
print(len(rating_2))

# Keep only the (url, username) pairs scored by both raters; the target `y`
# is the average of the two scores.
rating = rating_1.merge(rating_2, how='inner', on=['url', 'username'])
rating['y'] = rating['r1'].add(rating['r2']) / 2
print(len(rating))

rating[['r1', 'r2', 'y']].head()

231
232
230


Unnamed: 0,r1,r2,y
0,3.5,4.5,4.0
1,0.0,0.0,0.0
2,4.0,3.0,3.5
3,2.0,2.0,2.0
4,5.0,3.0,4.0


In [7]:
# Persist the merged ratings as rating.csv via the `path` template.
# NOTE(review): this relies on `path` still being the "../data/rating/{}"
# template; `path` is rebound to "../data/html/" in the Features section, and
# that string has no placeholder for the file name.
rating.to_csv(path.format('rating.csv'))

## Features 

In [6]:
# Presumably the directory with the scraped HTML pages. It has the same value
# as the default `path` argument of feat_engineer; the calls visible in this
# notebook do not pass this variable explicitly.
path = "../data/html/"

In [15]:
def feat_engineer(user, path="../data/html/", recent_days=90):
    """Build the feature dictionary for one user from the scraped pages.

    Parameters
    ----------
    user : str
        Username forwarded to the ``capture`` helpers.
    path : str
        Location forwarded unchanged to every ``cp.get_*`` call.
    recent_days : int
        Length of the trailing window (in contribution entries) used for
        ``stab_cont``. The previous implementation sliced from a fixed
        index 275, which is a 90-entry window only when the contribution
        list has exactly 365 entries.

    Returns
    -------
    dict
        A copy of the counts from ``cp.get_counts`` (expected to contain at
        least ``followers``, ``following`` and ``repos``) extended with:
        ``foll_ratio``, ``lang``, ``n_lang``, ``org_flag``, ``n_cont``,
        ``last_cont``, ``stab_cont`` and ``cont_repo_ratio``.
    """
    # counts
    # Copy so the derived keys are not written into the object returned by
    # cp.get_counts (assumes it is a dict-like mapping -- TODO confirm).
    feat = dict(cp.get_counts(user, path))
    following = feat['following']
    # -1 is the sentinel for an undefined ratio (user follows nobody).
    feat['foll_ratio'] = -1 if following == 0 else round(feat['followers'] / following, 2)

    # languages
    repos = cp.get_repos(user, path)
    # dict.fromkeys removes duplicates while keeping first-seen order; the
    # order of list(set(...)) for strings can differ between interpreter
    # runs, which made the stored `lang` column non-reproducible.
    lang = list(dict.fromkeys(repos['languages']))
    feat['lang'] = lang
    feat['n_lang'] = len(lang)

    # organisations
    orgs = cp.get_orgs(user, path)
    feat['org_flag'] = 0 if len(orgs) == 0 else 1

    # contributions
    # Each entry's second element is cast to int and used as the count;
    # presumably entries are (date, count) pairs ordered oldest to newest --
    # verify against capture.get_contributions.
    cont = cp.get_contributions(user, path)
    cont_values = [int(c[1]) for c in cont]

    n_cont = sum(cont_values)
    # Trailing window of `recent_days` entries, whatever the list length.
    window_start = max(len(cont_values) - recent_days, 0)
    n_cont_recent = sum(cont_values[window_start:])

    # Number of entries between the end of the list and the last non-zero one.
    # NOTE(review): 0 is also used when there are no contributions at all, so
    # the two cases cannot be told apart from `last_cont` alone.
    if n_cont == 0:
        last_cont = 0
    else:
        last_cont = next((i for i, x in enumerate(reversed(cont_values)) if x), None)

    feat['n_cont'] = n_cont
    feat['last_cont'] = last_cont
    # Share of all contributions that fall inside the trailing window.
    feat['stab_cont'] = 0 if n_cont == 0 else round(n_cont_recent / n_cont, 2)

    # additional features
    feat['cont_repo_ratio'] = 0 if feat['repos'] == 0 else round(n_cont / feat['repos'], 2)

    return feat

In [16]:
# One feature dict per rated user, in the same order as rating['username'];
# the username is stored alongside the engineered features.
features = []

for user in tqdm(rating['username']):
    feat = {**feat_engineer(user), 'username': user}
    features.append(feat)

100%|██████████| 230/230 [00:57<00:00,  4.00it/s]


In [20]:
# Columns (and their order) of the feature table. 'username' is not listed,
# so it is left out of `data` even though each feature dict carries it.
columns = ['repos', 'stars', 'followers', 'following', 'foll_ratio',
           'lang', 'n_lang', 'org_flag', 'n_cont', 'last_cont', 'stab_cont', 'cont_repo_ratio']
data = pd.DataFrame(features, columns=columns)

# features[i] was built from the i-th row of rating['username'], so the
# targets are attached by position. Assigning the Series directly would align
# on index labels and silently produce NaN / shifted targets whenever
# `rating` does not carry a plain 0..n-1 index (e.g. after filtering).
data['r1'] = rating['r1'].values
data['r2'] = rating['r2'].values
data['y'] = rating['y'].values

data.head()

Unnamed: 0,repos,stars,followers,following,foll_ratio,lang,n_lang,org_flag,n_cont,last_cont,stab_cont,cont_repo_ratio,r1,r2,y
0,71,306,106,9,11.78,"[JavaScript, PHP, CSS, Ruby, Shell, HTML, Rust...",10,1,437,0,0.25,6.15,3.5,4.5,4.0
1,0,0,0,1,0.0,[],0,0,1,159,0.0,0.0,0.0,0.0,0.0
2,12,1,182,0,-1.0,"[JavaScript, PHP, CSS, Ruby, Shell]",5,0,433,3,0.23,36.08,4.0,3.0,3.5
3,13,31,0,17,0.0,"[PHP, HTML, JavaScript]",3,0,20,0,0.5,1.54,2.0,2.0,2.0
4,99,344,370,16,23.12,"[C, JavaScript, Go, CMake, Jupyter Notebook, D...",10,0,255,6,0.02,2.58,5.0,3.0,4.0


In [19]:
# Save the feature table to disk. index=False is not passed, so the DataFrame
# index is written as well. The `lang` column holds Python lists, which end up
# in the CSV as text.
data.to_csv('../data/gitrater.csv')

# Archive

In [52]:
# Standardise each rater's scores in place: subtract the column mean and
# divide by the population standard deviation (ddof=0, as np.std uses).
# `y` is left on its original scale.
data['r1'] = data['r1'].sub(data['r1'].mean()).div(data['r1'].std(ddof=0))
data['r2'] = data['r2'].sub(data['r2'].mean()).div(data['r2'].std(ddof=0))

In [53]:
# Every distinct language that occurs in any row's `lang` list.
unique_lang = list(set(name for langs in data['lang'] for name in langs))

# Per-language aggregates; position i in each list belongs to unique_lang[i].
count, r1_avg, r2_avg, y_avg = [], [], [], []
for lang in unique_lang:
    # Boolean mask of the rows whose language list contains `lang`.
    has_lang = [lang in langs for langs in data['lang']]
    rows = data.loc[has_lang]
    count.append(len(rows))
    r1_avg.append(rows['r1'].mean())
    r2_avg.append(rows['r2'].mean())
    y_avg.append(rows['y'].mean())

In [54]:
# Per-language summary table: number of rows containing the language and the
# mean r1 / r2 / y over those rows. r1 and r2 are z-scores if the
# standardisation cell above was run; y is on its original scale.
pd.DataFrame({'lang':unique_lang, 'count':count, 'r1_avg':r1_avg, 'r2_avg':r2_avg,'y_avg':y_avg  })

Unnamed: 0,lang,count,r1_avg,r2_avg,y_avg
0,Smali,1,-0.006106,-0.090485,2.250000
1,Verilog,1,0.344982,-0.426154,2.250000
2,Crystal,1,-1.059370,-0.090485,1.500000
3,TeX,14,0.269749,0.245185,2.696429
4,JavaScript,132,0.198696,0.209583,2.619318
5,Go,40,0.274765,0.362669,2.787500
6,C#,32,-0.039020,-0.079995,2.234375
7,Scheme,3,-0.006106,-0.090485,2.250000
8,Lua,11,0.025811,0.031577,2.363636
9,Objective-C,10,-0.497629,-0.325453,1.725000
