# Feature Engineering
This notebook builds the target variable from two rating files and creates the per-user features for the machine learning model.

In [23]:
#imports
import numpy as np
import pandas as pd
import save as sv
import capture as cp

from tqdm import tqdm

## Target Variable 

In [2]:
# Filename template for the rating CSVs; the `{}` placeholder is filled in with str.format.
path = "../data/rating/{}"

In [3]:
# First rating file: column names are passed explicitly, so every row of the file is read as data.
rating_1 = pd.read_csv(path.format("rating_1.csv"), names=['url', 'username', 'r1'])
rating_1['r1'] = rating_1['r1'].map(float)
print(len(rating_1))

# Second rating file: same layout, with the score stored in 'r2'.
rating_2 = pd.read_csv(path.format("rating_2.csv"), names=['url', 'username', 'r2'])
rating_2['r2'] = rating_2['r2'].map(float)
print(len(rating_2))

# Keep only the (url, username) pairs that appear in both rating files.
rating = rating_1.merge(rating_2, how='inner', on=['url', 'username'])

# Target y = mean of the two scores, rounded to a whole number.
# Exact .5 ties go to the nearest even value (e.g. 2.5 -> 2.0, 3.5 -> 4.0).
rating['y'] = ((rating['r1'] + rating['r2']) / 2).round()
print(len(rating))

rating[['r1', 'r2', 'y']].head()

231
232
230


Unnamed: 0,r1,r2,y
0,3.5,4.5,4.0
1,0.0,0.0,0.0
2,4.0,3.0,4.0
3,2.0,2.0,2.0
4,5.0,3.0,4.0


In [7]:
# Save the merged ratings; with the default to_csv settings the index is written as the first CSV column.
# NOTE(review): this relies on `path` still being the rating template. A later cell rebinds `path`
# to "../data/html/", so running this cell after that one no longer targets rating.csv.
rating.to_csv(path.format('rating.csv'))

## Features 

0.5

In [5]:
# Presumably the directory holding the saved HTML pages -- TODO confirm.
# NOTE(review): this rebinds `path`, which the rating cells above use as a filename template.
# No cell shown here reads this value afterwards; feat_engineer is called with its own default path.
path = "../data/html/"

In [19]:
def feat_engineer(user, path="../data/html/", recent_days=90):
    """Build the feature record for a single user.

    The record starts as the object returned by ``cp.get_counts`` and is
    extended in place with the derived features listed below.

    Parameters
    ----------
    user
        User identifier, passed through unchanged to the ``cp.get_*`` helpers.
    path : str
        Data location, passed through unchanged to the ``cp.get_*`` helpers.
    recent_days : int, default 90
        Number of trailing contribution entries treated as "recent" when
        computing ``stab_cont``.

    Returns
    -------
    The object returned by ``cp.get_counts`` with these keys added:

    - ``foll_ratio``: followers / following, rounded to 2 decimals (0 when following is 0)
    - ``lang``: set of the values in ``repos['languages']``
    - ``n_lang``: size of that set
    - ``org_flag``: 1 if ``cp.get_orgs`` returned anything, else 0
    - ``n_cont``: sum of all contribution counts
    - ``last_cont``: position, counted from the end of the contribution list,
      of the last non-zero entry; also 0 when there are no contributions at all
    - ``stab_cont``: share of ``n_cont`` that falls in the last ``recent_days``
      entries, rounded to 2 decimals (0 when ``n_cont`` is 0)
    - ``cont_repo_ratio``: ``n_cont`` / repos, rounded to 2 decimals (0 when repos is 0)

    Notes
    -----
    Assumes ``cp.get_contributions`` returns entries ordered oldest to newest,
    with the count at position 1 of each entry -- TODO confirm against ``capture``.
    """
    # counts (this object is extended in place and returned)
    feat = cp.get_counts(user, path)
    feat['foll_ratio'] = 0 if feat['following'] == 0 else round(feat['followers'] / feat['following'], 2)

    # languages
    repos = cp.get_repos(user, path)
    lang = set(repos['languages'])
    feat['lang'] = lang
    feat['n_lang'] = len(lang)

    # organisations
    orgs = cp.get_orgs(user, path)
    feat['org_flag'] = 0 if len(orgs) == 0 else 1

    # contributions
    cont = cp.get_contributions(user, path)
    cont_values = [int(c[1]) for c in cont]

    n_cont = sum(cont_values)
    # Slice relative to the END of the list: a fixed start index such as 275 only
    # means "last 90 entries" when the list has exactly 365 entries.
    recent_start = max(len(cont_values) - recent_days, 0)
    n_cont_recent = sum(cont_values[recent_start:])
    last_cont = 0 if n_cont == 0 else next((i for i, x in enumerate(cont_values[::-1]) if x), None)

    feat['n_cont'] = n_cont
    feat['last_cont'] = last_cont
    feat['stab_cont'] = 0 if n_cont == 0 else round(n_cont_recent / n_cont, 2)

    # additional features
    feat['cont_repo_ratio'] = 0 if feat['repos'] == 0 else round(n_cont / feat['repos'], 2)

    return feat

In [25]:
# Build one feature record per rated user, tagging each record with its username.
features = []

for username in tqdm(rating['username']):
    record = feat_engineer(username)
    record['username'] = username
    features.append(record)


  0%|          | 0/230 [00:00<?, ?it/s][A
  0%|          | 1/230 [00:00<01:04,  3.53it/s][A
  1%|          | 2/230 [00:00<00:58,  3.92it/s][A
  1%|▏         | 3/230 [00:00<01:03,  3.60it/s][A
  2%|▏         | 4/230 [00:01<01:00,  3.75it/s][A
  2%|▏         | 5/230 [00:01<01:06,  3.40it/s][A
  3%|▎         | 6/230 [00:01<01:06,  3.37it/s][A
  3%|▎         | 7/230 [00:01<01:02,  3.55it/s][A
  3%|▎         | 8/230 [00:02<00:56,  3.93it/s][A
  4%|▍         | 9/230 [00:02<00:58,  3.76it/s][A
  4%|▍         | 10/230 [00:02<00:56,  3.91it/s][A
  5%|▍         | 11/230 [00:02<00:59,  3.71it/s][A
  5%|▌         | 12/230 [00:03<01:04,  3.40it/s][A
  6%|▌         | 13/230 [00:03<00:59,  3.64it/s][A
  6%|▌         | 14/230 [00:03<01:05,  3.28it/s][A
  7%|▋         | 15/230 [00:04<01:02,  3.46it/s][A
  7%|▋         | 16/230 [00:04<00:59,  3.61it/s][A
  7%|▋         | 17/230 [00:04<00:59,  3.58it/s][A
  8%|▊         | 18/230 [00:04<00:53,  3.94it/s][A
  8%|▊         | 19/230 [00:0

 68%|██████▊   | 156/230 [00:42<00:19,  3.79it/s][A
 68%|██████▊   | 157/230 [00:42<00:19,  3.76it/s][A
 69%|██████▊   | 158/230 [00:42<00:21,  3.42it/s][A
 69%|██████▉   | 159/230 [00:42<00:19,  3.68it/s][A
 70%|██████▉   | 160/230 [00:43<00:18,  3.80it/s][A
 70%|███████   | 161/230 [00:43<00:18,  3.83it/s][A
 70%|███████   | 162/230 [00:43<00:16,  4.15it/s][A
 71%|███████   | 163/230 [00:43<00:16,  4.11it/s][A
 71%|███████▏  | 164/230 [00:44<00:17,  3.69it/s][A
 72%|███████▏  | 165/230 [00:44<00:17,  3.71it/s][A
 72%|███████▏  | 166/230 [00:44<00:16,  3.86it/s][A
 73%|███████▎  | 167/230 [00:45<00:18,  3.48it/s][A
 73%|███████▎  | 168/230 [00:45<00:17,  3.50it/s][A
 73%|███████▎  | 169/230 [00:45<00:18,  3.34it/s][A
 74%|███████▍  | 170/230 [00:45<00:16,  3.64it/s][A
 74%|███████▍  | 171/230 [00:46<00:16,  3.66it/s][A
 75%|███████▍  | 172/230 [00:46<00:16,  3.49it/s][A
 75%|███████▌  | 173/230 [00:46<00:15,  3.68it/s][A
 76%|███████▌  | 174/230 [00:47<00:15,  3.67it

In [27]:
# Columns kept in the model table, in display order. Keys present in the
# feature records but not listed here (e.g. 'username') are left out.
columns = [
    'repos', 'stars', 'followers', 'following', 'foll_ratio',
    'lang', 'n_lang', 'org_flag', 'n_cont', 'last_cont', 'stab_cont', 'cont_repo_ratio',
]

# Assemble the table, then attach both ratings and the target.
# The rating columns are matched to the feature rows by index label.
data = pd.DataFrame(features, columns=columns).assign(
    r1=rating['r1'],
    r2=rating['r2'],
    y=rating['y'],
)

In [28]:
# Display the assembled feature/target table.
data

Unnamed: 0,repos,stars,followers,following,foll_ratio,lang,n_lang,org_flag,n_cont,last_cont,stab_cont,cont_repo_ratio,r1,r2,y
0,71,306,106,9,11.78,"{Rust, Ruby, Shell, JavaScript, HTML, PHP, Cof...",10,1,437,0,0.25,6,3.5,4.5,4.0
1,0,0,0,1,0.0,{},0,0,1,159,0.0,0,0.0,0.0,0.0
2,12,1,182,0,0.0,"{Ruby, Shell, JavaScript, PHP, CSS}",5,0,433,3,0.23,36,4.0,3.0,4.0
3,13,31,0,17,0.0,"{JavaScript, HTML, PHP}",3,0,20,0,0.5,2,2.0,2.0,2.0
4,99,344,370,16,23.12,"{Jupyter Notebook, Shell, JavaScript, HTML, Do...",10,0,255,6,0.02,3,5.0,3.0,4.0
5,264,377,374,212,1.76,"{TeX, HTML, Jupyter Notebook, Python}",4,1,1194,2,0.25,5,4.0,4.5,4.0
6,21,110,18,5,3.6,"{TypeScript, JavaScript, HTML, PHP, Go, Clojur...",7,0,107,1,0.36,5,3.0,2.5,3.0
7,1,0,0,1,0.0,{},0,0,2,107,0.0,2,0.0,0.0,0.0
8,13,730,17,73,0.23,"{ColdFusion, JavaScript, Java, C++, CSS}",5,0,9,90,0.0,1,2.0,1.0,2.0
9,14,9,2400,1,2400.0,"{HTML, PHP, CSS}",3,0,0,0,0.0,0,3.0,2.0,2.0


In [29]:
# Save the feature/target table; with the default to_csv settings the index is written as the first CSV column.
data.to_csv('../data/gitrater.csv')

In [30]:
# Raise the display limit to 300 rows, then show the table again.
# NOTE(review): this option is global and stays in effect for every later display in the kernel.
pd.set_option('display.max_rows', 300)
data

Unnamed: 0,repos,stars,followers,following,foll_ratio,lang,n_lang,org_flag,n_cont,last_cont,stab_cont,cont_repo_ratio,r1,r2,y
0,71,306,106,9,11.78,"{Rust, Ruby, Shell, JavaScript, HTML, PHP, Cof...",10,1,437,0,0.25,6,3.5,4.5,4.0
1,0,0,0,1,0.0,{},0,0,1,159,0.0,0,0.0,0.0,0.0
2,12,1,182,0,0.0,"{Ruby, Shell, JavaScript, PHP, CSS}",5,0,433,3,0.23,36,4.0,3.0,4.0
3,13,31,0,17,0.0,"{JavaScript, HTML, PHP}",3,0,20,0,0.5,2,2.0,2.0,2.0
4,99,344,370,16,23.12,"{Jupyter Notebook, Shell, JavaScript, HTML, Do...",10,0,255,6,0.02,3,5.0,3.0,4.0
5,264,377,374,212,1.76,"{TeX, HTML, Jupyter Notebook, Python}",4,1,1194,2,0.25,5,4.0,4.5,4.0
6,21,110,18,5,3.6,"{TypeScript, JavaScript, HTML, PHP, Go, Clojur...",7,0,107,1,0.36,5,3.0,2.5,3.0
7,1,0,0,1,0.0,{},0,0,2,107,0.0,2,0.0,0.0,0.0
8,13,730,17,73,0.23,"{ColdFusion, JavaScript, Java, C++, CSS}",5,0,9,90,0.0,1,2.0,1.0,2.0
9,14,9,2400,1,2400.0,"{HTML, PHP, CSS}",3,0,0,0,0.0,0,3.0,2.0,2.0
