In [1]:
import pandas as pd

df_god = pd.read_csv('combined_github_data.csv')

In [2]:
df_god.drop('Unnamed: 0', axis=1)

Unnamed: 0,id_1,id_2,name,ml_target,name_2,ml_target_2
0,0,23977,Eiryyy,0,airtoxin,0
1,1,34526,shawflying,0,ghosind,0
2,1,2370,shawflying,0,jasondu,0
3,1,14683,shawflying,0,chaoslawful,0
4,1,29982,shawflying,0,dead-horse,0
...,...,...,...,...,...,...
288998,37527,37596,SusmoyBarman1,1,rusenask,0
288999,37529,37601,brannondorsey,0,khacluan,0
289000,37644,2347,shriphani,0,bamos,0
289001,25879,2347,jovanidash21,0,bamos,0


In [3]:
df_wd = df_god.loc[(df_god['ml_target'] == 0) & (df_god['ml_target_2'] == 0)]
df_wd = df_wd.drop('Unnamed: 0', axis=1)
df_wd = df_wd.reset_index(drop=True)
df_wd

Unnamed: 0,id_1,id_2,name,ml_target,name_2,ml_target_2
0,0,23977,Eiryyy,0,airtoxin,0
1,1,34526,shawflying,0,ghosind,0
2,1,2370,shawflying,0,jasondu,0
3,1,14683,shawflying,0,chaoslawful,0
4,1,29982,shawflying,0,dead-horse,0
...,...,...,...,...,...,...
224618,31255,37602,simone-sanfratello,0,motocarota,0
224619,19093,2347,greed2411,0,bamos,0
224620,37529,37601,brannondorsey,0,khacluan,0
224621,37644,2347,shriphani,0,bamos,0


In [4]:
df_ml = df_god.loc[(df_god['ml_target'] == 1) & (df_god['ml_target_2'] == 1)]
df_ml = df_ml.drop('Unnamed: 0', axis=1)
df_ml = df_ml.reset_index(drop=True)
df_ml

Unnamed: 0,id_1,id_2,name,ml_target,name_2,ml_target_2
0,6067,20183,danalex97,1,panpan2,1
1,4,9342,sunilangadi2,1,KnightBaron,1
2,32,4293,city292,1,ilearnProgramme,1
3,32,22351,city292,1,QigenLin,1
4,33,5398,riverphoenix,1,danilosoba,1
...,...,...,...,...,...,...
19679,37223,37289,gok03,1,amueller,1
19680,37276,37289,inejc,1,amueller,1
19681,37289,25616,amueller,1,shatu,1
19682,37289,37364,amueller,1,acomets,1


In [5]:
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
import warnings; warnings.simplefilter('ignore')

## 3 Metrics for Machine Learning

In [6]:
ML_symmetric = nx.Graph()

In [7]:
for i in range(len(df_ml)):
    ML_symmetric.add_edge(df_ml.iloc[i]['name'], df_ml.iloc[i]['name_2'])

In [8]:
%%time
degree_ML = nx.degree_centrality(ML_symmetric)

Wall time: 3.99 ms


In [9]:
%%time
closeness_ML = nx.closeness_centrality(ML_symmetric)

Wall time: 3min 27s


In [10]:
%%time
betweeness_ML = nx.betweenness_centrality(ML_symmetric)

Wall time: 4min 29s


In [11]:
df_degree_ML = pd.DataFrame(degree_ML.items())
df_betweeness_ML = pd.DataFrame(betweeness_ML.items())
df_closeness_ML = pd.DataFrame(closeness_ML.items())

In [12]:
df_degree_ML.head()

Unnamed: 0,0,1
0,danalex97,0.000135
1,panpan2,0.000135
2,sunilangadi2,0.000135
3,KnightBaron,0.000135
4,city292,0.000269


In [13]:
df_betweeness_ML.head()

Unnamed: 0,0,1
0,danalex97,0.0
1,panpan2,0.0
2,sunilangadi2,0.0
3,KnightBaron,0.0
4,city292,4e-06


In [14]:
df_closeness_ML.head()

Unnamed: 0,0,1
0,danalex97,0.000135
1,panpan2,0.000135
2,sunilangadi2,0.000135
3,KnightBaron,0.000135
4,city292,0.204604


In [15]:
df_new_ML = df_degree_ML.merge(df_betweeness_ML, left_on=0, right_on=0)
df_result_ML = df_new_ML.merge(df_closeness_ML, left_on=0, right_on=0)

In [16]:
df_result_ML

Unnamed: 0,0,1_x,1_y,1
0,danalex97,0.000135,0.000000,0.000135
1,panpan2,0.000135,0.000000,0.000135
2,sunilangadi2,0.000135,0.000000,0.000135
3,KnightBaron,0.000135,0.000000,0.000135
4,city292,0.000269,0.000004,0.204604
...,...,...,...,...
7426,NishikinoKKi,0.000269,0.000513,0.182965
7427,ABadCandy,0.000135,0.000000,0.189084
7428,jinminhao,0.000135,0.000000,0.228421
7429,imranashraf,0.000135,0.000000,0.184273


In [17]:
df_result_ML = df_result_ML.rename(columns={0:'Account_name','1_x': 'Degree','1_y': 'betweenness',1: 'closeness'})

In [18]:
df_result_ML.head()

Unnamed: 0,Account_name,Degree,betweenness,closeness
0,danalex97,0.000135,0.0,0.000135
1,panpan2,0.000135,0.0,0.000135
2,sunilangadi2,0.000135,0.0,0.000135
3,KnightBaron,0.000135,0.0,0.000135
4,city292,0.000269,4e-06,0.204604


## 3 Metrics for Web Developers

In [19]:
WD_symmetric = nx.Graph()

In [20]:
for i in range(len(df_wd)):
    WD_symmetric.add_edge(df_wd.iloc[i]['name'], df_wd.iloc[i]['name_2'])

In [21]:
%%time
degree_WD = nx.degree_centrality(WD_symmetric)

Wall time: 21.9 ms


In [22]:
%%time
closeness_WD = nx.closeness_centrality(WD_symmetric)

Wall time: 1h 57min 36s


In [23]:
%%time
betweeness_WD = nx.betweenness_centrality(WD_symmetric)

Wall time: 2h 31min 49s


In [24]:
df_degree_WD = pd.DataFrame(degree_WD.items())
df_betweeness_WD = pd.DataFrame(betweeness_WD.items())
df_closeness_WD = pd.DataFrame(closeness_WD.items())

In [25]:
df_new_WD = df_degree_WD.merge(df_betweeness_WD, left_on=0, right_on=0)
df_result_WD = df_new_WD.merge(df_closeness_WD, left_on=0, right_on=0)

In [26]:
df_result_WD

Unnamed: 0,0,1_x,1_y,1
0,Eiryyy,0.000036,0.000000,0.280176
1,airtoxin,0.001084,0.000198,0.389349
2,shawflying,0.000289,0.000073,0.302973
3,ghosind,0.000036,0.000000,0.232481
4,jasondu,0.001445,0.000033,0.384014
...,...,...,...,...
27671,Vasile2k,0.000036,0.000000,0.311267
27672,SPGoding,0.000036,0.000000,0.266209
27673,chrisryancarter,0.000036,0.000000,0.241242
27674,chadmazilly,0.000036,0.000000,0.272011


In [27]:
df_result_WD = df_result_WD.rename(columns={0:'Account_name','1_x': 'Degree','1_y': 'betweenness',1: 'closeness'})

In [28]:
df_result_WD.head()

Unnamed: 0,Account_name,Degree,betweenness,closeness
0,Eiryyy,3.6e-05,0.0,0.280176
1,airtoxin,0.001084,0.000198,0.389349
2,shawflying,0.000289,7.3e-05,0.302973
3,ghosind,3.6e-05,0.0,0.232481
4,jasondu,0.001445,3.3e-05,0.384014


## Calculate centrality

In [29]:
df_result_ML['Centrality']=df_result_ML['Degree']+df_result_ML['betweenness']+df_result_ML['closeness']
df_result_ML = df_result_ML.sort_values(by='Centrality', inplace=True, ascending=False)
df_result_ML.head()

Unnamed: 0,Account_name,Degree,betweenness,closeness,Centrality
0,danalex97,0.000135,0.0,0.000135,0.000269
4860,amosbird,0.000135,0.0,0.000135,0.000269
4818,drummyfish,0.000135,0.0,0.000135,0.000269
4817,Romop5,0.000135,0.0,0.000135,0.000269
4762,dreamInCoDeforlife,0.000135,0.0,0.000135,0.000269


In [30]:
df_result_ML.to_csv('df_result_ML.csv')

In [31]:
df_result_WD['Centrality']=df_result_WD['Degree']+df_result_WD['betweenness']+df_result_WD['closeness']
df_result_WD.sort_values(by='Centrality', inplace=True, ascsending=False)
df_result_WD.head()

Unnamed: 0,Account_name,Degree,betweenness,closeness,Centrality
27530,chahatagarwal,3.6e-05,0.0,3.6e-05,7.2e-05
23749,Nava2,3.6e-05,0.0,3.6e-05,7.2e-05
23748,JonDemelo,3.6e-05,0.0,3.6e-05,7.2e-05
26933,csxiaoyaojianxian,3.6e-05,0.0,3.6e-05,7.2e-05
26934,longIvan,3.6e-05,0.0,3.6e-05,7.2e-05


In [32]:
df_result_WD.to_csv('df_result_WD.csv')