In [94]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import string
import datetime

In [95]:
# Load the genealogy table and replace every missing cell with a single space,
# so the school columns hold strings throughout.
# (For a quick experiment, truncate with: df = df.head(500))
df = pd.read_csv('../data/math_geneology_final.csv').fillna(' ')

In [100]:
# Trim leading/trailing whitespace from both school-name columns.
# A cell that held only whitespace becomes the empty string ''.
school_columns = ('student_school', 'advisor_school')
for name in school_columns:
    df[name] = df[name].map(str.strip)

# Nodes

In [110]:
def _unique_per_school(frame, school_col, counted_col):
    """Count the distinct ``counted_col`` values for each ``school_col`` value.

    Returns a two-column frame: ``school`` (the group key) and ``id``
    (the number of distinct values seen in that group).
    """
    per_school = frame.groupby(school_col)[counted_col].nunique().reset_index()
    return per_school.rename(columns={school_col: 'school', counted_col: 'id'})


# Distinct student_id values per student_school ...
schools_1 = _unique_per_school(df, 'student_school', 'student_id')
# ... and distinct advisor_name values per advisor_school.
schools_2 = _unique_per_school(df, 'advisor_school', 'advisor_name')
# Stack both tallies row-wise; a school can appear once in each half.
df_schools = pd.concat([schools_1, schools_2], sort=False)

In [111]:
# Discard rows whose school name is the empty string, then add the counts
# together so that every school ends up on exactly one row.
has_name = df_schools['school'] != ''
df_schools = df_schools.loc[has_name].reset_index(drop=True)
df_nodes = df_schools.groupby('school')['id'].sum().reset_index()

In [112]:
# Build (node, attribute-dict) pairs, the form Graph.add_nodes_from accepts.
# The counts are read straight off the column instead of one .loc lookup per
# row, and are cast to built-in int so each node attribute is a plain Python
# value rather than a numpy scalar.
# NOTE(review): numpy scalar attributes were not accepted by every networkx
# GraphML writer version -- confirm against the installed release.
ids = df_nodes['school']
nodes_dict = [{'id': int(n)} for n in df_nodes['id']]
nodes = list(zip(ids, nodes_dict))

# Edges

In [152]:
# Count the non-null `id` values of df for every
# (advisor_school, student_school) pair.
df_edges = (df
            .groupby(['advisor_school', 'student_school'])['id']
            .count()
            .reset_index())
# Flag the pairs whose two ends are the same school.
df_edges['same_school'] = df_edges['advisor_school'].eq(df_edges['student_school'])
# Keep only pairs with two non-empty, distinct school names.
keep = ((df_edges['student_school'] != '')
        & (df_edges['advisor_school'] != '')
        & ~df_edges['same_school'])
df_edges = df_edges.loc[keep].reset_index(drop=True)

In [153]:
# One attribute dict per edge, in df_edges row order. The counts are read
# straight off the column (no per-row .loc lookup) and cast to built-in int so
# the edge attribute is a plain Python value rather than a numpy scalar.
edge_dict = [{'count': int(n)} for n in df_edges['id']]

In [154]:
# (source, target, attributes) triples: advisor_school -> student_school.
edges = list(zip(df_edges['advisor_school'],
                 df_edges['student_school'],
                 edge_dict))

In [156]:
# Number of edge rows whose advisor school name contains 'New York Univ'.
nyu_mask = df_edges['advisor_school'].str.contains('New York Univ')
len(df_edges.loc[nyu_mask])

273

# Add to Graph

In [160]:
# Start from an empty directed graph; nodes and edges are loaded separately.
G = nx.DiGraph()

In [161]:
# `nodes` holds (school, {'id': ...}) pairs, so each school becomes a node
# carrying an 'id' attribute.
G.add_nodes_from(nodes)
# `edges` holds (advisor_school, student_school, {'count': ...}) triples, so
# each edge points from the advisor's school to the student's school.
G.add_edges_from(edges)

# Analysis

In [188]:
# Degree Centrality
# Ten schools with the highest nx.degree_centrality score, best first.

cent = nx.degree_centrality(G)
df_cent = pd.Series(cent).to_frame()
df_cent.sort_values(by=0, ascending=False).iloc[:10]

Unnamed: 0,0
"University of California, Berkeley",0.144882
Lomonosov Moscow State University,0.134959
Massachusetts Institute of Technology,0.131273
Université Pierre-et-Marie-Curie - Paris VI,0.122767
University of Cambridge,0.119081
University of Oxford,0.118514
Stanford University,0.115963
Princeton University,0.115112
University of Illinois at Urbana-Champaign,0.104621
University of Wisconsin-Madison,0.104621


In [189]:
# Out-Degree Centrality
# Ten schools with the highest nx.out_degree_centrality score, best first.

cent = nx.out_degree_centrality(G)
df_cent = pd.Series(cent).to_frame()
df_cent.sort_values(by=0, ascending=False).iloc[:10]

Unnamed: 0,0
Lomonosov Moscow State University,0.12135
"University of California, Berkeley",0.114828
Massachusetts Institute of Technology,0.096399
Université Pierre-et-Marie-Curie - Paris VI,0.095832
University of Cambridge,0.094982
University of Oxford,0.088177
Princeton University,0.087893
Stanford University,0.087326
Harvard University,0.080805
University of Wisconsin-Madison,0.080238


In [190]:
# In-Degree Centrality
# Ten schools with the highest nx.in_degree_centrality score, best first.

cent = nx.in_degree_centrality(G)
df_cent = pd.Series(cent).to_frame()
df_cent.sort_values(by=0, ascending=False).iloc[:10]

Unnamed: 0,0
ETH Zürich,0.046215
Massachusetts Institute of Technology,0.034874
University of Illinois at Urbana-Champaign,0.034307
Purdue University,0.03374
University of Michigan,0.032889
The Pennsylvania State University,0.030904
University of Oxford,0.030337
University of Texas at Austin,0.030054
"University of California, Berkeley",0.030054
Technische Universität Berlin,0.030054


In [193]:
# Weighted degree of every school. The edges of G store their weight under the
# 'count' attribute ('id' exists only on nodes). networkx treats an edge that
# lacks the requested weight key as weight 1, so weight='id' silently returned
# the plain unweighted degree.
test = G.degree(weight='count')

In [187]:
# Eigenvector Centrality

cent = nx.eigenvector_centrality(G)
df_cent = pd.DataFrame(pd.Series(cent))
df_cent.sort_values(0,ascending=False).head(10)

Unnamed: 0,0
University of Illinois at Urbana-Champaign,0.114627
Purdue University,0.112173
ETH Zürich,0.111318
University of Michigan,0.109663
Texas A&M University,0.109458
University of Texas at Austin,0.103415
Massachusetts Institute of Technology,0.100572
The Pennsylvania State University,0.10015
Georgia Institute of Technology,0.098499
The Ohio State University,0.097851


# Export Graph

In [80]:
# Write G to a GraphML file next to the input data.
nx.write_graphml(G,'../data/g.xml')