In [40]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import string
import datetime

In [64]:
import matplotlib.pyplot as plt
%matplotlib inline

In [41]:
# Load the genealogy table and replace every missing value with a single space,
# so later cells can treat all cells as strings.
# (For quick experiments, truncate the frame with .head(500) right after loading.)
df = pd.read_csv('../data/math_geneology_final.csv').fillna(' ')

In [42]:
# Trim surrounding whitespace from both school-name columns; values that were
# only the ' ' placeholder become the empty string ''.
for col in ['student_school','advisor_school']:
    df[col] = df[col].str.strip()

# Clean Data

#### Advisor Year

In [43]:
# Keep only rows that carry an advisor year (missing years were filled with ' ' on load).
has_year = df.advisor_year.notnull() & (df.advisor_year != ' ')
df = df.loc[has_year, :]
# Distinct string lengths of the remaining year values; anything other than 4 needs cleaning.
{len(year) for year in df.advisor_year}

{3, 4, 9, 10, 11, 14}

In [44]:
# Fix year with length = 3: repair the truncated value "199".
# The replacement is written as a string so the column stays homogeneous (all str)
# until the explicit integer conversion later on; writing an int here would leave a
# mixed str/int column.
# NOTE(review): 1994 is a hand-picked correction -- confirm against the source record.
df.loc[df.advisor_year == "199", 'advisor_year'] = "1994"

In [45]:
# Some people have 2 years in the year column, usually split by "," or "/" or " " or "-".
# Keep only the first year: for each separator in turn, take the text before the
# separator (stripped) for every value that is still longer than 4 characters.
for separator in (',', '/', ' ', '-'):
    is_too_long = [len(str(year)) > 4 for year in df.advisor_year]
    first_parts = [year.split(separator)[0].strip()
                   for year, too_long in zip(df.advisor_year, is_too_long)
                   if too_long]
    df.loc[is_too_long, 'advisor_year'] = first_parts

In [47]:
# Convert every cleaned year to a Python int (raises ValueError on anything non-numeric).
df['advisor_year'] = list(map(int, df['advisor_year']))

# Nodes

In [48]:
# Per-school head counts, computed separately for the student side and the advisor side.
# Students are counted by distinct student_id, advisors by distinct advisor_name.
student_counts = df.groupby('student_school').agg({'student_id': 'nunique'})
schools_1 = (student_counts
             .reset_index()
             .rename(columns={'student_school': 'school', 'student_id': 'id'}))

advisor_counts = df.groupby('advisor_school').agg({'advisor_name': 'nunique'})
schools_2 = (advisor_counts
             .reset_index()
             .rename(columns={'advisor_school': 'school', 'advisor_name': 'id'}))

# Stack both tables; a school may appear twice here (once per side).
df_schools = pd.concat([schools_1, schools_2], axis=0, sort=False)

In [49]:
# Drop rows whose school name is empty, then add up the student-side and
# advisor-side counts so that each school appears exactly once.
df_schools = df_schools[df_schools['school'] != ''].reset_index(drop=True)
df_nodes = df_schools.groupby('school')['id'].sum().reset_index()

In [50]:
# Build (school, attribute-dict) pairs; each node carries its summed count under 'id'.
ids = df_nodes['school']
nodes_dict = [{'id': n_people} for n_people in df_nodes['id'].values]
nodes = list(zip(ids, nodes_dict))

# Edges

In [52]:
# Restrict the edge data to rows with advisor_year strictly after 2010.
df_filtered = df.loc[df['advisor_year'] > 2010]

In [53]:
# Count rows for every (advisor_school, student_school) pair.
df_edges = (df_filtered
            .groupby(['advisor_school', 'student_school'])['id']
            .count()
            .reset_index())
df_edges['same_school'] = df_edges['advisor_school'].eq(df_edges['student_school'])

# Keep only pairs where both schools are known and differ from each other.
has_both_schools = df_edges['student_school'].ne('') & df_edges['advisor_school'].ne('')
df_edges = df_edges.loc[has_both_schools & ~df_edges['same_school'], :].reset_index(drop=True)

In [54]:
# One attribute dict per edge row, holding the pair's row count under 'count'.
edge_dict = [{'count': n_rows} for n_rows in df_edges['id'].values]

In [55]:
# (source, target, attributes) triples: advisor's school -> student's school.
edges = list(zip(df_edges['advisor_school'], df_edges['student_school'], edge_dict))

In [56]:
# Sanity check: number of edge rows whose advisor school name contains 'New York Univ'.
nyu_advisor_rows = df_edges.advisor_school.str.contains('New York Univ')
len(df_edges.loc[nyu_advisor_rows, 'id'])

5

# Add to Graph

In [57]:
# Empty directed graph; the edges built above point from advisor_school to student_school.
G = nx.DiGraph()

In [58]:
# Nodes come from the full data set (each with an 'id' attribute); edges come from
# the year-filtered frame (each with a 'count' attribute).
G.add_nodes_from(nodes)
G.add_edges_from(edges)

# Analysis

In [59]:
# Degree Centrality
# Compute nx.degree_centrality for every node of G and show the ten highest scores.

cent = nx.degree_centrality(G)
df_cent = pd.Series(cent).to_frame()
df_cent.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0
Massachusetts Institute of Technology,0.004698
Stanford University,0.004404
"University of California, Berkeley",0.00411
The Pennsylvania State University,0.003817
University of Minnesota-Minneapolis,0.00323
Georgia Institute of Technology,0.00323
Princeton University,0.002642
University of Illinois at Urbana-Champaign,0.002642
North Carolina State University,0.002642
University of Oxford,0.002349


In [60]:
# Out-Degree Centrality
# Compute nx.out_degree_centrality for every node of G and show the ten highest scores.
# Outgoing edges start at the advisor's school.

cent = nx.out_degree_centrality(G)
df_cent = pd.Series(cent).to_frame()
df_cent.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0
Massachusetts Institute of Technology,0.003523
Stanford University,0.003523
"University of California, Berkeley",0.00323
The Pennsylvania State University,0.002642
Georgia Institute of Technology,0.002055
Princeton University,0.002055
University of Illinois at Urbana-Champaign,0.002055
University of Michigan,0.001762
Gottfried Wilhelm Leibniz Universität Hannover,0.001762
Yale University,0.001762


In [61]:
# In-Degree Centrality
# Compute nx.in_degree_centrality for every node of G and show the ten highest scores.
# Incoming edges end at the student's school.

cent = nx.in_degree_centrality(G)
df_cent = pd.Series(cent).to_frame()
df_cent.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0
North Carolina State University,0.002349
Purdue University,0.001762
University of Minnesota-Minneapolis,0.001468
Instituto de Matemática Pura e Aplicada,0.001174
The Pennsylvania State University,0.001174
ETH Zürich,0.001174
Georgia Institute of Technology,0.001174
Technische Universität Darmstadt,0.001174
The University of Chicago,0.001174
Massachusetts Institute of Technology,0.001174


In [63]:
# Eigenvector Centrality
# Compute nx.eigenvector_centrality for every node of G and show the ten highest scores.
# NOTE(review): no weight= argument is passed, so the edges' 'count' attribute is not used here.

cent = nx.eigenvector_centrality(G)
df_cent = pd.Series(cent).to_frame()
df_cent.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0
North Carolina State University,0.325394
The University of Chicago,0.29367
University of Wisconsin-Madison,0.232226
Stanford University,0.224977
Massachusetts Institute of Technology,0.222221
Georgia Institute of Technology,0.174656
University of Notre Dame,0.173258
The Pennsylvania State University,0.170591
University of Connecticut,0.163699
University of Waterloo,0.157925


# Export Graph

In [80]:
# Serialize G (nodes, edges and their attributes) as GraphML to ../data/g.xml.
nx.write_graphml(G,'../data/g.xml')

# Plots

In [65]:
def draw_graph_layout(graph, layout, ax):
    """Draw the nodes and edges of ``graph`` onto ``ax`` using precomputed positions.

    Parameters
    ----------
    graph : object exposing ``edges``, an iterable of node tuples.
    layout : mapping from node to a sequence whose first two items are x and y.
    ax : object providing ``scatter`` and ``plot`` (presumably a matplotlib Axes).
    """
    positions = list(layout.values())
    xs = [pos[0] for pos in positions]
    ys = [pos[1] for pos in positions]
    ax.scatter(xs, ys, s = 2)
    # One thin line per edge, joining the positions of the edge's endpoints.
    for edge in graph.edges:
        edge_xs = [layout[node][0] for node in edge]
        edge_ys = [layout[node][1] for node in edge]
        ax.plot(edge_xs, edge_ys, c = "steelblue", lw = 0.1)