In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import string
import datetime

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the raw genealogy table and replace every missing cell with a single
# space; the cleaning cells below test against ' ' to detect missing values.
df = pd.read_csv('../data/math_geneology_final.csv').fillna(' ')
# df = df.head(500)  # uncomment to iterate on a small sample

# Clean Data

In [5]:
#### Ensure Certain Columns Exist
# Keep only the rows in which every required column holds a real value.
# A lone space counts as missing.
cols = ['advisor_year', 'student_year', 'advisor_school', 'student_school']
for col in cols:
    is_present = df[col].notnull() & df[col].ne(' ')
    df = df[is_present]

#### School Names

In [6]:
# Trim leading/trailing whitespace from both school-name columns.
for col in ['student_school', 'advisor_school']:
    df[col] = df[col].map(lambda name: name.strip())

#### Years

In [7]:
def fix_year(char, col, frame=None):
    """Keep only the text before the first `char` in over-long year cells.

    Some people have two years in the year column, usually separated by
    "," or "/" or " " or "-". Every cell whose string form is longer than
    four characters is split on `char`; the first piece, stripped of
    surrounding whitespace, is written back in place. Shorter cells are
    left untouched.

    Parameters
    ----------
    char : str
        Separator to split on.
    col : str
        Name of the year column to clean.
    frame : pandas.DataFrame, optional
        Table to modify in place. When omitted, the notebook-level `df` is
        used, which is what the two-argument call has always operated on.

    Returns
    -------
    None
        The table is modified in place.
    """
    target = df if frame is None else frame

    rows_to_fix = target[col].map(lambda value: len(str(value)) > 4)
    if not rows_to_fix.any():
        # Nothing is longer than a plain four-digit year.
        return

    # str(value) mirrors the length test above, so a cell that holds a number
    # rather than text is split on its string form instead of raising
    # AttributeError.
    target.loc[rows_to_fix, col] = [
        str(value).split(char)[0].strip() for value in target.loc[rows_to_fix, col]
    ]
# Apply every known separator, in this order, to both year columns.
year_separators = (',', '/', ' ', '-')
for char in year_separators:
    for year_column in ('advisor_year', 'student_year'):
        fix_year(char, year_column)

In [9]:
# Convert to Integer
# Parse both year columns numerically and drop every row whose year is not a
# number (for example the stray 'Aug.' value). This replaces special-casing
# one known bad string and letting int() crash on any other.
year_cols = ['advisor_year', 'student_year']
years_numeric = df[year_cols].apply(pd.to_numeric, errors='coerce')
has_valid_years = years_numeric.notnull().all(axis=1)

# .copy() makes the filtered table independent, so the column assignments
# below do not write into a slice of the previous frame.
df = df.loc[has_valid_years, :].copy()
for year_col in year_cols:
    df[year_col] = years_numeric.loc[has_valid_years, year_col].astype('int64').to_numpy()

# Keep plausible graduation years only.
df = df.query('student_year < 2020 & student_year > 1000')

# Graph

In [11]:
def make_school_digraph(df_year):
    """Build a directed school-to-school graph from advisor/student records.

    Parameters
    ----------
    df_year : pandas.DataFrame
        Must contain the columns 'student_school', 'advisor_school' and
        'student_id'.

    Returns
    -------
    networkx.DiGraph
        Nodes are the distinct school names found in either school column.
        Each edge points from an advisor school to a student school, and its
        'weight' is the count of 'student_id' values for that pair.
    """
    school_columns = ["student_school", "advisor_school"]
    graph = nx.DiGraph()

    # Nodes: stack both school columns into one and de-duplicate.
    stacked = pd.concat(
        [df_year[[column]].rename(columns={column: "school"}) for column in school_columns]
    )
    graph.add_nodes_from(stacked.drop_duplicates().school.values)

    # Edges: one per (student_school, advisor_school) pair, weighted by how
    # many student records share that pair.
    pair_counts = df_year.groupby(school_columns, as_index=False).agg({"student_id": "count"})
    for record in pair_counts.to_dict(orient="records"):
        graph.add_edge(
            record["advisor_school"],
            record["student_school"],
            weight=record["student_id"],
        )

    return graph

In [12]:
# Build the school network from students whose year is exactly 2010.
df_filtered = df[df['student_year'] == 2010]
school_digraph = make_school_digraph(df_filtered)

# Analysis

Preliminary Notes:
- A path is a sequence of nodes with the property that each consecutive pair in the sequence is connected by an edge. In this context, a path [A -> B -> C] means that at least one student graduated from school A and subsequently taught students at school B, AND at least one student graduated from school B and subsequently taught students at school C.

#### Centrality:

**Degree**: The degree centrality for a node v is the fraction of nodes it is connected to.

- A school with high degree centrality receives/sends professors to/from a wide variety of schools

**Eigenvector**: Eigenvector centrality computes the centrality for a node based on the centrality of its neighbors. (using adjacency matrix)

- A school with high eigenvector centrality receives/sends professors to/from schools that are also highly ranked

**Closeness**: Reciprocal of the average shortest-path distance *to* the node from every node that can reach it (incoming distance). Use `G.reverse()` to measure outward distance instead.

**Betweenness**: Sum of the fraction of all-pairs shortest paths that pass through the node

In [None]:
def get_centrality_measures(G):
    """Compute several centrality scores for every node of a directed graph.

    Parameters
    ----------
    G : networkx.DiGraph
        Graph whose edges carry a 'weight' attribute; the eigenvector and
        betweenness measures read it.

    Returns
    -------
    pandas.DataFrame
        One row per node (indexed by node) with the columns 'degree',
        'in_degree', 'out_degree', 'in_eigenvector', 'out_eigenvector',
        'in_closeness', 'out_closeness' and 'betweenness', plus a 'school'
        column that repeats the index. A graph with no nodes yields an empty
        frame with the same columns.
    """
    measure_names = [
        'degree',
        'in_degree',
        'out_degree',
        'in_eigenvector',
        'out_eigenvector',
        'in_closeness',
        'out_closeness',
        'betweenness',
    ]

    # Eigenvector centrality raises on a graph without nodes, so hand back an
    # empty table with the expected columns instead of crashing the caller.
    if G.number_of_nodes() == 0:
        return pd.DataFrame(columns=measure_names + ['school'])

    # The reversed copy provides the "out" variants; build it once.
    G_reversed = G.reverse()

    # NOTE(review): networkx treats `weight` as a distance when it searches
    # for shortest paths in betweenness_centrality, whereas the graph builder
    # in this notebook stores a student count there - confirm this is intended.
    scores_by_measure = [
        nx.degree_centrality(G),
        nx.in_degree_centrality(G),
        nx.out_degree_centrality(G),
        nx.eigenvector_centrality(G, weight='weight'),
        nx.eigenvector_centrality(G_reversed, weight='weight'),
        nx.closeness_centrality(G),
        nx.closeness_centrality(G_reversed),
        nx.betweenness_centrality(G, weight='weight'),
    ]

    # Each measure is sorted high-to-low before the column-wise concat; with
    # sort=False the row order of the result follows the first (degree) series.
    ranked_series = [pd.Series(scores).sort_values(ascending=False) for scores in scores_by_measure]
    df_centrality = pd.concat(ranked_series, axis=1, sort=False)
    df_centrality.columns = measure_names
    df_centrality['school'] = df_centrality.index

    return df_centrality

In [None]:
school_centralities = get_centrality_measures(school_digraph)
# Correlate the numeric centrality columns only: 'school' is a text label,
# and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
school_centralities.drop(columns='school').corr()

In [None]:
# Show the 30 schools with the highest out-eigenvector centrality.
school_centralities.sort_values('out_eigenvector',ascending=False).head(30)

In [None]:
# Histogram of student years before 1900.
df.loc[df['student_year'] < 1900, 'student_year'].hist()

# Centralities Over Time

In [None]:
# Build one school graph and one centrality table per decade.
years = range(1800, 2018, 10)
school_digraph_list = []
centralities_over_time_list = []

for year in years:
    print(year)

    # Students whose year falls in the decade [year, year + 10).
    df_students_year = df.query('student_year >= @year & student_year < @year + 10')

    # NOTE: this rebinds the notebook-level `school_digraph`, so cells that
    # run after this loop see the graph of the last decade processed.
    school_digraph = make_school_digraph(df_students_year)

    # Tag every row with the decade's starting year.
    df_centralities_year = get_centrality_measures(school_digraph).assign(year=year)

    school_digraph_list.append(school_digraph)
    centralities_over_time_list.append(df_centralities_year)

In [None]:
# Stack the per-decade centrality tables into one long table.
df_centralities = pd.concat(centralities_over_time_list,axis=0,sort=False)

# Lineplots

In [None]:
# Number of rows recorded for each decade start year.
df_centralities['year'].value_counts()

In [None]:
from bokeh.io import show
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.palettes import cividis
from bokeh.plotting import figure

In [None]:
# Preview the first rows of the combined centrality table.
df_centralities.head()

In [None]:
# Subset to Schools that are in top N-number of schools in at least one decade

def line_plot_schools(df_centralities, centrality_measure, n):
    """Show a Bokeh multi-line chart of one centrality measure over time.

    Only schools that rank in the top `n` for `centrality_measure` in at
    least one value of 'year' are drawn, one line per school.

    Parameters
    ----------
    df_centralities : pandas.DataFrame
        Must contain the columns 'year', 'school' and `centrality_measure`.
    centrality_measure : str
        Name of the column plotted on the y axis.
    n : int
        How many top-ranked schools to keep from each year.

    Returns
    -------
    None
        The figure is displayed with `bokeh.io.show`.
    """
    # Gather every school that makes the top `n` in any single year.
    schools_to_keep = set()
    for year in df_centralities['year'].unique():
        year_rows = df_centralities.loc[df_centralities['year'] == year, :]
        ranked = year_rows.sort_values(centrality_measure, ascending=False)
        schools_to_keep.update(ranked['school'].iloc[:n])

    # Vectorised membership test; avoids rebuilding a list from the set once
    # per row of the table.
    keep_mask = df_centralities['school'].isin(schools_to_keep)
    plot_df = df_centralities.loc[keep_mask, :].rename(
        {'year': 'x', centrality_measure: 'y', 'school': 'group'}, axis=1
    )

    # One x-series and one y-series per school, in order of first appearance.
    grp_list = plot_df.group.unique()
    xs = [plot_df.loc[plot_df.group == grp].x for grp in grp_list]
    ys = [plot_df.loc[plot_df.group == grp].y for grp in grp_list]
    source = ColumnDataSource(data=dict(
        x=xs,
        y=ys,
        color=cividis(len(grp_list)),
        group=grp_list))

    # NOTE(review): `plot_width`, `plot_height` and `legend=False` are kept
    # exactly as the notebook had them; confirm they match the installed
    # Bokeh version.
    p3 = figure(plot_width=1600, plot_height=900)
    p3.multi_line(
        xs='x',
        ys='y',
        legend=False,
        source=source,
        line_color='color')

    # Fully transparent line over the same points, carrying 'group' for the
    # hover tooltip.
    source2 = ColumnDataSource(dict(
        invisible_xs=plot_df.x,
        invisible_ys=plot_df.y,
        group=plot_df.group))
    p3.line(
        'invisible_xs',
        'invisible_ys',
        source=source2,
        alpha=0)

    p3.add_tools(HoverTool(show_arrow=False, line_policy='nearest', tooltips=[
        ('group', '@group')
    ]))

    show(p3)

In [None]:
# Draw the same chart for every centrality measure. Each call keeps the
# schools ranked in the top TOP_N_SCHOOLS for that measure in at least one
# decade.
TOP_N_SCHOOLS = 50
centrality_columns = [
    'degree',
    'in_degree',
    'out_degree',
    'in_eigenvector',
    'out_eigenvector',
    'in_closeness',
    'out_closeness',
    'betweenness',
]
for centrality_column in centrality_columns:
    line_plot_schools(df_centralities, centrality_column, TOP_N_SCHOOLS)

# Network Plots

In [None]:
def draw_graph_layout(graph, layout, ax):
    """Draw a graph on `ax` as small dots joined by thin line segments.

    Parameters
    ----------
    graph : object exposing an iterable `edges` attribute
        Each edge is a sequence of node keys.
    layout : dict
        Maps each node key to a position whose first two items are x and y.
    ax : object with `scatter` and `plot` methods
        Drawing target, e.g. a matplotlib Axes.
    """
    positions = list(layout.values())
    node_xs = [position[0] for position in positions]
    node_ys = [position[1] for position in positions]
    ax.scatter(node_xs, node_ys, s=2)

    # One thin segment per edge, joining the positions of its endpoints.
    for edge in list(graph.edges):
        edge_xs = [layout[node][0] for node in edge]
        edge_ys = [layout[node][1] for node in edge]
        ax.plot(edge_xs, edge_ys, c="steelblue", lw=0.1)

In [None]:
# Compute node positions for `school_digraph` with the Kamada-Kawai layout;
# `a` is passed to draw_graph_layout in the next cell.
a = nx.kamada_kawai_layout(school_digraph)

In [None]:
# Draw the school network on a 10x10 figure using the positions stored in `a`.
fig, ax = plt.subplots(figsize = (10, 10))
draw_graph_layout(school_digraph, a, ax)
