# Network analysis for coauthors - network through time

This notebook examines how the coauthorship network changes through time, using network metrics computed for successive time periods.

In [8]:
%load_ext autoreload
%autoreload 2

from src import util as u

import pandas as pd
from functools import reduce
import operator
import networkx as nx

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Read the node table from the CSV path exposed by the project util module (u.fn_nodes).
# NOTE(review): `nodes` is not referenced by any of the cells visible in this section — confirm it is used later.
nodes = pd.read_csv(u.fn_nodes)

In [10]:
# Study window and the two interior boundaries that cut it into three
# equal-length periods: [start, year1), [year1, year2), [year2, end].
start = 1758
end = 2018
years = end - start

# NOTE(review): `timepoints` is not used by the visible cells; presumably it
# counts the four boundaries (start, year1, year2, end) — confirm.
timepoints = 4

# round(x, 0) returns a float, so year1 / year2 are floats as well.
interval = round(years / 3, 0)
year1 = start + interval
year2 = start + 2 * interval

In [11]:
# Show the lower and upper boundary of each of the three periods, one per line.
for period_start, period_end in (
    (start, year1),
    (year1, year2),  # postal mail invented
    (year2, end),    # airlines invented
):
    print(period_start, period_end)

1758 1845.0
1845.0 1932.0
1932.0 2018


In [18]:
# Table returned by the project helper, without its 'status' column.
auth = u.get_spp_df().drop(columns=['status'])

# Slice the rows by 'date' into the three periods. The first two periods are
# half-open (upper boundary excluded); the last one includes `end`.
# Only the 'idx' and 'full_name_of_describer' columns are kept.
auth1 = auth.loc[(auth['date'] >= start) & (auth['date'] < year1),
                 ['idx', 'full_name_of_describer']]
auth2 = auth.loc[(auth['date'] >= year1) & (auth['date'] < year2),
                 ['idx', 'full_name_of_describer']]
auth3 = auth.loc[(auth['date'] >= year2) & (auth['date'] <= end),
                 ['idx', 'full_name_of_describer']]

# For every period: one Series indexed by 'idx' whose value is the
# '; '-separated string of all describer names sharing that idx.
# NOTE(review): presumably 'idx' identifies a species — confirm against u.get_spp_df.
df_li = [
    period.groupby('idx')['full_name_of_describer'].apply(
        lambda names: '; '.join(names)
    )
    for period in (auth1, auth2, auth3)
]

  if (await self.run_code(code, result,  async_=asy)):
  spp = spp[~spp['duplicated']][


In [19]:
# Get pairs

def li_pairs(source):
    """Return every unordered pair of names found in a '; '-separated string.

    Parameters
    ----------
    source : str
        Names joined with "; " (for example "A; B; C").

    Returns
    -------
    list of tuple
        All (earlier, later) combinations in the order the names appear.
        A string holding a single name yields ``[(name, None)]``.
    """
    names = str.split(source, "; ")

    # A lone name has no partner: pair it with None.
    if len(names) <= 1:
        return [(names[0], None)]

    return [
        (first, second)
        for position, first in enumerate(names)
        for second in names[position + 1:]
    ]


In [20]:
print(df_li[2].iloc[2])
print(li_pairs(df_li[2].iloc[2]))
print(li_pairs(df_li[2].iloc[63]))

Ricardo Ayala Barajas; Michael Scott Engel
[('Ricardo Ayala Barajas', 'Michael Scott Engel')]
[('Fritz Josef [Friedrich] Gusenleitner', 'Maximilian Schwarz')]


In [21]:
# Turn each '; '-joined name string into its list of name pairs, period by period.
# NOTE: this overwrites df_li, so the cell cannot be run twice in a row.
df_li = [joined_names.apply(li_pairs) for joined_names in df_li]

In [22]:
# Flatten each period's Series of pair lists into one flat list of (p1, p2) tuples.
# Every pair of every record is kept: taking only the first element of each
# list would drop the remaining pairs whenever a record has three or more names.
# Iterating the Series directly yields its values, which avoids the
# Series.iteritems() method that newer pandas versions no longer provide.
# One output list is produced per entry of df_li, whatever the number of periods.
li = [
    [pair for record_pairs in period_pairs for pair in record_pairs]
    for period_pairs in df_li
]

In [23]:
# For each period: tabulate the pairs as columns p1/p2, count how often each
# (p1, p2) combination occurs, and flatten the result back into a frame whose
# count column is labelled 0.
# NOTE(review): (A, B) and (B, A) form separate groups here — confirm the
# pair order is consistent upstream if the counts are used as edge weights.
df_li = [
    pd.DataFrame(pairs, columns=['p1', 'p2'])
    .groupby(['p1', 'p2'])
    .size()
    .reset_index()
    for pairs in li
]

In [24]:
def get_edges(df):
    """Convert a pair-count frame into a list of weighted edge tuples.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns 'p1', 'p2' and a column labelled 0.

    Returns
    -------
    list of tuple
        One ``(p1, p2, value_of_column_0)`` tuple per row, in row order.
    """
    return [(record.p1, record.p2, record[0]) for _, record in df.iterrows()]

In [25]:
# Replace each period's pair-count frame with its list of (p1, p2, count) edge tuples.
df_li = list(map(get_edges, df_li))

In [26]:
def get_node_names(edges):
    """Collect the distinct endpoint names appearing in a list of edges.

    Parameters
    ----------
    edges : iterable of tuple
        Edge tuples whose first two items are the endpoint names; any further
        items (such as a weight) are ignored.

    Returns
    -------
    set
        Every value found in position 0 or 1 of an edge. An empty input
        yields an empty set instead of raising.
    """
    # A set comprehension handles the empty case and avoids repeatedly
    # concatenating intermediate lists.
    return {name for edge in edges for name in (edge[0], edge[1])}

In [27]:
# Distinct endpoint names of each period's edge list.
node_names = list(map(get_node_names, df_li))

In [28]:
# Load into graph: build one undirected graph per period, so the number of
# graphs always follows the number of edge lists instead of a fixed count.
g_li = []
for period_nodes, period_edges in zip(node_names, df_li):
    period_graph = nx.Graph()
    period_graph.add_nodes_from(period_nodes)
    # Edge tuples are (p1, p2, count); the third item is stored as the edge weight.
    period_graph.add_weighted_edges_from(period_edges)
    g_li.append(period_graph)

In [29]:
# Print summary metrics for each period's graph: density, transitivity,
# number of connected components, diameter of the largest component,
# and finally the list of node names.
for i in range(len(df_li)):
    print("Network", i)
    
    # Density, reported as a percentage rounded to one decimal
    density = nx.density(g_li[i])
    print("Network density:", round(density*100, 1), "%")
    
    # Transitivity of the graph, reported under the label "Triadic closure"
    triadic_closure = nx.transitivity(g_li[i])
    print("Triadic closure:", round(triadic_closure*100, 1), "%")
    
    # Connected components, largest first; only their number is printed
    subgraphs = [c for c in sorted(nx.connected_components(g_li[i]), key=len, reverse=True)]
    print("Number of subgraphs:", len(subgraphs))
    
    # Largest component (connected components are computed a second time here)
    # NOTE(review): max() raises ValueError if the graph has no nodes at all.
    components = nx.connected_components(g_li[i])
    largest_component = max(components, key=len) # max number of nodes

    # Restrict the graph to the largest component and measure the diameter
    # of that subgraph only, not of the whole (possibly disconnected) graph.
    subgraph = g_li[i].subgraph(largest_component)
    diameter = nx.diameter(subgraph)
    print("Network diameter of largest component:", diameter)
    
    # Node names only: the first item of each entry from nodes(data=True)
    print([v[0] for v in g_li[i].nodes(data=True)])
    
    
    print("\n")

Network 0
Network density: 33.3 %
Triadic closure: 0 %
Number of subgraphs: 2
Network diameter of largest component: 1
['Léon [Jean Marie] Dufour', '[Jean-Pierre Omer Anne] Édouard Perris', 'Jean Guillaume Audinet-Serville', 'Amédée Louis Michel le Peletier, comte de Saint-Fargeau']


Network 1
Network density: 4.0 %
Triadic closure: 0 %
Number of subgraphs: 11
Network diameter of largest component: 3
['Lucy Evelyn Cheesman', 'Mariano Lucia', 'Charles H. Hicks', 'Shōnen Matsumura', 'Arthur Gibson', 'Woldemar Trautmann', 'John Harvey Lovell', 'Wilmatte Porter Cockerell', 'Theodore Dru Alison Cockerell', 'Edward Sharpe Gaige Titus', 'Grace Adelbert Sandhouse', 'Franz von Wagner', 'Myron Harmon Swenk', 'Hans Bischoff', 'Henry Lorenz Viereck', 'Alberto H. Abrahamovich', 'Jessie E. Casad', 'Léon Marc Herminie Fairmaire', 'Ruth Sumner', 'Tohru Uchida', 'Karl Wilhelm von Dalla Torre', 'Otto Emil Plath', 'Joseph Charles Bequaert', 'Robert Cyril Layton Perkins', 'Hans [Franz Paul] Hedicke', 'Ar