In [53]:
import pandas as pd
import re
import json
import networkx as nx

In [2]:
# Path to the Excel workbook that holds the tree data (relative to the notebook's working directory)
file_path = 'data/re-raleigh/kerri.xlsx'
# Open the workbook as an ExcelFile so its sheet names can be inspected
excel_data = pd.ExcelFile(file_path)

In [3]:
# List the worksheets available in the workbook
sheet_names = excel_data.sheet_names
print(sheet_names)

['nodes', 'edges', 'adjacency (sparse)', 'adjacency (full)']


In [4]:
# Read the 'nodes' and 'edges' sheets in one call (a list of sheet names returns a
# dict keyed by sheet name). header=None keeps the first row as data.
sheets = pd.read_excel(file_path, sheet_name=['nodes', 'edges'], header=None)
nodes_excel = sheets['nodes']
edges_excel = sheets['edges']

In [14]:
# Preview the first five raw node rows
nodes_excel.head()

Unnamed: 0,0
0,'John Strutt (P1904)'
1,'Joseph Thomson (P1906)'
2,'Lawrence Bragg (P1915)'
3,'Charles Barkla (P1917)'
4,'Niels Bohr (P1922)'


In [15]:
# Preview the first five raw edge rows
edges_excel.head()

Unnamed: 0,0,1
0,'John Strutt (P1904)','Joseph Thomson (P1906)'
1,'Joseph Thomson (P1906)','Lawrence Bragg (P1915)'
2,'Joseph Thomson (P1906)','Charles Barkla (P1917)'
3,'Joseph Thomson (P1906)','Niels Bohr (P1922)'
4,'Joseph Thomson (P1906)','Ernst Rutherford (C1908)'


In [19]:
# Strip the single quotes (speech marks) wrapped around every value in nodes_excel
nodes = nodes_excel.map(lambda cell: cell.strip("'"))

In [21]:
# Strip the single quotes (speech marks) wrapped around every value in edges_excel
edges = edges_excel.map(lambda cell: cell.strip("'"))

In [23]:
# Preview the cleaned edge list (quotes removed)
edges.head()

Unnamed: 0,0,1
0,John Strutt (P1904),Joseph Thomson (P1906)
1,Joseph Thomson (P1906),Lawrence Bragg (P1915)
2,Joseph Thomson (P1906),Charles Barkla (P1917)
3,Joseph Thomson (P1906),Niels Bohr (P1922)
4,Joseph Thomson (P1906),Ernst Rutherford (C1908)


In [25]:
# Match names that carry a Nobel prize annotation: the name must END with a
# parenthesised category letter (P, C, M or E) followed by a four-digit year,
# e.g. 'John Strutt (P1904)'.
# The trailing `\s*$` anchors the annotation to the end of the string (tolerating
# stray trailing whitespace); without it a name with text after the annotation
# would also match when used with `.match()`.
nobel_pattern = re.compile(r'^.*\([PCME]\d{4}\)\s*$')

In [27]:
# Create an empty directed graph that will hold the academic tree
academic_tree = nx.DiGraph()

In [30]:
# Add one directed edge per row of `edges`: column 0 is the source, column 1 the target
academic_tree.add_edges_from(zip(edges[0], edges[1]))

In [32]:
# Preview the first five nodes in the graph
list(academic_tree.nodes)[:5]

['John Strutt (P1904)',
 'Joseph Thomson (P1906)',
 'Lawrence Bragg (P1915)',
 'Charles Barkla (P1917)',
 'Niels Bohr (P1922)']

In [38]:
# Flag every node with a boolean 'nobel' attribute
for person in academic_tree.nodes:
    # True when the name matches the Nobel annotation pattern, False otherwise
    academic_tree.nodes[person]['nobel'] = nobel_pattern.match(person) is not None

In [36]:
# Rayleigh (the 'John Strutt (P1904)' node) has a single direct successor in this tree:
# Joseph Thomson, whose annotation marks a 1906 Nobel prize.
list(academic_tree.successors('John Strutt (P1904)'))

['Joseph Thomson (P1906)']

In [39]:
# Thomson really got the tree going: among his direct successors in this data,
# 9 carry a physics (P) annotation and 2 a chemistry (C) annotation.
list(academic_tree.successors('Joseph Thomson (P1906)'))

['Lawrence Bragg (P1915)',
 'Charles Barkla (P1917)',
 'Niels Bohr (P1922)',
 'Ernst Rutherford (C1908)',
 'Owen Richardson (P1928)',
 'Hugh Callendar',
 'Charles Wilson (P1927)',
 'Max Born (P1954)',
 'Clinton Davisson (P1937)',
 'George Thomson (P1937)',
 'Edward Appleton (P1946)',
 'Robert Oppenheimer',
 'Paul Langevin',
 'Eli Burton',
 'John Townsend',
 'David Keys',
 'Thomas Laby',
 'Peter Pringsheim',
 'Francis Aston (C1922)',
 'Reginald James']

In [40]:
# Direct successors of Joseph Thomson that are flagged as Nobel laureates
[
    student
    for student in academic_tree.successors('Joseph Thomson (P1906)')
    if academic_tree.nodes[student]['nobel']
]

['Lawrence Bragg (P1915)',
 'Charles Barkla (P1917)',
 'Niels Bohr (P1922)',
 'Ernst Rutherford (C1908)',
 'Owen Richardson (P1928)',
 'Charles Wilson (P1927)',
 'Max Born (P1954)',
 'Clinton Davisson (P1937)',
 'George Thomson (P1937)',
 'Edward Appleton (P1946)',
 'Francis Aston (C1922)']

In [42]:
# Store each node's distance from the root as a 'level' attribute.
# `levels` maps node -> shortest-path length from 'John Strutt (P1904)'.
levels = nx.shortest_path_length(academic_tree, source='John Strutt (P1904)')
for person in levels:
    academic_tree.nodes[person]['level'] = levels[person]

In [45]:
# Inspect the node -> level mapping
levels

{'John Strutt (P1904)': 0,
 'Joseph Thomson (P1906)': 1,
 'Lawrence Bragg (P1915)': 2,
 'Hugh Callendar': 2,
 'George Thomson (P1937)': 2,
 'Thomas Laby': 2,
 'Robert Oppenheimer': 2,
 'Charles Barkla (P1917)': 2,
 'Paul Langevin': 2,
 'Ernst Rutherford (C1908)': 2,
 'Max Born (P1954)': 2,
 'Reginald James': 2,
 'Peter Pringsheim': 2,
 'Clinton Davisson (P1937)': 2,
 'Charles Wilson (P1927)': 2,
 'Edward Appleton (P1946)': 2,
 'Niels Bohr (P1922)': 2,
 'Owen Richardson (P1928)': 2,
 'David Keys': 2,
 'Eli Burton': 2,
 'Francis Aston (C1922)': 2,
 'John Townsend': 2,
 'John Ratcliffe': 3,
 'Arthur Compton (P1927)': 3,
 'John Cockcroft (P1951)': 3,
 'Don Yost': 3,
 'Isidor Rabi (P1944)': 3,
 'Yoshio Nishina': 3,
 'Willis Lamb (P1955)': 3,
 'Lev Landau (P1962)': 3,
 'Aage Bohr (P1975)': 3,
 'David Bohm': 3,
 'Wolfgang Pauli (P1945)': 3,
 'Philip Morrison': 3,
 'Nevill Mott (P1977)': 3,
 'Ernest Walton (P1951)': 3,
 'David Shoenberg': 3,
 'Otto Frisch': 3,
 'Edmond Bauer': 3,
 'Frederick S

In [46]:
# Print each direct successor of Joseph Thomson alongside its attribute dict
for student in academic_tree.successors('Joseph Thomson (P1906)'):
    attrs = academic_tree.nodes[student]
    print(student, attrs)

Lawrence Bragg (P1915) {'nobel': True, 'level': 2}
Charles Barkla (P1917) {'nobel': True, 'level': 2}
Niels Bohr (P1922) {'nobel': True, 'level': 2}
Ernst Rutherford (C1908) {'nobel': True, 'level': 2}
Owen Richardson (P1928) {'nobel': True, 'level': 2}
Hugh Callendar {'nobel': False, 'level': 2}
Charles Wilson (P1927) {'nobel': True, 'level': 2}
Max Born (P1954) {'nobel': True, 'level': 2}
Clinton Davisson (P1937) {'nobel': True, 'level': 2}
George Thomson (P1937) {'nobel': True, 'level': 2}
Edward Appleton (P1946) {'nobel': True, 'level': 2}
Robert Oppenheimer {'nobel': False, 'level': 2}
Paul Langevin {'nobel': False, 'level': 2}
Eli Burton {'nobel': False, 'level': 2}
John Townsend {'nobel': False, 'level': 2}
David Keys {'nobel': False, 'level': 2}
Thomas Laby {'nobel': False, 'level': 2}
Peter Pringsheim {'nobel': False, 'level': 2}
Francis Aston (C1922) {'nobel': True, 'level': 2}
Reginald James {'nobel': False, 'level': 2}


In [47]:
# Print each direct successor of Lawrence Bragg alongside its attribute dict
for student in academic_tree.successors('Lawrence Bragg (P1915)'):
    attrs = academic_tree.nodes[student]
    print(student, attrs)

Edward Appleton (P1946) {'nobel': True, 'level': 2}
Max Perutz (C1962) {'nobel': True, 'level': 3}
John Kendrew (C1962) {'nobel': True, 'level': 3}


In [48]:
# Build the node and link records for d3.
# A node without a 'nobel' attribute is exported as False; a node without a
# 'level' attribute is exported with level None.
# NOTE(review): confirm whether nodes with no level should be exported at all.
nodes_d3 = [
    {"id": person, "nobel": attrs.get('nobel', False), "level": attrs.get('level')}
    for person, attrs in academic_tree.nodes(data=True)
]
links_d3 = [
    {"source": mentor, "target": student}
    for mentor, student in academic_tree.edges
]

In [49]:
# Preview the first five d3 node records
nodes_d3[:5]

[{'id': 'John Strutt (P1904)', 'nobel': True, 'level': 0},
 {'id': 'Joseph Thomson (P1906)', 'nobel': True, 'level': 1},
 {'id': 'Lawrence Bragg (P1915)', 'nobel': True, 'level': 2},
 {'id': 'Charles Barkla (P1917)', 'nobel': True, 'level': 2},
 {'id': 'Niels Bohr (P1922)', 'nobel': True, 'level': 2}]

In [51]:
# Bundle the node and link records into the single object that gets exported
d3_data = dict(nodes=nodes_d3, links=links_d3)

In [52]:
# Destination path for the JSON export (this cell only defines the path)
d3_file_path = './data/nobel-tree-john-strutt-subgraph.json'

In [54]:
# Write the d3 payload to disk as JSON.
# The encoding is given explicitly so the file does not depend on the
# platform's locale default.
with open(d3_file_path, 'w', encoding='utf-8') as file:
    json.dump(d3_data, file)