In [1]:
import pandas as pd
import re
import json
import networkx as nx

In [2]:
# Location of the Excel workbook, given relative to the current working directory.
file_path = 'data/re-raleigh/kerri.xlsx'
# Open the workbook through pandas so its sheets can be listed and read.
excel_data = pd.ExcelFile(file_path)

In [3]:
# List the worksheet names in the workbook to see which sheet holds the edge list
sheet_names = excel_data.sheet_names
print(sheet_names)

['nodes', 'edges', 'adjacency (sparse)', 'adjacency (full)']


In [4]:
# Load the 'edges' sheet from the workbook handle that is already open
# (excel_data) rather than opening the file a second time from its path.
# header=None keeps the first row as data and labels the columns 0 and 1.
edges_excel = pd.read_excel(excel_data, sheet_name='edges', header=None)

In [5]:
# Preview the first five raw rows (values are still wrapped in single quotes)
edges_excel.head()

Unnamed: 0,0,1
0,'John Strutt (P1904)','Joseph Thomson (P1906)'
1,'Joseph Thomson (P1906)','Lawrence Bragg (P1915)'
2,'Joseph Thomson (P1906)','Charles Barkla (P1917)'
3,'Joseph Thomson (P1906)','Niels Bohr (P1922)'
4,'Joseph Thomson (P1906)','Ernst Rutherford (C1908)'


In [6]:
# Strip the single quotes wrapping each value in the sheet,
# e.g. "'John Strutt (P1904)'" -> "John Strutt (P1904)".
# Non-string cells (such as NaN from a blank spreadsheet cell) are passed
# through unchanged instead of raising AttributeError on .strip().
edges = edges_excel.map(
    lambda value: value.strip("'") if isinstance(value, str) else value
)

In [7]:
# Preview the first five cleaned rows to confirm the quotes are gone
edges.iloc[:5]

Unnamed: 0,0,1
0,John Strutt (P1904),Joseph Thomson (P1906)
1,Joseph Thomson (P1906),Lawrence Bragg (P1915)
2,Joseph Thomson (P1906),Charles Barkla (P1917)
3,Joseph Thomson (P1906),Niels Bohr (P1922)
4,Joseph Thomson (P1906),Ernst Rutherford (C1908)


In [8]:
# Recognise node labels that carry a Nobel prize annotation: a name, one
# whitespace character, then "(<letter><year>)" where the letter is one of
# P, C, M or E and the year has four digits, e.g. "John Strutt (P1904)".
# Capture groups: 1 = name, 2 = category letter, 3 = year (still a string).
# NOTE(review): the pattern has no trailing `$`, so text after the closing
# parenthesis would still match - confirm that is intended.
nobel_regex = r'^(.*)\s\(([PCME])(\d{4})\)'
nobel_pattern = re.compile(nobel_regex)

In [9]:
# Create an empty directed graph; its nodes and edges are filled in from the
# cleaned edge list.
academic_tree = nx.DiGraph()

In [10]:
# Add one directed edge per row: column 0 is the source node and column 1 the
# target. itertuples(index=False, name=None) yields plain (source, target)
# tuples, so no per-row Series has to be built as with iterrows(); nodes are
# created implicitly as their edges are added.
academic_tree.add_edges_from(edges[[0, 1]].itertuples(index=False, name=None))

In [11]:
# Show the first five nodes in insertion order
list(academic_tree.nodes)[:5]

['John Strutt (P1904)',
 'Joseph Thomson (P1906)',
 'Lawrence Bragg (P1915)',
 'Charles Barkla (P1917)',
 'Niels Bohr (P1922)']

In [12]:
# Show the first five (source, target) edges
list(academic_tree.edges)[:5]

[('John Strutt (P1904)', 'Joseph Thomson (P1906)'),
 ('Joseph Thomson (P1906)', 'Lawrence Bragg (P1915)'),
 ('Joseph Thomson (P1906)', 'Charles Barkla (P1917)'),
 ('Joseph Thomson (P1906)', 'Niels Bohr (P1922)'),
 ('Joseph Thomson (P1906)', 'Ernst Rutherford (C1908)')]

In [13]:
# Attach a 'nobel' attribute dict to every node, derived from the node's label.
for name in academic_tree.nodes:
    annotation = nobel_pattern.match(name)
    if annotation is None:
        # No prize annotation in the label: store placeholder values.
        nobel_info = {'award': False, 'category': 'NA', 'year': 0}
    else:
        # Groups 2 and 3 hold the category letter and the four-digit year.
        nobel_info = {
            'award': True,
            'category': annotation.group(2),
            'year': int(annotation.group(3)),
        }
    academic_tree.nodes[name]['nobel'] = nobel_info

In [14]:
# Sanity check: John Strutt's label carries "(P1904)", so his 'nobel' attribute
# should show award=True, category 'P' and year 1904.
academic_tree.nodes['John Strutt (P1904)']

{'nobel': {'award': True, 'category': 'P', 'year': 1904}}

In [15]:
# Rayleigh (John Strutt) has only one direct successor in this graph: Joseph Thomson,
# himself a Nobel laureate (label annotation P1906).
list(academic_tree.successors('John Strutt (P1904)'))

['Joseph Thomson (P1906)']

In [16]:
# Thomson really got the tree going: his direct successors in this dataset include
# 11 Nobel laureates, 9 annotated P (physics) and 2 annotated C (chemistry).
list(academic_tree.successors('Joseph Thomson (P1906)'))

['Lawrence Bragg (P1915)',
 'Charles Barkla (P1917)',
 'Niels Bohr (P1922)',
 'Ernst Rutherford (C1908)',
 'Owen Richardson (P1928)',
 'Hugh Callendar',
 'Charles Wilson (P1927)',
 'Max Born (P1954)',
 'Clinton Davisson (P1937)',
 'George Thomson (P1937)',
 'Edward Appleton (P1946)',
 'Robert Oppenheimer',
 'Paul Langevin',
 'Eli Burton',
 'John Townsend',
 'David Keys',
 'Thomas Laby',
 'Peter Pringsheim',
 'Francis Aston (C1922)',
 'Reginald James']

In [17]:
# Keep only those of Joseph Thomson's direct successors flagged as Nobel laureates
[
    student
    for student in academic_tree.successors('Joseph Thomson (P1906)')
    if academic_tree.nodes[student]['nobel']['award']
]

['Lawrence Bragg (P1915)',
 'Charles Barkla (P1917)',
 'Niels Bohr (P1922)',
 'Ernst Rutherford (C1908)',
 'Owen Richardson (P1928)',
 'Charles Wilson (P1927)',
 'Max Born (P1954)',
 'Clinton Davisson (P1937)',
 'George Thomson (P1937)',
 'Edward Appleton (P1946)',
 'Francis Aston (C1922)']

In [18]:
# Record each node's level: the number of edges on the shortest directed path
# from the root node, John Strutt. Nodes that cannot be reached from the root
# do not appear in `levels` and therefore receive no 'level' attribute.
levels = nx.shortest_path_length(academic_tree, source='John Strutt (P1904)')
nx.set_node_attributes(academic_tree, levels, name='level')

In [19]:
# Peek at the first five (node, level) pairs in the levels dict
list(levels.items())[:5]

[('John Strutt (P1904)', 0),
 ('Joseph Thomson (P1906)', 1),
 ('George Thomson (P1937)', 2),
 ('Charles Wilson (P1927)', 2),
 ('Max Born (P1954)', 2)]

In [20]:
# Print every direct successor of Joseph Thomson together with its attribute dict
for student in academic_tree.successors('Joseph Thomson (P1906)'):
    print(f"{student} {academic_tree.nodes[student]}")

Lawrence Bragg (P1915) {'nobel': {'award': True, 'category': 'P', 'year': 1915}, 'level': 2}
Charles Barkla (P1917) {'nobel': {'award': True, 'category': 'P', 'year': 1917}, 'level': 2}
Niels Bohr (P1922) {'nobel': {'award': True, 'category': 'P', 'year': 1922}, 'level': 2}
Ernst Rutherford (C1908) {'nobel': {'award': True, 'category': 'C', 'year': 1908}, 'level': 2}
Owen Richardson (P1928) {'nobel': {'award': True, 'category': 'P', 'year': 1928}, 'level': 2}
Hugh Callendar {'nobel': {'award': False, 'category': 'NA', 'year': 0}, 'level': 2}
Charles Wilson (P1927) {'nobel': {'award': True, 'category': 'P', 'year': 1927}, 'level': 2}
Max Born (P1954) {'nobel': {'award': True, 'category': 'P', 'year': 1954}, 'level': 2}
Clinton Davisson (P1937) {'nobel': {'award': True, 'category': 'P', 'year': 1937}, 'level': 2}
George Thomson (P1937) {'nobel': {'award': True, 'category': 'P', 'year': 1937}, 'level': 2}
Edward Appleton (P1946) {'nobel': {'award': True, 'category': 'P', 'year': 1946}, 'l

In [21]:
# Print every direct successor of Lawrence Bragg together with its attribute dict
for student in academic_tree.successors('Lawrence Bragg (P1915)'):
    print(f"{student} {academic_tree.nodes[student]}")

Edward Appleton (P1946) {'nobel': {'award': True, 'category': 'P', 'year': 1946}, 'level': 2}
Max Perutz (C1962) {'nobel': {'award': True, 'category': 'C', 'year': 1962}, 'level': 3}
John Kendrew (C1962) {'nobel': {'award': True, 'category': 'C', 'year': 1962}, 'level': 3}


In [22]:
# Flatten the graph into node and link records for d3.
# A node without a 'nobel' attribute gets an empty dict; a node without a
# 'level' attribute gets None (serialised as null in JSON).
nodes_d3 = [
    {"id": person, "nobel": attrs.get('nobel', {}), "level": attrs.get('level')}
    for person, attrs in academic_tree.nodes(data=True)
]
links_d3 = [
    {"source": parent, "target": child}
    for parent, child in academic_tree.edges
]

In [23]:
# Pretty-print the first five node records as JSON to check their structure
print(json.dumps(nodes_d3[:5], indent=2))

[
  {
    "id": "John Strutt (P1904)",
    "nobel": {
      "award": true,
      "category": "P",
      "year": 1904
    },
    "level": 0
  },
  {
    "id": "Joseph Thomson (P1906)",
    "nobel": {
      "award": true,
      "category": "P",
      "year": 1906
    },
    "level": 1
  },
  {
    "id": "Lawrence Bragg (P1915)",
    "nobel": {
      "award": true,
      "category": "P",
      "year": 1915
    },
    "level": 2
  },
  {
    "id": "Charles Barkla (P1917)",
    "nobel": {
      "award": true,
      "category": "P",
      "year": 1917
    },
    "level": 2
  },
  {
    "id": "Niels Bohr (P1922)",
    "nobel": {
      "award": true,
      "category": "P",
      "year": 1922
    },
    "level": 2
  }
]


In [24]:
# Bundle the node and link lists into the single object that gets serialised
d3_data = dict(nodes=nodes_d3, links=links_d3)

In [25]:
# Destination path for the d3 JSON file, relative to the current working directory
d3_file_path = './data/nobel-tree-john-strutt-subgraph.json'

In [26]:
# Serialise d3_data as JSON, overwriting any existing file at d3_file_path
with open(d3_file_path, mode='w') as json_out:
    json.dump(d3_data, fp=json_out)