In [1]:
import pandas as pd
import re
import json
import networkx as nx

In [3]:
file_path = './data/kerrifull/kerrifull.xlsx'
excel_data = pd.ExcelFile(file_path)

In [4]:
# Check the sheet names
sheet_names = excel_data.sheet_names
print(sheet_names)

['nodes', 'edges', 'adjacency']


In [5]:
# Load the 'nodes' and 'edges' sheets
nodes_excel = pd.read_excel(file_path, sheet_name='nodes', header=None)
edges_excel = pd.read_excel(file_path, sheet_name='edges', header=None)

In [6]:
nodes_excel[:5]

Unnamed: 0,0
0,'Gustav Zeuner'
1,'Wilhelm Rontgen (P1901)'
2,'August Kundt'
3,'Victor Regnault'
4,'Julius Weisbach'


In [7]:
edges_excel[:5]

Unnamed: 0,0,1
0,'Gustav Zeuner','Wilhelm Rontgen (P1901)'
1,'Wilhelm Rontgen (P1901)','Abram Ioffe'
2,'Wilhelm Rontgen (P1901)','Rudolf Ladenburg'
3,'Wilhelm Rontgen (P1901)','Max Wien'
4,'Wilhelm Rontgen (P1901)','David Keys'


In [8]:
# For each value in edges_excel remove the speach marks around the value
edges = edges_excel.map(lambda x: x.strip('\''))

In [9]:
edges.head()

Unnamed: 0,0,1
0,Gustav Zeuner,Wilhelm Rontgen (P1901)
1,Wilhelm Rontgen (P1901),Abram Ioffe
2,Wilhelm Rontgen (P1901),Rudolf Ladenburg
3,Wilhelm Rontgen (P1901),Max Wien
4,Wilhelm Rontgen (P1901),David Keys


In [10]:
# Create a directed graph
academic_tree = nx.DiGraph()

In [11]:
for index, row in edges.iterrows():
    source = row[0]
    target = row[1]
    academic_tree.add_edge(source, target)

In [12]:
# Test that the graph was created successfully
list(academic_tree.edges())[:5]

[('Gustav Zeuner', 'Wilhelm Rontgen (P1901)'),
 ('Wilhelm Rontgen (P1901)', 'Abram Ioffe'),
 ('Wilhelm Rontgen (P1901)', 'Rudolf Ladenburg'),
 ('Wilhelm Rontgen (P1901)', 'Max Wien'),
 ('Wilhelm Rontgen (P1901)', 'David Keys')]

In [13]:
# A regular expression to extract the category of nobel prize and the year
# A Nobel prize annotation is indicated by:
# lines that end with parenthesis, followed by either P, C, M or E and then a four digit year.
# If the the string ends with a parenthesis containing C followed by a four digit year, the category is Chemistry
# If the the string ends with a parenthesis containing P followed by a four digit year, the category is Physics
# If the the string ends with a parenthesis containing M followed by a four digit year, the category is Medicine
# If the the string ends with a parenthesis containing E followed by a four digit year, the category is Economics
nobel_pattern = re.compile(r'^(.*)\s\(([PCME])(\d{4})\)')

In [14]:
# Test the regular expression with some examples
test_string = 'Hans von Euler-Chelpin (C1929)'
match = nobel_pattern.match(test_string)
if match:
    print(f"{test_string} has a Nobel prize annotation")
    name = match.group(1)
    category = match.group(2)
    year = int(match.group(3))
    print(f"Name: {name}, Category: {category}, Year: {year}")
else:
    print(f"{test_string} does not have a Nobel prize annotation")

test_string = 'Gustav Zeuner'
match = nobel_pattern.match(test_string)
if match:
    print(f"{test_string} has a Nobel prize annotation")
    name = match.group(1)
    category = match.group(2)
    year = int(match.group(3))
    print(f"Name: {name}, Category: {category}, Year: {year}")
else:
    print(f"{test_string} does not have a Nobel prize annotation")

Hans von Euler-Chelpin (C1929) has a Nobel prize annotation
Name: Hans von Euler-Chelpin, Category: C, Year: 1929
Gustav Zeuner does not have a Nobel prize annotation


In [15]:
# Add Nobel Prize attribute
for node in academic_tree.nodes:
    academic_tree.nodes[node]['nobel'] = {}
    # If the name matches the Nobel pattern, add the Nobel attribute
    match = nobel_pattern.match(node)
    if match:
        academic_tree.nodes[node]['nobel']['award'] = True
        academic_tree.nodes[node]['nobel']['category'] = match.group(2)
        academic_tree.nodes[node]['nobel']['year'] = int(match.group(3))
    else:
        academic_tree.nodes[node]['nobel']['award'] = False
        academic_tree.nodes[node]['nobel']['category'] = 'NA'
        academic_tree.nodes[node]['nobel']['year'] = 0

In [16]:
# Wilhelm Rontgen is correctly listed as a laureate
academic_tree.nodes['Wilhelm Rontgen (P1901)']

{'nobel': {'award': True, 'category': 'P', 'year': 1901}}

In [17]:
# 'Victor Regnault' is not a laureate
academic_tree.nodes['Victor Regnault']

{'nobel': {'award': False, 'category': 'NA', 'year': 0}}

In [18]:
# Print the number of nodes in the family tree where nobel.award is True
print(f"{len(academic_tree.nodes())}: scientists in the family tree")
print(f"Of whom, {len([node for node in academic_tree.nodes if academic_tree.nodes[node]['nobel']['award']])} have won a Nobel Prize")


3563: scientists in the family tree
Of whom, 722 have won a Nobel Prize


In [19]:
# Function to get descendants of a node
def get_descendants(graph, root):
    descendants = set()
    queue = [root]

    while queue:
        current_node = queue.pop(0)
        descendants.add(current_node)
        for successor in graph.successors(current_node):
            if successor not in descendants:
                queue.append(successor)
                
    return descendants

In [20]:
# Get descendants of the 'John Strutt (P1904)'
jonh_strutt_descendants = get_descendants(academic_tree, 'John Strutt (P1904)')

In [21]:
list(jonh_strutt_descendants)[:5]

['John Gunn',
 'Stylianos Antonarakis',
 'Eric Cornell (P2001)',
 'Joachim Frank (C2017)',
 'Francois Englert (P2013)']

In [22]:
# Test if 'Joseph Thomson (P1906)' is a descendant of 'John Strutt (P1904)'
'Joseph Thomson (P1906)' in jonh_strutt_descendants

True

In [23]:
'Galileo Galilei' in jonh_strutt_descendants

False

In [24]:
# Add john_strutt_descendants to the graph
for node in academic_tree.nodes:
    if node in jonh_strutt_descendants:
        academic_tree.nodes[node]['john_strutt_descendants'] = True
    else:
        academic_tree.nodes[node]['john_strutt_descendants'] = False

In [25]:
# Joseph Thomson (P1906) is correctly listed as a descendant of John Strutt
academic_tree.nodes['Joseph Thomson (P1906)']

{'nobel': {'award': True, 'category': 'P', 'year': 1906},
 'john_strutt_descendants': True}

In [26]:
# Galileo Galilei is not a descendant of John Strutt
academic_tree.nodes['Galileo Galilei']

{'nobel': {'award': False, 'category': 'NA', 'year': 0},
 'john_strutt_descendants': False}

In [27]:
# Find all connected components (weakly connected) in the graph
connected_components = nx.weakly_connected_components(academic_tree)

In [28]:
connected_components_list = list(connected_components)

In [29]:
len(connected_components_list)

19

In [30]:
# Analyze the size of each connected component
[len(component) for component in connected_components_list]

[3518, 2, 3, 2, 2, 4, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 6, 2, 3]

In [31]:
# Print the smaller connected components (Excluding the largest component with 3,476 entries)
connected_components_list[1:]

[{'Aurel Stodola', 'Gustaf Dalen (P1912)'},
 {'Natalia Sozina', 'Vladimir Tuchkevic', 'Zhores Alferov (P2000)'},
 {'Hiroshi Amano (P2014)', 'Isamu Nakasaki (P2014)'},
 {'Jens Skou (C1997)', 'Soren Orskov'},
 {'Archibald Preece',
  'Dan Shechtman (C2011)',
  'David Brandon',
  'Jack Nutting'},
 {'Patrick Manson', 'Ronald Ross (M1902)'},
 {'Frederick Banting (M1923)', 'John MacLeod (M1923)'},
 {'Henry Christian', 'William Murphy (M1934)'},
 {'Alexander Fleming (M1945)', 'Almroth Wright'},
 {'Andre Cournand (M1956)', 'Dickinson Richards'},
 {'George Hitchings', 'Gertrude Elion (M1988)'},
 {'James Fairbairn', 'Youyou Tu (M2015)', 'Zhicen Lou'},
 {'James Chesterton', 'Michael Houghton (M2020)'},
 {'Ragnar Frisch (E1969)', 'Trygve Haavelmo (E1989)'},
 {'Gerard Debreu (E1983)', 'Maurice Allais (E1988)'},
 {'Christopher Pissarides (E2010)',
  'Franklin Giddings',
  'Gabriel Tarde',
  'Michio Morishima',
  'Shotaro Yoneda',
  'Yasuma Takada'},
 {'M.M. Bousquet', 'Pierre Agostini (P2023)'},
 {'A

In [32]:
largest_component = connected_components_list[0]

In [33]:
len(largest_component)

3518

In [34]:
'Aurel Stodola' in largest_component

False

In [35]:
'Galileo Galilei' in largest_component

True

In [76]:
'John Strutt (P1904)' in largest_component

True

In [77]:
# Add annotation to nodes in academic_tree to indicate if they are in the largest connected component
for node in academic_tree.nodes:
    if node in largest_component:
        academic_tree.nodes[node]['main_family'] = True
    else:
        academic_tree.nodes[node]['main_family'] = False

In [80]:
# Joseph Thomson won the Physics Nobel Prize in 1906
# He is in the main family
# He is a descendant of John Strutt
academic_tree.nodes['Joseph Thomson (P1906)']

{'nobel': {'award': True, 'category': 'P', 'year': 1906},
 'john_strutt_descendants': True,
 'main_family': True}

In [81]:
# 'Jack Nutting' is not in the main family
# They are not a descendant of John Strutt
# They are not a Nobel laureate
academic_tree.nodes['Jack Nutting']


{'nobel': {'award': False, 'category': 'NA', 'year': 0},
 'john_strutt_descendants': False,
 'main_family': False}

In [82]:
# Extract nodes and links to print the graph in D3.js
nodes = [
    {
        "id": person, "nobel": academic_tree.nodes[person].get('nobel', {}), 
        "john_strutt_descendants": academic_tree.nodes[person].get('john_strutt_descendants', False),
        "main_family": academic_tree.nodes[person].get('main_family', False),
    } for person in academic_tree.nodes]
links = [{"source": source, "target": target} for source, target in academic_tree.edges]

In [83]:
nodes[:5]

[{'id': 'Gustav Zeuner',
  'nobel': {'award': False, 'category': 'NA', 'year': 0},
  'john_strutt_descendants': False,
  'main_family': True},
 {'id': 'Wilhelm Rontgen (P1901)',
  'nobel': {'award': True, 'category': 'P', 'year': 1901},
  'john_strutt_descendants': False,
  'main_family': True},
 {'id': 'August Kundt',
  'nobel': {'award': False, 'category': 'NA', 'year': 0},
  'john_strutt_descendants': False,
  'main_family': True},
 {'id': 'Victor Regnault',
  'nobel': {'award': False, 'category': 'NA', 'year': 0},
  'john_strutt_descendants': False,
  'main_family': True},
 {'id': 'Julius Weisbach',
  'nobel': {'award': False, 'category': 'NA', 'year': 0},
  'john_strutt_descendants': False,
  'main_family': True}]

In [84]:
links[:5]

[{'source': 'Gustav Zeuner', 'target': 'Wilhelm Rontgen (P1901)'},
 {'source': 'Wilhelm Rontgen (P1901)', 'target': 'Abram Ioffe'},
 {'source': 'Wilhelm Rontgen (P1901)', 'target': 'Rudolf Ladenburg'},
 {'source': 'Wilhelm Rontgen (P1901)', 'target': 'Max Wien'},
 {'source': 'Wilhelm Rontgen (P1901)', 'target': 'David Keys'}]

In [85]:
# Create the final dictionary
d3_data = {
    "nodes": nodes,
    "links": links
}

In [86]:
# Save the data to a JSON file
d3_file_path = './data/nobel-tree-full.json'

In [87]:
with open(d3_file_path, 'w') as file:
    json.dump(d3_data, file)