In [53]:
import json
import re
from collections import deque

import networkx as nx
import pandas as pd

In [54]:
# Path to the Excel workbook that holds the node and edge lists
file_path = './data/kerrifull/kerrifull.xlsx'
# Open the workbook so that its sheet names can be inspected
excel_data = pd.ExcelFile(file_path)

In [55]:
# Check which sheets the workbook contains before loading any of them
sheet_names = excel_data.sheet_names
print(sheet_names)

['nodes', 'edges', 'adjacency']


In [56]:
# Load the 'nodes' and 'edges' sheets.
# header=None: the first row of each sheet is data, not column names.
# The already-open workbook (excel_data) is reused so the file is not
# read from disk again for every sheet.
nodes_excel = pd.read_excel(excel_data, sheet_name='nodes', header=None)
edges_excel = pd.read_excel(excel_data, sheet_name='edges', header=None)

In [57]:
# Preview the first five rows of the raw node list
nodes_excel.head()

Unnamed: 0,0
0,'Gustav Zeuner'
1,'Wilhelm Rontgen (P1901)'
2,'August Kundt'
3,'Victor Regnault'
4,'Julius Weisbach'


In [58]:
# Preview the first five rows of the raw edge list
edges_excel.head()

Unnamed: 0,0,1
0,'Gustav Zeuner','Wilhelm Rontgen (P1901)'
1,'Wilhelm Rontgen (P1901)','Abram Ioffe'
2,'Wilhelm Rontgen (P1901)','Rudolf Ladenburg'
3,'Wilhelm Rontgen (P1901)','Max Wien'
4,'Wilhelm Rontgen (P1901)','David Keys'


In [59]:
# Remove the single quotation marks that wrap each value in nodes_excel and edges_excel.
# Non-string cells (e.g. NaN from a blank cell) are passed through unchanged
# instead of raising AttributeError.
def _strip_quotes(value):
    """Return value without leading/trailing single quotes; non-strings are returned as-is."""
    return value.strip("'") if isinstance(value, str) else value


nodes = nodes_excel.map(_strip_quotes)
edges = edges_excel.map(_strip_quotes)

In [60]:
# Preview the cleaned edge list to confirm the quotation marks are gone
edges.head()

Unnamed: 0,0,1
0,Gustav Zeuner,Wilhelm Rontgen (P1901)
1,Wilhelm Rontgen (P1901),Abram Ioffe
2,Wilhelm Rontgen (P1901),Rudolf Ladenburg
3,Wilhelm Rontgen (P1901),Max Wien
4,Wilhelm Rontgen (P1901),David Keys


---

Determine if there are people listed in the nodes list who are not present in the edges list.

This would imply that there are people who have won a Nobel prize but are not part of any family tree.

In [61]:
# Stack column 0 of the edges DataFrame on top of column 1 and keep only the
# first occurrence of every name, giving a single Series that holds each
# person who appears in at least one edge.

edges_each = pd.concat((edges[0], edges[1]), axis=0).drop_duplicates(keep='first')

In [62]:
# Count how many names in the nodes list do (True) or do not (False) appear in any edge
nodes[0].isin(edges_each).value_counts()

0
True     3563
False      14
Name: count, dtype: int64

In [63]:
# There are 14 people listed in the nodes column, who are not in the edges column
# Boolean mask: True where the node name appears in at least one edge
mask = nodes[0].isin(edges_each)

# Keep only the nodes that do NOT appear in any edge.
# `~mask` inverts the boolean Series in a single vectorised step rather than
# calling a Python lambda once per element.
nodes_not_in_edges = nodes[0][~mask]

print(nodes_not_in_edges)

849              Leo Esaki (P1973)
1139            Jack Kilby (P2000)
1293        Shuji Nakamura (P2014)
1368        Syokuro Manabe (P2021)
2014      Hideki Shirakawa (C2000)
2053         Koichi Tanaka (C2002)
2093          Yves Chauvin (C2005)
2389          Niels Finsen (M1903)
2521         Antonio Moniz (M1949)
2725    Godfrey Hounsfield (M1979)
3031          Robin Warren (M2005)
3032        Barry Marshall (M2005)
3178       Peter Ratcliffe (M2019)
3543       Aleksey Yekimov (C2023)
Name: 0, dtype: object


---

## Create the family tree

In [64]:
# Create an empty directed graph that will hold the academic family tree
academic_tree = nx.DiGraph()

In [65]:
# Add every edge in one bulk call instead of iterating over the DataFrame row by row.
# Column 0 holds the source of each edge and column 1 its target; the nodes
# are created implicitly as their edges are added.
academic_tree.add_edges_from(zip(edges[0], edges[1]))

In [66]:
# Test that the graph was created successfully by listing its first five edges
list(academic_tree.edges())[:5]

[('Gustav Zeuner', 'Wilhelm Rontgen (P1901)'),
 ('Wilhelm Rontgen (P1901)', 'Abram Ioffe'),
 ('Wilhelm Rontgen (P1901)', 'Rudolf Ladenburg'),
 ('Wilhelm Rontgen (P1901)', 'Max Wien'),
 ('Wilhelm Rontgen (P1901)', 'David Keys')]

In [67]:
# Print the number of nodes in the graph (at this point only nodes that appear in an edge)
print(f"There are {len(academic_tree.nodes())} nodes in the graph.")

There are 3563 nodes in the graph.


In [68]:
# Add the names held in the nodes_not_in_edges Series to the academic_tree graph.
# These names appear in no edge, so they were not created when the edges were added.
academic_tree.add_nodes_from(nodes_not_in_edges)

In [69]:
# There should now be 3,563 plus 14 (= 3,577) nodes in the graph
print(f"There are {len(academic_tree.nodes())} nodes in the graph.")

There are 3577 nodes in the graph.


---

## Determine the year and category of Nobel Prizes

In [70]:
# A regular expression to extract the category of Nobel prize and the year.
# A Nobel prize annotation is a name followed by a space and a parenthesised
# code: one of the letters P, C, M or E and then a four-digit year.
#   C + four-digit year -> Chemistry
#   P + four-digit year -> Physics
#   M + four-digit year -> Medicine
#   E + four-digit year -> Economics
# Capture groups: 1 = name, 2 = category letter, 3 = year.
# NOTE(review): the pattern has no trailing `$`, so it does not itself require
# the annotation to sit at the very end of the string -- confirm whether an
# end anchor is wanted.
nobel_pattern = re.compile(r'^(.*)\s\(([PCME])(\d{4})\)')

In [71]:
# Test the regular expression with one annotated name and one plain name.
# Both examples go through the same code path instead of a copy-pasted block.
for test_string in ('Hans von Euler-Chelpin (C1929)', 'Gustav Zeuner'):
    match = nobel_pattern.match(test_string)
    if match:
        print(f"{test_string} has a Nobel prize annotation")
        name = match.group(1)
        category = match.group(2)
        year = int(match.group(3))
        print(f"Name: {name}, Category: {category}, Year: {year}")
    else:
        print(f"{test_string} does not have a Nobel prize annotation")

Hans von Euler-Chelpin (C1929) has a Nobel prize annotation
Name: Hans von Euler-Chelpin, Category: C, Year: 1929
Gustav Zeuner does not have a Nobel prize annotation


In [72]:
# Attach a 'nobel' attribute dict to every node.
# A name that matches nobel_pattern is a laureate: group 2 is the category
# letter and group 3 the four-digit year. Every other node receives the
# placeholder values 'NA' and 0.
for node, attributes in academic_tree.nodes(data=True):
    match = nobel_pattern.match(node)
    if match is None:
        attributes['nobel'] = {'award': False, 'category': 'NA', 'year': 0}
        continue
    attributes['nobel'] = {
        'award': True,
        'category': match.group(2),
        'year': int(match.group(3)),
    }

In [73]:
# Spot check: Wilhelm Rontgen should be flagged as a laureate (category P, year 1901)
academic_tree.nodes['Wilhelm Rontgen (P1901)']

{'nobel': {'award': True, 'category': 'P', 'year': 1901}}

In [74]:
# Test with a node that has no edges, to confirm such nodes are annotated as well
academic_tree.nodes['Aleksey Yekimov (C2023)']

{'nobel': {'award': True, 'category': 'C', 'year': 2023}}

In [75]:
# 'Victor Regnault' carries no annotation, so he should get the non-laureate placeholder values
academic_tree.nodes['Victor Regnault']

{'nobel': {'award': False, 'category': 'NA', 'year': 0}}

In [76]:
# Print the total number of nodes in the family tree, then the number of
# those nodes whose nobel['award'] flag is True
print(f"{len(academic_tree.nodes())}: scientists in the family tree")
print(f"Of whom, {len([node for node in academic_tree.nodes if academic_tree.nodes[node]['nobel']['award']])} have won a Nobel Prize")

3577: scientists in the family tree
Of whom, 736 have won a Nobel Prize


---

## Get the descendants of John Strutt

In [77]:
# Function to get descendants of a node
def get_descendants(graph, root):
    """Collect every node reachable from ``root`` by following successor links.

    The graph is walked breadth-first. The returned set includes ``root``
    itself as well as everything reachable from it.

    Args:
        graph: Object exposing ``successors(node)``, which returns an iterable
            of the direct successors of ``node`` (e.g. a networkx ``DiGraph``).
        root: Node at which the traversal starts.

    Returns:
        set: ``root`` together with all nodes reachable from it.
    """
    descendants = {root}
    queue = deque([root])

    while queue:
        current_node = queue.popleft()  # O(1), whereas list.pop(0) shifts the whole list
        for successor in graph.successors(current_node):
            # Mark on discovery so each node is queued and expanded only once
            if successor not in descendants:
                descendants.add(successor)
                queue.append(successor)

    return descendants

In [78]:
# Get the descendants of 'John Strutt (P1904)'.
# Note: get_descendants adds the starting node to its result, so the set
# also contains John Strutt himself.
jonh_strutt_descendants = get_descendants(academic_tree, 'John Strutt (P1904)')

In [79]:
# Peek at five members of the result (it is a set, so the order shown is arbitrary)
list(jonh_strutt_descendants)[:5]

['Didier Queloz (P2019)',
 'Malcolm Crawford',
 'Horace Watson',
 'Emilio Segre (P1959)',
 'Richard Bersohn']

In [80]:
# Test if 'Joseph Thomson (P1906)' is in the descendant set of 'John Strutt (P1904)'
'Joseph Thomson (P1906)' in jonh_strutt_descendants

True

In [81]:
# Negative check: 'Galileo Galilei' is not expected in the descendant set of 'John Strutt (P1904)'
'Galileo Galilei' in jonh_strutt_descendants

False

In [82]:
# Record on every node whether it belongs to the John Strutt descendant set.
# The membership test already yields the boolean that is stored.
for node, attributes in academic_tree.nodes(data=True):
    attributes['john_strutt_descendants'] = node in jonh_strutt_descendants

In [83]:
# Joseph Thomson (P1906) should be listed as a descendant of John Strutt;
# dump his node attributes as indented JSON to check
print(json.dumps(academic_tree.nodes['Joseph Thomson (P1906)'], indent=2))

{
  "nobel": {
    "award": true,
    "category": "P",
    "year": 1906
  },
  "john_strutt_descendants": true
}


In [84]:
# Galileo Galilei is not a descendant of John Strutt, so the flag should be false
print(json.dumps(academic_tree.nodes['Galileo Galilei'], indent=2))

{
  "nobel": {
    "award": false,
    "category": "NA",
    "year": 0
  },
  "john_strutt_descendants": false
}


In [85]:
# Find all weakly connected components in the graph, i.e. groups of nodes that
# are linked to each other when the direction of the edges is ignored
connected_components = nx.weakly_connected_components(academic_tree)

In [86]:
# Collect the components into a list so they can be indexed and iterated more than once
connected_components_list = list(connected_components)

In [87]:
# Number of weakly connected components
len(connected_components_list)

33

In [88]:
# Analyze the size (number of nodes) of each connected component, in list order
print([len(component) for component in connected_components_list])

[3518, 2, 3, 2, 2, 4, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 6, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [89]:
# Print the smaller connected components (excluding the first entry, which is
# the largest component with 3,518 nodes)
connected_components_list[1:]

[{'Aurel Stodola', 'Gustaf Dalen (P1912)'},
 {'Natalia Sozina', 'Vladimir Tuchkevic', 'Zhores Alferov (P2000)'},
 {'Hiroshi Amano (P2014)', 'Isamu Nakasaki (P2014)'},
 {'Jens Skou (C1997)', 'Soren Orskov'},
 {'Archibald Preece',
  'Dan Shechtman (C2011)',
  'David Brandon',
  'Jack Nutting'},
 {'Patrick Manson', 'Ronald Ross (M1902)'},
 {'Frederick Banting (M1923)', 'John MacLeod (M1923)'},
 {'Henry Christian', 'William Murphy (M1934)'},
 {'Alexander Fleming (M1945)', 'Almroth Wright'},
 {'Andre Cournand (M1956)', 'Dickinson Richards'},
 {'George Hitchings', 'Gertrude Elion (M1988)'},
 {'James Fairbairn', 'Youyou Tu (M2015)', 'Zhicen Lou'},
 {'James Chesterton', 'Michael Houghton (M2020)'},
 {'Ragnar Frisch (E1969)', 'Trygve Haavelmo (E1989)'},
 {'Gerard Debreu (E1983)', 'Maurice Allais (E1988)'},
 {'Christopher Pissarides (E2010)',
  'Franklin Giddings',
  'Gabriel Tarde',
  'Michio Morishima',
  'Shotaro Yoneda',
  'Yasuma Takada'},
 {'M.M. Bousquet', 'Pierre Agostini (P2023)'},
 {'A

In [90]:
# Select the largest component by size rather than by position, so the result
# does not depend on the order in which the components happen to be listed.
largest_component = max(connected_components_list, key=len)

In [91]:
# Number of nodes in the largest component
len(largest_component)

3518

In [92]:
# Find isolated nodes (nodes that have no edges at all) in the academic_tree
isolated_nodes = list(nx.isolates(academic_tree))

In [93]:
# Report the isolated nodes, or state that there are none (an empty list is falsy)
if isolated_nodes:
    print(f"There are {len(isolated_nodes)} isolated nodes: {isolated_nodes}")
else:
    print("There are no isolated nodes in the academic_tree.")

There are 14 isolated nodes: ['Leo Esaki (P1973)', 'Jack Kilby (P2000)', 'Shuji Nakamura (P2014)', 'Syokuro Manabe (P2021)', 'Hideki Shirakawa (C2000)', 'Koichi Tanaka (C2002)', 'Yves Chauvin (C2005)', 'Niels Finsen (M1903)', 'Antonio Moniz (M1949)', 'Godfrey Hounsfield (M1979)', 'Robin Warren (M2005)', 'Barry Marshall (M2005)', 'Peter Ratcliffe (M2019)', 'Aleksey Yekimov (C2023)']


In [94]:
# 'Aurel Stodola' belongs to one of the small components, so he is not expected in the largest one
'Aurel Stodola' in largest_component

False

In [95]:
# Check whether 'Galileo Galilei' is part of the largest component
'Galileo Galilei' in largest_component

True

In [96]:
# Check whether 'John Strutt (P1904)' is part of the largest component
'John Strutt (P1904)' in largest_component

True

---

## Add an annotation to show if a node is in the Main Family tree

In [97]:
# Record on every node of academic_tree whether it belongs to the largest
# connected component. The membership test already yields the stored boolean.
for node, attributes in academic_tree.nodes(data=True):
    attributes['main_family'] = node in largest_component

In [98]:
# Joseph Thomson won the Physics Nobel Prize in 1906.
# He is in the main family (the largest connected component).
# He is a descendant of John Strutt.
print(json.dumps(academic_tree.nodes['Joseph Thomson (P1906)'], indent=2))

{
  "nobel": {
    "award": true,
    "category": "P",
    "year": 1906
  },
  "john_strutt_descendants": true,
  "main_family": true
}


In [99]:
# Jens Skou (C1997) sits in one of the small components: a laureate, but
# outside the main family and not a descendant of John Strutt
print(json.dumps(academic_tree.nodes['Jens Skou (C1997)'], indent=2))

{
  "nobel": {
    "award": true,
    "category": "C",
    "year": 1997
  },
  "john_strutt_descendants": false,
  "main_family": false
}


---

## Export the graph to JSON for D3

In [106]:
# Build one {"nodes": [...], "links": [...]} record per connected component.
# The records are collected in `output`, in the same order as
# connected_components_list.
# The per-component node list is named `component_nodes` so that the `nodes`
# DataFrame created earlier in the notebook is not overwritten; overwriting it
# would break any later re-run of the cells that still read that DataFrame.
output = []

for component in connected_components_list:
    # View of academic_tree restricted to this component's nodes and the edges between them
    subgraph = academic_tree.subgraph(component)
    component_nodes = [
        {
            "id": person,
            "nobel": attributes.get('nobel', {}),
            "john_strutt_descendants": attributes.get('john_strutt_descendants', False),
            "main_family": attributes.get('main_family', False),
        }
        for person, attributes in subgraph.nodes(data=True)
    ]
    links = [{"source": source, "target": target} for source, target in subgraph.edges]
    data = {"nodes": component_nodes, "links": links}
    output.append(data)

In [107]:
# Inspect the node records of the second component as indented JSON
print(json.dumps(output[1]['nodes'], indent=2))

[
  {
    "id": "Gustaf Dalen (P1912)",
    "nobel": {
      "award": true,
      "category": "P",
      "year": 1912
    },
    "john_strutt_descendants": false,
    "main_family": false
  },
  {
    "id": "Aurel Stodola",
    "nobel": {
      "award": false,
      "category": "NA",
      "year": 0
    },
    "john_strutt_descendants": false,
    "main_family": false
  }
]


In [108]:
# Inspect the link records of the same component as indented JSON
print(json.dumps(output[1]['links'], indent=2))

[
  {
    "source": "Aurel Stodola",
    "target": "Gustaf Dalen (P1912)"
  }
]


In [109]:
# Path of the JSON file that the export is written to
d3_file_path = './data/nobel-tree-full.json'

In [110]:
# Write all component records to a single JSON file.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open(d3_file_path, 'w', encoding='utf-8') as file:
    json.dump(output, file)