In [1]:
import json
import re
from collections import deque

import networkx as nx

In [2]:
# Read the content of the MATLAB file
file_path = './data/NobelTree.m'

In [3]:
# Load the whole MATLAB script into memory as a single string
with open(file_path, 'r') as matlab_file:
    matlab_script = matlab_file.read()

In [4]:
# Regular expressions to find the successors (mentees) and predecessors (mentors)
# An assignment such as `s0 = {...};` lists pupils / successors
# An assignment such as `p0 = {...};` lists teachers / predecessors
# The leading `\b` makes sure the variable name is exactly `s<digits>` / `p<digits>`;
# without it any identifier that merely ends in `s` or `p` (e.g. `names = {...};`)
# would be captured as well and shift the pupil/teacher pairing.
# Group 1 captures the text between the braces.
pupil_pattern = re.compile(r'\bs\d*\s*=\s*\{([^\}]+)\};')
teacher_pattern = re.compile(r'\bp\d*\s*=\s*\{([^\}]+)\};')

Line 5470

Change: p16 = {'Sophus Jorgensen'};

To: s16 = {'Sophus Jorgensen'};

---

Line 9956

Change: s = {'Robin Warren (M2005)' 'Barry Marshall (M2005)'};

To: % = {'Robin Warren (M2005)' 'Barry Marshall (M2005)'};

Reason: this `s` line has no corresponding `p` line.

In [5]:
# Collect the brace contents of every pupil (`s`) group and every teacher (`p`) group,
# each in the order in which the groups appear in the script
pupils, teachers = (
    pattern.findall(matlab_script)
    for pattern in (pupil_pattern, teacher_pattern)
)

In [6]:
# Spot test: look up a known pupil group and show the teacher group stored
# at the same position to confirm the two lists line up
rontgen_group = "'Wilhelm Rontgen (P1901)' 'Wilhelm Rontgen (P1901)'"
test_index = pupils.index(rontgen_group)
print(f"Pupil: {pupils[test_index]}")
print(f"Teacher: {teachers[test_index]}")

Pupil: 'Wilhelm Rontgen (P1901)' 'Wilhelm Rontgen (P1901)'
Teacher: 'Gustav Zeuner' 'August Kundt'


In this instance, **Wilhelm Rontgen** is the _pupil_ of **Gustav Zeuner** and **August Kundt**.

This is confirmed by [Wilhelm Röntgen's Wikipedia page](https://en.wikipedia.org/wiki/Wilhelm_R%C3%B6ntgen):

> he became a favourite student of Professor August Kundt...

In [7]:
# Initialize an empty list to hold the relationships
relationship_list = []

In [8]:
# Extract the relationships from the matches.
# The n-th `s` group is paired with the n-th `p` group, and inside a pair the
# i-th pupil is paired with the i-th teacher. `zip` silently drops the surplus
# of the longer side, so both levels are checked for equal length first.
if len(pupils) != len(teachers):
    raise ValueError(
        f"Found {len(pupils)} pupil groups but {len(teachers)} teacher groups; "
        "the s/p lines no longer pair up"
    )

# Rebuild the list from scratch so that re-running this cell does not append duplicates
relationship_list = []
for group_number, (pupil_string, teacher_string) in enumerate(zip(pupils, teachers)):
    # Names are the single-quoted items inside each group
    pupil_list = re.findall(r"'(.*?)'", pupil_string)
    teacher_list = re.findall(r"'(.*?)'", teacher_string)
    if len(pupil_list) != len(teacher_list):
        raise ValueError(
            f"Group {group_number}: {len(pupil_list)} pupils but "
            f"{len(teacher_list)} teachers ({pupil_string!r} / {teacher_string!r})"
        )
    for pupil, teacher in zip(pupil_list, teacher_list):
        relationship_list.append({'teacher': teacher, 'pupil': pupil})

In [9]:
relationship_list[:4]

[{'teacher': 'Gustav Zeuner', 'pupil': 'Wilhelm Rontgen (P1901)'},
 {'teacher': 'August Kundt', 'pupil': 'Wilhelm Rontgen (P1901)'},
 {'teacher': 'Victor Regnault', 'pupil': 'Gustav Zeuner'},
 {'teacher': 'Julius Weisbach', 'pupil': 'Gustav Zeuner'}]

In [10]:
# Spot test to check if the conversion was successful
# s25 = {'Claude Hudson' 'Claude Hudson' 'Claude Hudson' 'Claude Hudson'};
# p25 = {'Elmer Loomis' 'Walther Nernst (C1920)' 'Jacobus van t Hoff (C1901)' 'William Magie'};

[relationship for relationship in relationship_list if relationship['pupil'] == 'Claude Hudson']

[{'teacher': 'Elmer Loomis', 'pupil': 'Claude Hudson'},
 {'teacher': 'Walther Nernst (C1920)', 'pupil': 'Claude Hudson'},
 {'teacher': 'Jacobus van t Hoff (C1901)', 'pupil': 'Claude Hudson'},
 {'teacher': 'William Magie', 'pupil': 'Claude Hudson'}]

In [11]:
# Check that each entry in the relationship list contains both a teacher and a pupil;
# any entry missing one of the two keys is printed
for entry in relationship_list:
    if not ('teacher' in entry and 'pupil' in entry):
        print(entry)

In [12]:
# Create a directed graph
academic_tree = nx.DiGraph()

In [13]:
# Add one directed edge per relationship, pointing from teacher to pupil
for link in relationship_list:
    academic_tree.add_edge(link['teacher'], link['pupil'])

In [14]:
# Test that the graph was created successfully
list(academic_tree.edges())[:5]

[('Gustav Zeuner', 'Wilhelm Rontgen (P1901)'),
 ('Wilhelm Rontgen (P1901)', 'Abram Ioffe'),
 ('Wilhelm Rontgen (P1901)', 'Rudolf Ladenburg'),
 ('Wilhelm Rontgen (P1901)', 'Max Wien'),
 ('Wilhelm Rontgen (P1901)', 'David Keys')]

In [15]:
# A regular expression to extract the name, the Nobel prize category and the year.
# A Nobel prize annotation is a space followed by a parenthesised category letter
# (one of P, C, M or E) and a four-digit year, e.g. `Hans von Euler-Chelpin (C1929)`.
#   group(1): everything before the annotation (the name)
#   group(2): the category letter
#       C -> Chemistry
#       P -> Physics
#       M -> Medicine
#       E -> Economics
#   group(3): the four-digit year
# NOTE: the pattern is anchored at the start (`^`) but has no `$`, so text following
# the annotation is not rejected; because `(.*)` is greedy, the last annotation in
# the string is the one that is captured.
nobel_pattern = re.compile(r'^(.*)\s\(([PCME])(\d{4})\)')

In [16]:
# Test the regular expression with one annotated and one plain name
for test_string in ('Hans von Euler-Chelpin (C1929)', 'Gustav Zeuner'):
    match = nobel_pattern.match(test_string)
    if not match:
        print(f"{test_string} does not have a Nobel prize annotation")
        continue
    print(f"{test_string} has a Nobel prize annotation")
    name, category = match.group(1), match.group(2)
    year = int(match.group(3))
    print(f"Name: {name}, Category: {category}, Year: {year}")

Hans von Euler-Chelpin (C1929) has a Nobel prize annotation
Name: Hans von Euler-Chelpin, Category: C, Year: 1929
Gustav Zeuner does not have a Nobel prize annotation


In [17]:
# Attach a `nobel` attribute to every node.
# Names carrying a Nobel annotation get the parsed category and year;
# all other names get the placeholder values 'NA' and 0.
for node in academic_tree.nodes:
    match = nobel_pattern.match(node)
    if match:
        nobel_info = {
            'award': True,
            'category': match.group(2),
            'year': int(match.group(3)),
        }
    else:
        nobel_info = {'award': False, 'category': 'NA', 'year': 0}
    academic_tree.nodes[node]['nobel'] = nobel_info

In [18]:
# Wilhelm Rontgen is correctly listed as a laureate
academic_tree.nodes['Wilhelm Rontgen (P1901)']

{'nobel': {'award': True, 'category': 'P', 'year': 1901}}

In [19]:
# 'Victor Regnault' is not a laureate
academic_tree.nodes['Victor Regnault']

{'nobel': {'award': False, 'category': 'NA', 'year': 0}}

In [68]:
# Report the size of the family tree and how many of its nodes have nobel.award set to True
laureates = [node for node in academic_tree.nodes if academic_tree.nodes[node]['nobel']['award']]
print(f"{len(academic_tree.nodes())}: scientists in the family tree")
print(f"Of whom, {len(laureates)} have won a Nobel Prize")


3516: scientists in the family tree
Of whom, 716 have won a Nobel Prize


In [20]:
# Function to get descendants of a node
def get_descendants(graph, root):
    """Return `root` together with every node reachable from it.

    Performs a breadth-first traversal that follows `graph.successors`.

    Args:
        graph: Object exposing `successors(node)`, which yields the direct
            successors of `node` (in this notebook a `networkx.DiGraph`).
        root: Node at which the traversal starts.

    Returns:
        set: `root` itself plus all nodes reachable from it along successor edges.
    """
    # Nodes are marked as seen when they are enqueued rather than when they are
    # dequeued, so a node reachable via several paths is expanded only once.
    descendants = {root}
    queue = deque([root])

    while queue:
        current_node = queue.popleft()
        for successor in graph.successors(current_node):
            if successor not in descendants:
                descendants.add(successor)
                queue.append(successor)

    return descendants

In [29]:
# Get the descendants of 'John Strutt (P1904)'.
# The returned set also contains 'John Strutt (P1904)' itself, because
# get_descendants adds the root node to its result.
# NOTE: the variable name is spelled `jonh_...`; later cells refer to it by that spelling.
jonh_strutt_descendants = get_descendants(academic_tree, 'John Strutt (P1904)')

In [30]:
list(jonh_strutt_descendants)[:5]

['Douglas Hartree',
 'Ruby Payne-Scott',
 'Chen-Ning Yang (P1957)',
 'Wolfgang Pauli (P1945)',
 'Martin Kamen']

In [33]:
# Test if 'Joseph Thomson (P1906)' is a descendant of 'John Strutt (P1904)'
'Joseph Thomson (P1906)' in jonh_strutt_descendants

True

In [34]:
'Galileo Galilei' in jonh_strutt_descendants

False

In [35]:
# Flag every node with whether it belongs to the John Strutt descendant set
for node in academic_tree.nodes:
    is_descendant = node in jonh_strutt_descendants
    academic_tree.nodes[node]['john_strutt_descendants'] = is_descendant

In [37]:
# Joseph Thomson (P1906) is correctly listed as a descendant of John Strutt
academic_tree.nodes['Joseph Thomson (P1906)']

{'nobel': {'award': True, 'category': 'P', 'year': 1906},
 'john_strutt_descendants': True}

In [38]:
# Galileo Galilei is not a descendant of John Strutt
academic_tree.nodes['Galileo Galilei']

{'nobel': {'award': False, 'category': 'NA', 'year': 0},
 'john_strutt_descendants': False}

In [52]:
# Find all connected components (weakly connected) in the graph
connected_components = nx.weakly_connected_components(academic_tree)

In [53]:
connected_components_list = list(connected_components)

In [57]:
len(connected_components_list)

17

In [59]:
# Analyze the size of each connected component
[len(component) for component in connected_components_list]

[3476, 2, 3, 2, 2, 4, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 6]

In [64]:
# Print the smaller connected components (Excluding the largest component with 3,476 entries)
connected_components_list[1:]

[{'Aurel Stodola', 'Gustaf Dalen (P1912)'},
 {'Natalia Sozina', 'Vladimir Tuchkevic', 'Zhores Alferov (P2000)'},
 {'Hiroshi Amano (P2014)', 'Isamu Nakasaki (P2014)'},
 {'Jens Skou (C1997)', 'Soren Orskov'},
 {'Archibald Preece',
  'Dan Shechtman (C2011)',
  'David Brandon',
  'Jack Nutting'},
 {'Patrick Manson', 'Ronald Ross (M1902)'},
 {'Frederick Banting (M1923)', 'John MacLeod (M1923)'},
 {'Henry Christian', 'William Murphy (M1934)'},
 {'Alexander Fleming (M1945)', 'Almroth Wright'},
 {'Andre Cournand (M1956)', 'Dickinson Richards'},
 {'George Hitchings', 'Gertrude Elion (M1988)'},
 {'James Fairbairn', 'Youyou Tu (M2015)', 'Zhicen Lou'},
 {'James Chesterton', 'Michael Houghton (M2020)'},
 {'Gerard Debreu (E1983)', 'Maurice Allais (E1988)'},
 {'Ragnar Frisch (E1969)', 'Trygve Haavelmo (E1989)'},
 {'Christopher Pissarides (E2010)',
  'Franklin Giddings',
  'Gabriel Tarde',
  'Michio Morishima',
  'Shotaro Yoneda',
  'Yasuma Takada'}]

In [72]:
# Select the largest component by size rather than by its position in the list,
# because the order in which components are produced is not guaranteed to be by size
largest_component = max(connected_components_list, key=len)

In [73]:
len(largest_component)

3476

In [74]:
'Aurel Stodola' in largest_component

False

In [75]:
'Galileo Galilei' in largest_component

True

In [76]:
'John Strutt (P1904)' in largest_component

True

In [77]:
# Annotate each node in academic_tree with whether it lies in the largest connected component
for node in academic_tree.nodes:
    in_main_family = node in largest_component
    academic_tree.nodes[node]['main_family'] = in_main_family

In [80]:
# Joseph Thomson won the Physics Nobel Prize in 1906
# He is in the main family
# He is a descendant of John Strutt
academic_tree.nodes['Joseph Thomson (P1906)']

{'nobel': {'award': True, 'category': 'P', 'year': 1906},
 'john_strutt_descendants': True,
 'main_family': True}

In [81]:
# 'Jack Nutting' is not in the main family
# They are not a descendant of John Strutt
# They are not a Nobel laureate
academic_tree.nodes['Jack Nutting']


{'nobel': {'award': False, 'category': 'NA', 'year': 0},
 'john_strutt_descendants': False,
 'main_family': False}

In [82]:
# Extract nodes and links in the shape D3.js expects:
# one record per person with its annotations, and one record per teacher -> pupil edge
nodes = []
for person in academic_tree.nodes:
    attributes = academic_tree.nodes[person]
    nodes.append({
        "id": person,
        "nobel": attributes.get('nobel', {}),
        "john_strutt_descendants": attributes.get('john_strutt_descendants', False),
        "main_family": attributes.get('main_family', False),
    })

links = []
for source, target in academic_tree.edges:
    links.append({"source": source, "target": target})

In [83]:
nodes[:5]

[{'id': 'Gustav Zeuner',
  'nobel': {'award': False, 'category': 'NA', 'year': 0},
  'john_strutt_descendants': False,
  'main_family': True},
 {'id': 'Wilhelm Rontgen (P1901)',
  'nobel': {'award': True, 'category': 'P', 'year': 1901},
  'john_strutt_descendants': False,
  'main_family': True},
 {'id': 'August Kundt',
  'nobel': {'award': False, 'category': 'NA', 'year': 0},
  'john_strutt_descendants': False,
  'main_family': True},
 {'id': 'Victor Regnault',
  'nobel': {'award': False, 'category': 'NA', 'year': 0},
  'john_strutt_descendants': False,
  'main_family': True},
 {'id': 'Julius Weisbach',
  'nobel': {'award': False, 'category': 'NA', 'year': 0},
  'john_strutt_descendants': False,
  'main_family': True}]

In [84]:
links[:5]

[{'source': 'Gustav Zeuner', 'target': 'Wilhelm Rontgen (P1901)'},
 {'source': 'Wilhelm Rontgen (P1901)', 'target': 'Abram Ioffe'},
 {'source': 'Wilhelm Rontgen (P1901)', 'target': 'Rudolf Ladenburg'},
 {'source': 'Wilhelm Rontgen (P1901)', 'target': 'Max Wien'},
 {'source': 'Wilhelm Rontgen (P1901)', 'target': 'David Keys'}]

In [85]:
# Bundle nodes and links into the single dictionary that is written to disk
d3_data = dict(nodes=nodes, links=links)

In [86]:
# Save the data to a JSON file
d3_file_path = './data/nobel-tree-full.json'

In [87]:
# Serialise the graph data and write it to the output file
with open(d3_file_path, 'w') as json_file:
    json_file.write(json.dumps(d3_data))