In [3]:
import re
import json

In [4]:
# Location of the MATLAB script to parse (relative to the current working directory)
file_path = './data/NobelTree.m'

In [5]:
# Read the whole MATLAB script into memory as a single string.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used — confirm it matches the encoding of the .m file.
with open(file_path, 'r') as file:
    matlab_script = file.read()

In [6]:
# Regular expressions to find the successors (mentees) and predecessors (mentors)
# An assignment of the form `s0 = {...};` lists pupils / successors
# An assignment of the form `p0 = {...};` lists teachers / predecessors
# Each pattern captures only the text between the braces, i.e. the raw,
# still-quoted list of names.
# NOTE(review): neither pattern is anchored to the start of a line, so any
# identifier that merely ends in `s<digits>` or `p<digits>` would match too —
# confirm the MATLAB file contains no such assignments.
pupil_pattern = re.compile(r's\d+\s*=\s*\{([^\}]+)\};')
teacher_pattern = re.compile(r'p\d+\s*=\s*\{([^\}]+)\};')

In [7]:
# Collect the brace contents of every pupil and teacher assignment,
# in the order they occur in the script
pupils = [match.group(1) for match in pupil_pattern.finditer(matlab_script)]
teachers = [match.group(1) for match in teacher_pattern.finditer(matlab_script)]

In [8]:
# Sanity check: show the first pupil group next to the first teacher group
# (each is the raw, still-quoted string captured from between the braces)
print(f"Pupil: {pupils[0]}")
print(f"Teacher: {teachers[0]}")

Pupil: 'Wilhelm Rontgen (P1901)' 'Wilhelm Rontgen (P1901)'
Teacher: 'Gustav Zeuner' 'August Kundt'


In this instance, **Wilhelm Rontgen** is the _pupil_ of **Gustav Zeuner** and **August Kundt**.

This is confirmed by [Wilhelm Röntgen's Wikipedia page](https://en.wikipedia.org/wiki/Wilhelm_R%C3%B6ntgen):

> he became a favourite student of Professor August Kundt...

In [15]:
# Initialize an empty list to hold the relationships
# (one entry per teacher/pupil pair)
relationship_list = []

In [16]:
# Extract the relationships from the matches.
#
# The list is rebuilt from scratch inside this cell so that re-running the
# cell does not append a second copy of every relationship.
relationship_list = []

# The n-th pupil group is paired with the n-th teacher group. Within a group,
# names are paired by position. zip() stops at the shorter input, so any
# unmatched trailing group or name is ignored.
for pupil_string, teacher_string in zip(pupils, teachers):
    pupil_names = re.findall(r"'(.*?)'", pupil_string)
    teacher_names = re.findall(r"'(.*?)'", teacher_string)
    relationship_list.extend(
        {'teacher': teacher, 'pupil': pupil}
        for pupil, teacher in zip(pupil_names, teacher_names)
    )

In [19]:
# Preview the first four relationship records
relationship_list[:4]

[{'teacher': 'Gustav Zeuner', 'pupil': 'Wilhelm Rontgen (P1901)'},
 {'teacher': 'August Kundt', 'pupil': 'Wilhelm Rontgen (P1901)'},
 {'teacher': 'Victor Regnault', 'pupil': 'Gustav Zeuner'},
 {'teacher': 'Julius Weisbach', 'pupil': 'Gustav Zeuner'}]

In [20]:
# Destination path for the JSON export of the relationship list
output_file_path = './data/nobel-relationship.json'

In [21]:
# Serialise the relationship list as indented JSON and write it to disk
with open(output_file_path, 'w') as json_handle:
    json_handle.write(json.dumps(relationship_list, indent=2))

In [12]:
# Build the sets of distinct pupil names and distinct teacher names.
# Every group string holds one or more single-quoted names; set.update()
# adds all names found in a group at once and silently drops repeats.

# Distinct pupils across every pupil group
all_pupils = set()
for pupil_group in pupils:
    all_pupils.update(re.findall(r"'(.*?)'", pupil_group))

# Distinct teachers across every teacher group
all_teachers = set()
for teacher_group in teachers:
    all_teachers.update(re.findall(r"'(.*?)'", teacher_group))

In [28]:
# Everyone who appears as a teacher, as a pupil, or as both
all_scientists = all_teachers.union(all_pupils)

In [29]:
# Report how many distinct names appear in each role and overall.
# The first line counts pupils, so it is labelled as such.
print(f"There are {len(all_pupils)} pupils listed")
print(f"There are {len(all_teachers)} teachers listed")
print(f"There are {len(all_scientists)} individuals listed")

There are 3083 teatchers liste
There are 2977 teachers listed
There are 3386 individuals listed


---