In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Location of the laureates JSON file, given relative to the working directory
file_path = 'data/laureates-1000.json'

In [3]:
# Parse the laureates file into Python objects.
# JSON is UTF-8 by specification; pass the encoding explicitly so non-ASCII
# characters are decoded correctly instead of relying on the platform's
# default locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# The list of laureate records is stored under the top-level 'laureates' key.
laureates = data['laureates']

In [6]:
# Inspect the first laureate record to see how the JSON data is structured
laureates[:1]

[{'id': '745',
  'knownName': {'en': 'A. Michael Spence', 'se': 'A. Michael Spence'},
  'givenName': {'en': 'A. Michael', 'se': 'A. Michael'},
  'familyName': {'en': 'Spence', 'se': 'Spence'},
  'fullName': {'en': 'A. Michael Spence', 'se': 'A. Michael Spence'},
  'fileName': 'spence',
  'gender': 'male',
  'birth': {'date': '1943-00-00',
   'place': {'city': {'en': 'Montclair, NJ',
     'no': 'Montclair, NJ',
     'se': 'Montclair, NJ'},
    'country': {'en': 'USA', 'no': 'USA', 'se': 'USA'},
    'cityNow': {'en': 'Montclair, NJ',
     'no': 'Montclair, NJ',
     'se': 'Montclair, NJ',
     'sameAs': ['https://www.wikidata.org/wiki/Q678437',
      'https://www.wikipedia.org/wiki/Montclair,_New_Jersey'],
     'latitude': '40.825930',
     'longitude': '-74.209030'},
    'countryNow': {'en': 'USA',
     'no': 'USA',
     'se': 'USA',
     'sameAs': ['https://www.wikidata.org/wiki/Q30'],
     'latitude': '39.828175',
     'longitude': '-98.579500'},
    'continent': {'en': 'North America

In [22]:
# Extract birth dates and names.
# Builds one flat record per laureate that has a birth date: English display
# name, raw birth-date string, number of prizes, and the award year and
# category of the first listed prize.
birth_data = []

for laureate in laureates:
    birth_date = laureate.get('birth', {}).get('date')
    if not birth_date:
        # Without a birth date there is nothing to compare later; skip early
        # so the prize fields are never touched for these entries.
        continue

    prizes = laureate.get('nobelPrizes', [])
    # Guard against a missing or empty prize list: prizes[0] would raise
    # IndexError and abort the whole extraction.
    first_prize = prizes[0] if prizes else {}

    birth_data.append({
        'name': laureate.get('knownName', {}).get('en', ''),
        'birth_date': birth_date,
        'number-of-prizes': len(prizes),
        'year': first_prize.get('awardYear'),
        'category': first_prize.get('category', {}).get('en', ''),
    })

In [23]:
# Spot-check the first five extracted records
birth_data[:5]

[{'name': 'A. Michael Spence',
  'birth_date': '1943-00-00',
  'number-of-prizes': 1,
  'year': '2001',
  'category': 'Economic Sciences'},
 {'name': 'Aage N. Bohr',
  'birth_date': '1922-06-19',
  'number-of-prizes': 1,
  'year': '1975',
  'category': 'Physics'},
 {'name': 'Aaron Ciechanover',
  'birth_date': '1947-10-01',
  'number-of-prizes': 1,
  'year': '2004',
  'category': 'Chemistry'},
 {'name': 'Aaron Klug',
  'birth_date': '1926-08-11',
  'number-of-prizes': 1,
  'year': '1982',
  'category': 'Chemistry'},
 {'name': 'Abdulrazak Gurnah',
  'birth_date': '1948-00-00',
  'number-of-prizes': 1,
  'year': '2021',
  'category': 'Literature'}]

In [24]:
# Turn the list of per-laureate dicts into a table, one row per record
birth_df = pd.DataFrame(data=birth_data)

In [25]:
# Preview the first rows of the birth-date table
birth_df.head()

Unnamed: 0,name,birth_date,number-of-prizes,year,category
0,A. Michael Spence,1943-00-00,1,2001,Economic Sciences
1,Aage N. Bohr,1922-06-19,1,1975,Physics
2,Aaron Ciechanover,1947-10-01,1,2004,Chemistry
3,Aaron Klug,1926-08-11,1,1982,Chemistry
4,Abdulrazak Gurnah,1948-00-00,1,2021,Literature


In [32]:
# Work out which laureates share a birthday (same year, month and day).
# Some birth dates carry '00' for the month/day (e.g. '1943-00-00'), which is
# not a real calendar day. Leave those out: otherwise two laureates with the
# same birth year and such a placeholder would be reported as sharing a birthday.
has_full_birth_date = ~birth_df['birth_date'].str.contains('-00', regex=False)
known_birth_dates = birth_df[has_full_birth_date]

# Find duplicate birth dates: keep=False marks every member of a duplicated
# group, not only the second and later occurrences.
duplicated_birth_dates = (
    known_birth_dates[known_birth_dates.duplicated('birth_date', keep=False)]
    .sort_values('birth_date')
)


In [46]:
# Preview the laureates whose birth date occurs more than once
duplicated_birth_dates.head()

Unnamed: 0,name,birth_date,number-of-prizes,year,category
27,Albert Gobat,1843-05-21,1,1902,Peace
571,Louis Renault,1843-05-21,1,1907,Peace
716,Pieter Zeeman,1865-05-25,1,1902,Physics
490,John R. Mott,1865-05-25,1,1946,Peace
783,Roger Martin du Gard,1881-03-23,1,1937,Literature


In [38]:
# Save the duplicated birth dates to a CSV file.
# A dedicated variable holds the output path so the input `file_path`
# (the laureates JSON) is not overwritten; otherwise re-running the load cell
# would try to parse this CSV as JSON.
duplicates_csv_path = 'data/duplicated-birth-dates.csv'
duplicated_birth_dates.to_csv(duplicates_csv_path, index=False)

In [52]:
# Collapse each shared birth date into a single summary row: names joined into
# one comma-separated string, prize counts summed, years and categories
# gathered into lists.
per_date_aggregations = {
    'name': ', '.join,
    'number-of-prizes': 'sum',
    'year': list,
    'category': list,
}
simple_combination = (
    duplicated_birth_dates
    .groupby('birth_date')
    .agg(per_date_aggregations)
    .reset_index()
)

In [54]:
# Display the summary with one row per shared birth date
simple_combination

Unnamed: 0,birth_date,name,number-of-prizes,year,category
0,1843-05-21,"Albert Gobat, Louis Renault",2,"[1902, 1907]","[Peace, Peace]"
1,1865-05-25,"Pieter Zeeman, John R. Mott",2,"[1902, 1946]","[Physics, Peace]"
2,1881-03-23,"Roger Martin du Gard, Hermann Staudinger",2,"[1937, 1953]","[Literature, Chemistry]"
3,1895-10-30,"Dickinson W. Richards, Gerhard Domagk",2,"[1956, 1939]","[Physiology or Medicine, Physiology or Medicine]"
4,1918-06-18,"Jerome Karle, Franco Modigliani",2,"[1985, 1985]","[Chemistry, Economic Sciences]"
5,1930-03-15,"Martin Karplus, Zhores Alferov",2,"[2013, 2000]","[Chemistry, Physics]"
6,1930-10-10,"Harold Pinter, Yves Chauvin",2,"[2005, 2005]","[Literature, Chemistry]"
7,1932-10-24,"Pierre-Gilles de Gennes, Robert Mundell",2,"[1991, 1999]","[Physics, Economic Sciences]"
8,1936-01-27,"Samuel C.C. Ting, Barry C. Barish",2,"[1976, 2017]","[Physics, Physics]"
9,1938-03-07,"David Baltimore, Albert Fert",2,"[1975, 2007]","[Physiology or Medicine, Physics]"


In [55]:
# Number the laureates within each birth date (1 for the first row, 2 for the
# second, ...) so the two winners can be told apart when the table is pivoted.
position_in_group = duplicated_birth_dates.groupby('birth_date').cumcount()
duplicated_birth_dates['winner'] = position_in_group + 1

In [56]:
# Preview the rows again; the 'winner' helper column should now be present
duplicated_birth_dates.head()

Unnamed: 0,name,birth_date,number-of-prizes,year,category,winner
27,Albert Gobat,1843-05-21,1,1902,Peace,1
571,Louis Renault,1843-05-21,1,1907,Peace,2
716,Pieter Zeeman,1865-05-25,1,1902,Physics,1
490,John R. Mott,1865-05-25,1,1946,Peace,2
783,Roger Martin du Gard,1881-03-23,1,1937,Literature,1


In [57]:
# Reshape to one row per birth date: each winner number becomes its own
# column under every detail field (name, year, category).
detail_fields = ['name', 'year', 'category']
pivoted_data = duplicated_birth_dates.pivot(
    index='birth_date',
    columns='winner',
    values=detail_fields,
)

In [58]:
# Preview the pivoted table
pivoted_data.head()

Unnamed: 0_level_0,name,name,year,year,category,category
winner,1,2,1,2,1,2
birth_date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1843-05-21,Albert Gobat,Louis Renault,1902,1907,Peace,Peace
1865-05-25,Pieter Zeeman,John R. Mott,1902,1946,Physics,Peace
1881-03-23,Roger Martin du Gard,Hermann Staudinger,1937,1953,Literature,Chemistry
1895-10-30,Dickinson W. Richards,Gerhard Domagk,1956,1939,Physiology or Medicine,Physiology or Medicine
1918-06-18,Jerome Karle,Franco Modigliani,1985,1985,Chemistry,Economic Sciences


In [60]:
# Flatten the MultiIndex columns into single labels such as 'name_1', 'year_2'.
# Only do this while the columns are still a MultiIndex: on an already-flat
# frame the labels are plain strings, and indexing them would build names from
# their first two characters (e.g. 'name_1' -> 'n_a'), silently corrupting the
# header if this cell is run a second time.
if isinstance(pivoted_data.columns, pd.MultiIndex):
    pivoted_data.columns = [f'{field}_{rank}' for field, rank in pivoted_data.columns]

In [61]:
# Preview the table after flattening the column labels
pivoted_data.head()

Unnamed: 0_level_0,name_1,name_2,year_1,year_2,category_1,category_2
birth_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1843-05-21,Albert Gobat,Louis Renault,1902,1907,Peace,Peace
1865-05-25,Pieter Zeeman,John R. Mott,1902,1946,Physics,Peace
1881-03-23,Roger Martin du Gard,Hermann Staudinger,1937,1953,Literature,Chemistry
1895-10-30,Dickinson W. Richards,Gerhard Domagk,1956,1939,Physiology or Medicine,Physiology or Medicine
1918-06-18,Jerome Karle,Franco Modigliani,1985,1985,Chemistry,Economic Sciences


In [62]:
# Reset the index to have 'birth_date' as a column.
# Guarded so that re-running the cell is a no-op: a second unconditional
# reset_index() would insert a spurious 'index' column into the table, and
# from there into the saved CSV.
if pivoted_data.index.name == 'birth_date':
    pivoted_data = pivoted_data.reset_index()

In [65]:
# Display the final wide table: one row per shared birth date
pivoted_data

Unnamed: 0,birth_date,name_1,name_2,year_1,year_2,category_1,category_2
0,1843-05-21,Albert Gobat,Louis Renault,1902,1907,Peace,Peace
1,1865-05-25,Pieter Zeeman,John R. Mott,1902,1946,Physics,Peace
2,1881-03-23,Roger Martin du Gard,Hermann Staudinger,1937,1953,Literature,Chemistry
3,1895-10-30,Dickinson W. Richards,Gerhard Domagk,1956,1939,Physiology or Medicine,Physiology or Medicine
4,1918-06-18,Jerome Karle,Franco Modigliani,1985,1985,Chemistry,Economic Sciences
5,1930-03-15,Martin Karplus,Zhores Alferov,2013,2000,Chemistry,Physics
6,1930-10-10,Harold Pinter,Yves Chauvin,2005,2005,Literature,Chemistry
7,1932-10-24,Pierre-Gilles de Gennes,Robert Mundell,1991,1999,Physics,Economic Sciences
8,1936-01-27,Samuel C.C. Ting,Barry C. Barish,1976,2017,Physics,Physics
9,1938-03-07,David Baltimore,Albert Fert,1975,2007,Physiology or Medicine,Physics


In [64]:
# Save the pivoted table (one row per shared birth date) to a CSV file.
# A dedicated variable holds the output path so the input `file_path`
# (the laureates JSON) is not overwritten; otherwise re-running the load cell
# would try to parse this CSV as JSON.
pivoted_csv_path = 'data/duplicated-birth-dates-2.csv'
pivoted_data.to_csv(pivoted_csv_path, index=False)