<a href="https://colab.research.google.com/github/chris-creditdesign/nobels-data-investigation/blob/main/noble_migration_investigation_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Nobel Migration investigation 2025

In [95]:
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt

### Download the data from the [Nobel Prize API](https://www.nobelprize.org/about/developer-zone-2/)

In [5]:
def fetch_nobel_data(url, prams=None):
    response = requests.get(url, params=prams)
    response.raise_for_status()
    return response.json()

In [17]:
def download_complete_dataset(endpoint):
    base_url = f"https://api.nobelprize.org/2.1/{endpoint}"
    all_data = []
    offset = 0
    limit = 25
    # params = {"offset": offset, "limit": limit}
    # data = fetch_nobel_data(base_url, params)

    # print(data)

    while True:
        params = {"offset": offset, "limit": limit}
        data = fetch_nobel_data(base_url, params)
        all_data.extend(data[endpoint])

        # Check if there is a next page
        if "next" not in data["links"]:
            break
        offset += limit

    return all_data

In [24]:
data_laureates = download_complete_dataset("laureates")

In [27]:
print(f"There are {len(data_laureates)} laureates")

There are 1004 laureates


In [28]:
data_prizes = download_complete_dataset("nobelPrizes")

In [29]:
print(f"There are {len(data_prizes)} prizes")

There are 676 prizes


## Format and filter the Laureates data

We should only include data for science prizes, exclude peace, literature and economics.

For each Laureate we should collect their id, name, birthdate, birth country and birth country now.

For each of their prizes, we should collect the award year, category, and country and country now of their first affilitaion.

In [90]:
print(json.dumps(data_laureates[0], indent=2))

{
  "id": "745",
  "knownName": {
    "en": "A. Michael Spence",
    "se": "A. Michael Spence"
  },
  "givenName": {
    "en": "A. Michael",
    "se": "A. Michael"
  },
  "familyName": {
    "en": "Spence",
    "se": "Spence"
  },
  "fullName": {
    "en": "A. Michael Spence",
    "se": "A. Michael Spence"
  },
  "fileName": "spence",
  "gender": "male",
  "birth": {
    "date": "1943-00-00",
    "place": {
      "city": {
        "en": "Montclair, NJ",
        "no": "Montclair, NJ",
        "se": "Montclair, NJ"
      },
      "country": {
        "en": "USA",
        "no": "USA",
        "se": "USA"
      },
      "cityNow": {
        "en": "Montclair, NJ",
        "no": "Montclair, NJ",
        "se": "Montclair, NJ",
        "sameAs": [
          "https://www.wikidata.org/wiki/Q678437",
          "https://www.wikipedia.org/wiki/Montclair,_New_Jersey"
        ],
        "latitude": "40.825930",
        "longitude": "-74.209030"
      },
      "countryNow": {
        "en": "USA",
    

In [112]:
reformatted_laureates = []
for laureate in data_laureates:
    reformatted_prizes = []
    if "nobelPrizes" in laureate:
        for prize in laureate["nobelPrizes"]:
            if prize["category"]["en"] in ["Peace", "Literature", "Economic Sciences"]:
              break
            else:
              if "affiliations" in prize and len(prize["affiliations"]) > 0:
                affiliation = prize["affiliations"][0]
                reformatted_prizes.append({
                    "awardYear": prize["awardYear"],
                    "category": prize["category"]["en"],
                    "affiliation_country": affiliation["country"]["en"] if "country" in affiliation and "en" in affiliation["country"] else "N/A",
                    "affiliation_countryNow": affiliation["countryNow"]["en"] if "countryNow" in affiliation and "en" in affiliation["countryNow"] else "N/A"
                })

              reformatted_laureates.append({
                  "id": laureate["id"],
                  "name": laureate["knownName"]["en"] if "knownName" in laureate and "en" in laureate["knownName"] else "N/A",
                  "birth_date": laureate["birth"]["date"] if "birth" in laureate and "date" in laureate["birth"] else "N/A",
                  "birth_country": laureate["birth"]["place"]["country"]["en"] if "birth" in laureate and "place" in laureate["birth"] and "country" in laureate["birth"]["place"] and "en" in laureate["birth"]["place"]["country"] else "N/A",
                  "birth_countryNow": laureate["birth"]["place"]["countryNow"]["en"] if "birth" in laureate and "place" in laureate["birth"] and "countryNow" in laureate["birth"]["place"] and "en" in laureate["birth"]["place"]["countryNow"] else "N/A",
                  "prizes": reformatted_prizes
              })

In [113]:
print(json.dumps(reformatted_laureates[4], indent=2))

{
  "id": "843",
  "name": "Ada E. Yonath",
  "birth_date": "1939-06-22",
  "birth_country": "British Mandate of Palestine",
  "birth_countryNow": "Israel",
  "prizes": [
    {
      "awardYear": "2009",
      "category": "Chemistry",
      "affiliation_country": "Israel",
      "affiliation_countryNow": "Israel"
    }
  ]
}


In [114]:
collected_data = []

for laureate in reformatted_laureates:
    for prize in laureate["prizes"]:
        collected_data.append({
            "id": laureate["id"],
            "name": laureate["name"],
            "birth_country": laureate["birth_country"],
            "birth_countryNow": laureate["birth_countryNow"],
            "awardYear": prize["awardYear"],
            "category": prize["category"],
            "affiliation_country": prize["affiliation_country"],
            "affiliation_countryNow": prize["affiliation_countryNow"]
        })

In [115]:
print(json.dumps(collected_data[0], indent=2))

{
  "id": "102",
  "name": "Aage N. Bohr",
  "birth_country": "Denmark",
  "birth_countryNow": "Denmark",
  "awardYear": "1975",
  "category": "Physics",
  "affiliation_country": "Denmark",
  "affiliation_countryNow": "Denmark"
}


In [116]:
print(f"There are {len(collected_data)} prizes.")

There are 655 prizes.


In [96]:
df_collected_data = pd.DataFrame(collected_data)
df_collected_data.to_csv("nobel_prize_migration_data.csv", index=False)

## Start to analise the data

In [117]:
df = pd.DataFrame(collected_data)

In [98]:
df.head()

Unnamed: 0,id,name,birth_country,birth_countryNow,awardYear,category,affliation_country,affliation_countryNow
0,102,Aage N. Bohr,Denmark,Denmark,1975,Physics,Denmark,Denmark
1,779,Aaron Ciechanover,British Protectorate of Palestine,Israel,2004,Chemistry,Israel,Israel
2,259,Aaron Klug,Lithuania,Lithuania,1982,Chemistry,United Kingdom,United Kingdom
3,114,Abdus Salam,India,Pakistan,1979,Physics,Italy,Italy
4,843,Ada E. Yonath,British Mandate of Palestine,Israel,2009,Chemistry,Israel,Israel


In [118]:
total_prizes = df.shape[0]
print(f"The total number of prizes is {total_prizes}")

The total number of prizes is 655


In [119]:
migration_count = df[df['birth_countryNow'] != df['affiliation_countryNow']].shape[0]
print(f"Number of entries where birth_countryNow is not equal to affiliation_countryNow: {migration_count}")

Number of entries where birth_countryNow is not equal to affiliation_countryNow: 212


In [120]:
print(f"The proportion of entries where the laureate has migrated is {round(migration_count/total_prizes * 100, 1)}%")

The proportion of entries where the laureate has migrated is 32.4%


In [146]:
migrated_prizes_per_year = df[df['birth_countryNow'] != df['affiliation_countryNow']].groupby('awardYear').size().reset_index(name='migration_count')
total_prizes_per_year = df.groupby('awardYear').size().reset_index(name='award_count')
merged_counts = pd.merge(total_prizes_per_year, migrated_prizes_per_year, on='awardYear', how='left')
merged_counts['migration_count'] = merged_counts['migration_count'].fillna(0).astype(int)

In [147]:
merged_counts.tail(10)

Unnamed: 0,awardYear,award_count,migration_count
110,2015,8,3
111,2016,7,4
112,2017,9,5
113,2018,8,0
114,2019,9,3
115,2020,8,2
116,2021,7,3
117,2022,8,1
118,2023,8,5
119,2024,7,2


In [149]:
merged_counts['proportion_migration'] = merged_counts['migration_count'] / merged_counts['award_count']

In [150]:
merged_counts.tail(10)

Unnamed: 0,awardYear,award_count,migration_count,proportion_migration
110,2015,8,3,0.375
111,2016,7,4,0.571429
112,2017,9,5,0.555556
113,2018,8,0,0.0
114,2019,9,3,0.333333
115,2020,8,2,0.25
116,2021,7,3,0.428571
117,2022,8,1,0.125
118,2023,8,5,0.625
119,2024,7,2,0.285714


In [151]:
merged_counts.to_csv("nobel_prize_migration_merged_counts.csv", index=False)