In [1]:
import json
import pandas as pd

In [2]:
file_path = "data/prizes-1000.json"

# Load the prizes file into a dict. The data contains non-ASCII text (e.g. the
# Swedish and Norwegian strings), so the encoding is given explicitly instead of
# relying on the platform's locale default.
# Assumes the file is UTF-8 encoded, the standard encoding for JSON.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [4]:
# Pretty-print the first entry of the 'nobelPrizes' list to inspect its structure.
first_prize = data['nobelPrizes'][0]
print(json.dumps(first_prize, indent=2))

{
  "awardYear": "1901",
  "category": {
    "en": "Chemistry",
    "no": "Kjemi",
    "se": "Kemi"
  },
  "categoryFullName": {
    "en": "The Nobel Prize in Chemistry",
    "no": "Nobelprisen i kjemi",
    "se": "Nobelpriset i kemi"
  },
  "dateAwarded": "1901-11-12",
  "prizeAmount": 150782,
  "prizeAmountAdjusted": 10531894,
  "links": [
    {
      "rel": "nobelPrize",
      "href": "https://api.nobelprize.org/2/nobelPrize/che/1901",
      "action": "GET",
      "types": "application/json"
    }
  ],
  "laureates": [
    {
      "id": "160",
      "knownName": {
        "en": "Jacobus H. van 't Hoff"
      },
      "fullName": {
        "en": "Jacobus Henricus van 't Hoff"
      },
      "portion": "1",
      "sortOrder": "1",
      "motivation": {
        "en": "in recognition of the extraordinary services he has rendered by the discovery of the laws of chemical dynamics and osmotic pressure in solutions",
        "se": "s\u00e5som ett erk\u00e4nnande av den utomordentliga f\u00f

In [10]:
# We want to know how many awards were attributed to each country in each year.
# The prizes data file gives us the year and the list of laureates for each award, but not the country.
# We therefore need the laureates data file to get the country information.

In [8]:
file_path_laureates = "data/laureates-1000.json"

# Load the laureates file. The encoding is given explicitly because the records
# contain non-ASCII text and the locale default encoding varies by platform.
# Assumes the file is UTF-8 encoded, the standard encoding for JSON.
with open(file_path_laureates, 'r', encoding='utf-8') as file:
    data_laureates = json.load(file)

# The top-level 'laureates' key holds the list of laureate records.
laureates = data_laureates['laureates']

In [9]:
# Pretty-print the first laureate record to inspect its structure.
first_laureate = laureates[0]
print(json.dumps(first_laureate, indent=2))

{
  "id": "745",
  "knownName": {
    "en": "A. Michael Spence",
    "se": "A. Michael Spence"
  },
  "givenName": {
    "en": "A. Michael",
    "se": "A. Michael"
  },
  "familyName": {
    "en": "Spence",
    "se": "Spence"
  },
  "fullName": {
    "en": "A. Michael Spence",
    "se": "A. Michael Spence"
  },
  "fileName": "spence",
  "gender": "male",
  "birth": {
    "date": "1943-00-00",
    "place": {
      "city": {
        "en": "Montclair, NJ",
        "no": "Montclair, NJ",
        "se": "Montclair, NJ"
      },
      "country": {
        "en": "USA",
        "no": "USA",
        "se": "USA"
      },
      "cityNow": {
        "en": "Montclair, NJ",
        "no": "Montclair, NJ",
        "se": "Montclair, NJ",
        "sameAs": [
          "https://www.wikidata.org/wiki/Q678437",
          "https://www.wikipedia.org/wiki/Montclair,_New_Jersey"
        ],
        "latitude": "40.825930",
        "longitude": "-74.209030"
      },
      "countryNow": {
        "en": "USA",
    

In [11]:
# A laureate may hold several prizes: flatten every laureate's
# 'nobelPrizes' list into one list of award records.

awards = [
    prize
    for person in laureates
    for prize in person['nobelPrizes']
]

In [12]:
# Pretty-print the first flattened award record to inspect its structure.
first_award = awards[0]
print(json.dumps(first_award, indent=2))

{
  "awardYear": "2001",
  "category": {
    "en": "Economic Sciences",
    "no": "\u00d8konomi",
    "se": "Ekonomi"
  },
  "categoryFullName": {
    "en": "The Sveriges Riksbank Prize in Economic Sciences in Memory of Alfred Nobel",
    "no": "Sveriges Riksbanks pris i \u00f8konomisk vitenskap til minne om Alfred Nobel",
    "se": "Sveriges Riksbanks pris i ekonomisk vetenskap till Alfred Nobels minne"
  },
  "sortOrder": "2",
  "portion": "1/3",
  "dateAwarded": "2001-10-10",
  "prizeStatus": "received",
  "motivation": {
    "en": "for their analyses of markets with asymmetric information",
    "se": "f\u00f6r deras analys av marknader med assymetrisk informations"
  },
  "prizeAmount": 10000000,
  "prizeAmountAdjusted": 13927869,
  "affiliations": [
    {
      "name": {
        "en": "Stanford University",
        "no": "Stanford University",
        "se": "Stanford University"
      },
      "nameNow": {
        "en": "Stanford University"
      },
      "city": {
        "en": 

In [34]:
# From the awards list, extract the award year ('awardYear') and the country of
# the first affiliation listed on the award ('affiliations[0]/country/en').
# Note: this is the affiliation's country, not the laureate's birth country.
# Awards with no affiliation or no country are recorded as 'Unknown'.

# Categories left out of the per-country counts.
excluded_categories = {'Literature', 'Peace', 'Economic Sciences'}

awards_data = []

for award in awards:
    category = award.get('category', {}).get('en')
    if category in excluded_categories:
        continue
    year = int(award['awardYear'])
    # `or [{}]` covers both a missing key and an empty list; indexing [0] on an
    # empty affiliations list would otherwise raise IndexError.
    affiliations = award.get('affiliations') or [{}]
    country = affiliations[0].get('country', {}).get('en', 'Unknown')
    awards_data.append({'year': year, 'country': country})

In [35]:
# Number of award records kept after the category filter.
len(awards_data)

646

In [36]:
# Create a DataFrame from the awards data list.
# The columns are named explicitly so the frame keeps its 'year' and 'country'
# columns even when awards_data is empty; otherwise the later sort and group-by
# on those columns would fail with a KeyError.

df = pd.DataFrame(awards_data, columns=['year', 'country'])

In [37]:
# Preview the first rows of the (year, country) records.
df.head()

Unnamed: 0,year,country
0,1975,Denmark
1,2004,Israel
2,1982,United Kingdom
3,1979,Italy
4,2009,Israel


In [38]:
# Order the rows chronologically by award year.

df = df.sort_values(by='year', ascending=True)

In [39]:
# Preview the first rows again, now in year order.
df.head()

Unnamed: 0,year,country
152,1901,Germany
619,1901,Germany
290,1901,Germany
151,1902,Germany
475,1902,the Netherlands


In [40]:
# Tally the awards for every (year, country) pair; the tally is stored in a
# 'count' column.

pair_sizes = df.groupby(['year', 'country']).size()
df_grouped = pair_sizes.reset_index(name='count')

In [41]:
# Display the per-(year, country) award counts.
df_grouped

Unnamed: 0,year,country,count
0,1901,Germany,3
1,1902,Germany,1
2,1902,United Kingdom,1
3,1902,the Netherlands,2
4,1903,Denmark,1
...,...,...,...
351,2022,USA,3
352,2023,Germany,1
353,2023,Hungary,1
354,2023,Sweden,1


In [42]:
# Export the per-(year, country) counts.
# Other cells add 'total' and 'proportion' columns to df_grouped in place, so
# the exported columns are pinned here: the file content then does not depend
# on whether those cells have already been run in this kernel session.
df_grouped.to_csv(
    'data/nobel_prizes_by_country.csv',
    columns=['year', 'country', 'count'],
    index=False,
)

In [29]:
# Add each country's share of its year's awards, then export the table.
# Note: this adds the 'total' and 'proportion' columns to df_grouped in place.

year_totals = df_grouped.groupby('year')['count'].transform('sum')
df_grouped['total'] = year_totals
df_grouped['proportion'] = df_grouped['count'].div(year_totals)

df_grouped.to_csv('data/nobel_prizes_by_country_proportion.csv', index=False)

In [30]:
# Display the table including the 'total' and 'proportion' columns.
# NOTE(review): the saved output of this cell looks stale. It lists 'Unknown'
# rows for 1901 that the count table displayed earlier does not contain, and
# its execution number is lower than that of the grouping cell. Re-run the
# notebook top to bottom to refresh it.
df_grouped

Unnamed: 0,year,country,count,total,proportion
0,1901,Germany,3,6,0.500000
1,1901,Unknown,3,6,0.500000
2,1902,Germany,1,7,0.142857
3,1902,United Kingdom,1,7,0.142857
4,1902,Unknown,3,7,0.428571
...,...,...,...,...,...
473,2023,Germany,1,11,0.090909
474,2023,Hungary,1,11,0.090909
475,2023,Sweden,1,11,0.090909
476,2023,USA,6,11,0.545455


In [32]:
# Show only the rows of df_grouped for award year 2023.

is_2023 = df_grouped['year'].eq(2023)
df_grouped.loc[is_2023]

Unnamed: 0,year,country,count,total,proportion
473,2023,Germany,1,11,0.090909
474,2023,Hungary,1,11,0.090909
475,2023,Sweden,1,11,0.090909
476,2023,USA,6,11,0.545455
477,2023,Unknown,2,11,0.181818


In [45]:
# Share of each year's awards per country (this repeats the steps of the
# earlier proportion cell against the current df_grouped).

per_year_counts = df_grouped['count'].groupby(df_grouped['year'])
df_grouped['total'] = per_year_counts.transform('sum')
df_grouped['proportion'] = df_grouped['count'] / df_grouped['total']

df_grouped.to_csv(path_or_buf='data/nobel_prizes_by_country_proportion.csv', index=False)
