In [10]:
import plotly.graph_objects as go
import plotly.io as pio

# Define the updated base template
base = go.layout.Template(
    layout=go.Layout(
        paper_bgcolor='#FFF5CC',
        plot_bgcolor='#FFF5CC',
        height=800,
        width=800 * 1.618,
        xaxis=dict(
            anchor='y',
            showgrid=True,
            gridcolor='#888888',  # Darker grid lines
            tickfont=dict(
                size=24,
                family='Open Sans, sans-serif'  # Use Open Sans font
            ),
            titlefont=dict(
                size=26,
                family='Open Sans, sans-serif'  # Use Open Sans font
            ),
            linecolor='#333333',
            linewidth=2  # Adjust the thickness of the x-axis line
        ),
        yaxis=dict(
            anchor='x',
            showgrid=True,
            gridcolor='#888888',  # Darker grid lines
            tickfont=dict(
                size=24,
                family='Open Sans, sans-serif'
            ),
            titlefont=dict(
                size=26,
                family='Open Sans, sans-serif'
            ),
            linecolor='#333333',
            linewidth=2  # Adjust the thickness of the y-axis line
        ),
        font=dict(
            color='#333333',
            size=28,
            family='Open Sans, sans-serif'
        ),
        # Updated colorway to ensure more distinguishable colors
        colorway=["#470945", # H: Violet
                  "#E67E5A", # H: Orange (Sienna)
                  "#297FB9", # H: Blue (Steel)
                  "#163748", # D: Charcoal
                  "#4F1787", # H: Purple
                  "#EFE04E", # H: Yellow (Maize)
                  "#214F70", # D: Indigo
                  "#DF14AA", # H: Pink (Cerise)
                  "#100B1A", # D: Black
                  "#12C4CF", # H: Teal
                  "#14193D", # D: Space
                  "#CC5500"],# H: Cream
        title=go.layout.Title(
            text='',
            font=dict(
                size=34,
                color='#333333',
                family='Open Sans, sans-serif'
            ),
            x=0.05,
        )
    ),
    data=dict(
        scatter=[
            go.Scatter(
                line=dict(width=3)  # Set the line width for scatter plots
            )
        ]
    )
)

# Register the updated base template
pio.templates['base'] = base
pio.templates.default = 'base'

In [13]:
import requests
response = requests.get('https://api.nobelprize.org/2.1/laureates')
print(response.status_code)
print(response.text)


403
<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<title>Nobel Prize</title>
	<style>@font-face{font-display:swap;font-family:Alfred Sans Regular;font-style:normal;font-weight:400;src:url(data:application/vnd.ms-fontobject;base64,bm4AAKZtAAABAAIABAAAAAAABQAAAAAAAAABAJABAAAAAExQBwAAAAEAAAAAAAAAAAAAAJMAACAAAAAAntZZJgAAAAAAAAAAAAAAAAAAAAAAABYAQQBsAGYAcgBlAGQAIABTAGEAbgBzAAAADgBSAGUAZwB1AGwAYQByAAAAGgBWAGUAcgBzAGkAbwBuACAAMQAuADAAMAAwAAAAJgBBAGwAZgByAGUAZAAgAFMAYQBuAHMAIABSAGUAZwB1AGwAYQByAAAAAAADAQ5wAFdYAGP+AFZ8FM3pjMlj2mcKaTm7wz49FWYuKkvmlPaOiPDy8wDyaB4Kd1Pdaxg4JFPkZYG5xnjSU2f6tzEK0WAOGQh3xdr4Eeg2ssT4aZsOWq+BZz8wC/nqBIUYFOJ8ZJMe8hltDsca9DM9PS1WqKbjyjsQ2vN076Kvt7NWk6JlqqUc8Ryxk75StE16tMvn01RoDj8KJ08KYmZDuDywOifMNE3lJNzXv0hp24Szo/CmWSeGFlrdAvmBY9cJWcTBFtZUhosVlJSGtbMTQIFuiMmcYzlU4RN+w5GEPYAXXLoENR4IxPGQ9Dc6X7gMAeF0OmGj4zglkHUoX6Cpp8VQqE2llpOTAa8Vgb7ZCoDx3N7LGwklQi/nCbsvw0oKdHKUP+j8FCpIhggt0RMQYg7aG6nGb5djxQrA3pCHUIURUCDhE4g0UaRKZhaYTBo7Freco1sZZiWetCEJ7/I90k2XYHO3PP0wQim

In [15]:
import requests
import pandas as pd

# Function to fetch all pages of laureates
def fetch_all_laureates():
    base_url = 'https://api.nobelprize.org/2.1/laureates'
    laureates_data = []
    current_page = 0

    while True:
        # Request data from the API with the 'offset' parameter for pagination
        response = requests.get(base_url, params={'offset': current_page})
        data = response.json()

        # Append laureates data from the current page
        laureates_data.extend(data['laureates'])

        # Check if there's more data to fetch
        if 'next' not in data['links']:
            break  # Exit the loop if no more pages
        
        # Move to the next page
        current_page += len(data['laureates'])

    return laureates_data

# Fetch all laureates
all_laureates = fetch_all_laureates()

# List to store processed laureates' data
laureates_data = []

# Processing the data as before
for laureate in all_laureates:
    # Extract top-level laureate information
    laureate_id = laureate.get('id', '')
    known_name = laureate.get('knownName', {}).get('en', '')
    given_name = laureate.get('givenName', {}).get('en', '')
    family_name = laureate.get('familyName', {}).get('en', '')
    full_name = laureate.get('fullName', {}).get('en', '')
    file_name = laureate.get('fileName', '')
    gender = laureate.get('gender', '')
    
    # Extract birth information
    birth_info = laureate.get('birth', {})
    birth_date = birth_info.get('date', '')
    birth_country = birth_info.get('place', {}).get('country', {}).get('en', '')
    
    # Extract death information if available
    death_info = laureate.get('death', {})
    death_date = death_info.get('date', '')
    death_country = death_info.get('place', {}).get('country', {}).get('en', '')
    
    # Extract Wikipedia and other links
    wikipedia = laureate.get('wikipedia', {}).get('english', '')
    wikidata = laureate.get('wikidata', '')
    same_as = laureate.get('sameAs', [])
    
    # Process Nobel Prize details
    for prize in laureate.get('nobelPrizes', []):
        award_year = prize.get('awardYear', '')
        category = prize.get('category', {}).get('en', '')
        category_full_name = prize.get('categoryFullName', {}).get('en', '')
        sort_order = prize.get('sortOrder', '')
        portion = prize.get('portion', '')
        date_awarded = prize.get('dateAwarded', '')
        prize_status = prize.get('prizeStatus', '')
        motivation = prize.get('motivation', {}).get('en', '')
        prize_amount = prize.get('prizeAmount', '')
        prize_amount_adjusted = prize.get('prizeAmountAdjusted', '')

        # Extract affiliations (university, institution)
        affiliations = prize.get('affiliations', [])
        if affiliations:
            university_affiliation = affiliations[0].get('name', {}).get('en', '')
        else:
            university_affiliation = ''

        # Calculate age when prize was awarded
        age = ''
        if birth_date and award_year:
            birth_year = int(birth_date.split('-')[0])
            award_year_int = int(award_year)
            age = award_year_int - birth_year
        
        # Append the laureate data combined with prize data
        laureates_data.append({
            'id': laureate_id,
            'knownName': known_name,
            'givenName': given_name,
            'familyName': family_name,
            'fullName': full_name,
            'fileName': file_name,
            'gender': gender,
            'birth_date': birth_date,
            'birth_country': birth_country,
            'death_date': death_date,
            'death_country': death_country,
            'wikipedia': wikipedia,
            'wikidata': wikidata,
            'sameAs': same_as,
            'awardYear': award_year,
            'category': category,
            'categoryFullName': category_full_name,
            'sortOrder': sort_order,
            'portion': portion,
            'dateAwarded': date_awarded,
            'prizeStatus': prize_status,
            'motivation': motivation,
            'prizeAmount': prize_amount,
            'prizeAmountAdjusted': prize_amount_adjusted,
            'university_affiliation': university_affiliation,
            'age': age
        })

# Create a consolidated DataFrame
# d_laureates = pd.DataFrame(laureates_data)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [16]:
d_laureates = pd.read_csv('d_laureates.csv')

In [17]:
# Count the number of living laureates (where death_date is null)
living_laureates = d_laureates[d_laureates['death_date'].isnull()]

# Get the count of living laureates
count_living = living_laureates['id'].nunique()  # Assuming 'id' is unique for each laureate

# Display the result
print(f"Total number of living laureates: {count_living}")

Total number of living laureates: 324


In [18]:
import pandas as pd
from IPython.display import display, HTML

# Filter the living laureates (where death_date is null)
living_laureates = d_laureates[d_laureates['death_date'].isnull()]

# Remove rows where all the name-related fields ('knownName', 'givenName', 'familyName') are null or empty
living_laureates_cleaned = living_laureates.dropna(subset=['knownName', 'givenName', 'familyName'], how='all')

# Alternatively, if some names may be empty strings (""), we can use:
living_laureates_cleaned = living_laureates_cleaned[
    living_laureates_cleaned[['knownName', 'givenName', 'familyName']].apply(lambda x: x.str.strip().astype(bool)).any(axis=1)
]

# Select relevant columns to display (e.g., id, full name, birth country)
living_laureates_display = living_laureates_cleaned[['id', 'knownName', 'givenName', 'familyName', 'birth_country']]

# Convert DataFrame to HTML and make it scrollable
html_table = living_laureates_display.to_html(index=False, classes='table table-striped', escape=False)

# Style for the table to make it scrollable
scrollable_html = f"""
<div style="height: 400px; overflow-y: scroll; width: 100%; border: 1px solid black;">
    {html_table}
</div>
"""

# Display the scrollable table
display(HTML(scrollable_html))

# Get the count of living laureates
count_living = living_laureates_cleaned['id'].nunique()
print(f"Total number of living laureates: {count_living}")


id,knownName,givenName,familyName,birth_country
68,Chen Ning Yang,Chen Ning,Yang,China
373,James Watson,James,Watson,USA
95,Leon N. Cooper,Leon N.,Cooper,USA
97,Leo Esaki,Leo,Esaki,Japan
98,Ivar Giaever,Ivar,Giaever,Norway
99,Brian D. Josephson,Brian D.,Josephson,United Kingdom
406,David Baltimore,David,Baltimore,USA
536,Mairead Corrigan,Mairead,Corrigan,Northern Ireland
106,Samuel C.C. Ting,Samuel C.C.,Ting,USA
412,Andrew V. Schally,Andrew V.,Schally,Poland


Total number of living laureates: 297


In [19]:
import pandas as pd

# Convert relevant columns to the appropriate data types and sort by 'id' and 'sortOrder'
def process_d_laureates(df):
    # Convert 'id' and 'awardYear' to integers
    df['id'] = pd.to_numeric(df['id'], errors='coerce').astype('Int64')
    df['awardYear'] = pd.to_numeric(df['awardYear'], errors='coerce').astype('Int64')

    # Convert 'birth_date', 'death_date', and 'dateAwarded' to datetime
    df['birth_date'] = pd.to_datetime(df['birth_date'], errors='coerce')
    df['death_date'] = pd.to_datetime(df['death_date'], errors='coerce')
    df['dateAwarded'] = pd.to_datetime(df['dateAwarded'], errors='coerce')

    # Convert 'prizeAmount', 'prizeAmountAdjusted', and 'age' to numeric
    df['prizeAmount'] = pd.to_numeric(df['prizeAmount'], errors='coerce')
    df['prizeAmountAdjusted'] = pd.to_numeric(df['prizeAmountAdjusted'], errors='coerce')
    df['age'] = pd.to_numeric(df['age'], errors='coerce')

    # Other columns can be kept as strings
    df['knownName'] = df['knownName'].astype(str)
    df['givenName'] = df['givenName'].astype(str)
    df['familyName'] = df['familyName'].astype(str)
    df['fullName'] = df['fullName'].astype(str)
    df['fileName'] = df['fileName'].astype(str)
    df['gender'] = df['gender'].astype(str)

    # Sort by 'id' and 'sortOrder'
    df = df.sort_values(by=['awardYear', 'category', 'id', 'sortOrder'], ascending=[True, True, True, True])
    
    return df

# Process the d_laureates DataFrame
d_laureates = process_d_laureates(d_laureates)

# Save the processed DataFrame as a CSV
d_laureates.to_csv('d_laureates.csv', index=False)

In [6]:
import pandas as pd
from IPython.display import display, HTML

# Function to display the DataFrame as a scrollable HTML table with custom ordering
def display_ordered_table(df, order_by=None, ascending=True):
    # If columns are specified for ordering, sort the DataFrame by those columns
    if order_by:
        df = df.sort_values(by=order_by, ascending=ascending)
    
    # Convert DataFrame to HTML
    html_table = df.to_html(index=False, classes='table table-striped', escape=False)
    
    # Scrollable HTML with fixed height and border
    scrollable_html = f"""
    <div style="height: 400px; overflow-y: scroll; width: 100%; border: 1px solid black;">
        {html_table}
    </div>
    """
    
    # Display the scrollable table
    display(HTML(scrollable_html))

# Example of how to call the function to order by 'awardYear' and 'category'
display_ordered_table(d_laureates, order_by=['awardYear', 'category', 'id', 'sortOrder'], ascending=True)

id,knownName,givenName,familyName,fullName,fileName,gender,birth_date,birth_country,death_date,death_country,wikipedia,wikidata,sameAs,awardYear,category,categoryFullName,sortOrder,portion,dateAwarded,prizeStatus,motivation,prizeAmount,prizeAmountAdjusted,university_affiliation,age
160,Jacobus H. van 't Hoff,Jacobus H.,van 't Hoff,Jacobus Henricus van 't Hoff,hoff,male,1852-08-30,the Netherlands,1911-03-01,Germany,https://en.wikipedia.org/wiki/Jacobus_Henricus_van_'t_Hoff,"{'id': 'Q102822', 'url': 'https://www.wikidata.org/wiki/Q102822'}","[https://www.wikidata.org/wiki/Q102822, https://en.wikipedia.org/wiki/Jacobus_Henricus_van_'t_Hoff]",1901,Chemistry,The Nobel Prize in Chemistry,1,1,1901-11-12,received,in recognition of the extraordinary services he has rendered by the discovery of the laws of chemical dynamics and osmotic pressure in solutions,150782,9704878,Berlin University,49.0
569,Sully Prudhomme,Sully,Prudhomme,Sully Prudhomme,prudhomme,male,1839-03-16,France,1907-09-07,France,https://en.wikipedia.org/wiki/Sully_Prudhomme,"{'id': 'Q42247', 'url': 'https://www.wikidata.org/wiki/Q42247'}","[https://www.wikidata.org/wiki/Q42247, https://en.wikipedia.org/wiki/Sully_Prudhomme]",1901,Literature,The Nobel Prize in Literature,1,1,1901-11-14,received,"in special recognition of his poetic composition, which gives evidence of lofty idealism, artistic perfection and a rare combination of the qualities of both heart and intellect",150782,9704878,,62.0
462,Henry Dunant,Henry,Dunant,Jean Henry Dunant,dunant,male,1828-05-08,Switzerland,1910-10-30,Switzerland,https://en.wikipedia.org/wiki/Henry_Dunant,"{'id': 'Q12091', 'url': 'https://www.wikidata.org/wiki/Q12091'}","[https://www.wikidata.org/wiki/Q12091, https://en.wikipedia.org/wiki/Henry_Dunant]",1901,Peace,The Nobel Peace Prize,1,1/2,1901-12-10,received,for his humanitarian efforts to help wounded soldiers and create international understanding,150782,9704878,,73.0
463,Frédéric Passy,Frédéric,Passy,Frédéric Passy,passy,male,1822-05-20,France,1912-06-12,France,https://en.wikipedia.org/wiki/Frédéric_Passy,"{'id': 'Q180409', 'url': 'https://www.wikidata.org/wiki/Q180409'}","[https://www.wikidata.org/wiki/Q180409, https://en.wikipedia.org/wiki/Frédéric_Passy]",1901,Peace,The Nobel Peace Prize,2,1/2,1901-12-10,received,"for his lifelong work for international peace conferences, diplomacy and arbitration",150782,9704878,,79.0
1,Wilhelm Conrad Röntgen,Wilhelm Conrad,Röntgen,Wilhelm Conrad Röntgen,rontgen,male,1845-03-27,Prussia,1923-02-10,Germany,https://en.wikipedia.org/wiki/Wilhelm_Röntgen,"{'id': 'Q35149', 'url': 'https://www.wikidata.org/wiki/Q35149'}","[https://www.wikidata.org/wiki/Q35149, https://en.wikipedia.org/wiki/Wilhelm_Röntgen]",1901,Physics,The Nobel Prize in Physics,1,1,1901-11-12,received,in recognition of the extraordinary services he has rendered by the discovery of the remarkable rays subsequently named after him,150782,9704878,Munich University,56.0
293,Emil von Behring,Emil,von Behring,Emil Adolf von Behring,behring,male,1854-03-15,Prussia,1917-03-31,Germany,https://en.wikipedia.org/wiki/Emil_Adolf_von_Behring,"{'id': 'Q76425', 'url': 'https://www.wikidata.org/wiki/Q76425'}","[https://www.wikidata.org/wiki/Q76425, https://en.wikipedia.org/wiki/Emil_Adolf_von_Behring]",1901,Physiology or Medicine,The Nobel Prize in Physiology or Medicine,1,1,1901-10-30,received,"for his work on serum therapy, especially its application against diphtheria, by which he has opened a new road in the domain of medical science and thereby placed in the hands of the physician a victorious weapon against illness and deaths",150782,9704878,Marburg University,47.0
161,Emil Fischer,Emil,Fischer,Hermann Emil Fischer,fischer,male,1852-10-09,Prussia,1919-07-15,Germany,https://en.wikipedia.org/wiki/Hermann_Emil_Fischer,"{'id': 'Q70554', 'url': 'https://www.wikidata.org/wiki/Q70554'}","[https://www.wikidata.org/wiki/Q70554, https://en.wikipedia.org/wiki/Hermann_Emil_Fischer]",1902,Chemistry,The Nobel Prize in Chemistry,1,1,1902-11-11,received,in recognition of the extraordinary services he has rendered by his work on sugar and purine syntheses,141847,9129789,Berlin University,50.0
571,Theodor Mommsen,Theodor,Mommsen,Christian Matthias Theodor Mommsen,mommsen,male,1817-11-30,Schleswig,1903-11-01,Germany,https://en.wikipedia.org/wiki/Theodor_Mommsen,"{'id': 'Q25351', 'url': 'https://www.wikidata.org/wiki/Q25351'}","[https://www.wikidata.org/wiki/Q25351, https://en.wikipedia.org/wiki/Theodor_Mommsen]",1902,Literature,The Nobel Prize in Literature,1,1,1902-11-13,received,"the greatest living master of the art of historical writing, with special reference to his monumental work, A history of Rome",141847,9129789,,85.0
464,Élie Ducommun,Élie,Ducommun,Élie Ducommun,ducommun,male,1833-02-19,Switzerland,1906-12-07,Switzerland,https://en.wikipedia.org/wiki/Élie_Ducommun,"{'id': 'Q122368', 'url': 'https://www.wikidata.org/wiki/Q122368'}","[https://www.wikidata.org/wiki/Q122368, https://en.wikipedia.org/wiki/Élie_Ducommun]",1902,Peace,The Nobel Peace Prize,1,1/2,1902-12-10,received,for his untiring and skilful directorship of the Bern Peace Bureau,141847,9129789,,69.0
465,Albert Gobat,Albert,Gobat,Charles Albert Gobat,gobat,male,1843-05-21,Switzerland,1914-03-16,Switzerland,https://en.wikipedia.org/wiki/Charles_Albert_Gobat,"{'id': 'Q179458', 'url': 'https://www.wikidata.org/wiki/Q179458'}","[https://www.wikidata.org/wiki/Q179458, https://en.wikipedia.org/wiki/Charles_Albert_Gobat]",1902,Peace,The Nobel Peace Prize,2,1/2,1902-12-10,received,for his eminently practical administration of the Inter-Parliamentary Union,141847,9129789,,59.0


In [7]:
import pandas as pd
import plotly.graph_objects as go

# Function to analyze and plot performance for top X groups (countries/universities)
def analyze_performance(df, group_col, top_x=5, rolling_window=10):
    # Filter out rows where the group column is null or empty
    df = df[df[group_col].notnull() & df[group_col].str.strip().astype(bool)]
    
    # Ensure 'awardYear' is treated as a numeric value and sort accordingly
    df['awardYear'] = pd.to_numeric(df['awardYear'], errors='coerce')
    
    # Group by the chosen column and award year to calculate the number of laureates per year
    performance = df.groupby([group_col, 'awardYear']).size().reset_index(name='laureate_count')
    
    # Calculate rolling average
    performance['rolling_avg'] = performance.groupby(group_col)['laureate_count'].transform(
        lambda x: x.rolling(rolling_window, min_periods=1).mean())

    # Aggregate total laureates by the selected column to identify the top X
    top_performers = performance.groupby(group_col)['laureate_count'].sum().nlargest(top_x).index

    # Filter the performance data to include only the top X performers
    top_performance = performance[performance[group_col].isin(top_performers)]
    
    # Sort the data by 'awardYear' to ensure proper ordering on the x-axis
    top_performance = top_performance.sort_values(by='awardYear')

    # Plot the rolling average for the top X groups
    fig = go.Figure()

    # Add traces for each top group with markers
    for group in top_performance[group_col].unique():
        group_data = top_performance[top_performance[group_col] == group]
        fig.add_trace(go.Scatter(x=group_data['awardYear'], 
                                 y=group_data['rolling_avg'], 
                                 mode='lines+markers',  # Add markers here
                                 name=group))

    # Apply the 'base' template (assuming it's already registered)
    fig.update_layout(template='base', 
                      title=f'Top {top_x} {group_col.capitalize()} Performance Over Time',
                      xaxis_title='Award Year', 
                      yaxis_title='Rolling Average of Laureates')

    # Show the plot
    fig.show()

# Example usage:
# For top 5 countries (passing 'birth_country' directly as group_col):
analyze_performance(d_laureates, group_col='birth_country', top_x=5, rolling_window=1)

# For top 3 universities (passing 'university_affiliation' directly as group_col):
analyze_performance(d_laureates, group_col='university_affiliation', top_x=3, rolling_window=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['awardYear'] = pd.to_numeric(df['awardYear'], errors='coerce')




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [8]:
analyze_performance(d_laureates, group_col='university_affiliation', top_x=5, rolling_window =1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [9]:
import pandas as pd
from IPython.display import display, HTML

# Group by birth_country and count the number of laureates for each country
laureates_by_country = d_laureates.groupby('birth_country').size().reset_index(name='laureate_count')

# Sort by the number of laureates (optional)
laureates_by_country = laureates_by_country.sort_values(by='laureate_count', ascending=False)

# Convert DataFrame to HTML and make it scrollable
html_table = laureates_by_country.to_html(index=False, classes='table table-striped', escape=False)

# Style for the table to make it scrollable
scrollable_html = f"""
<div style="height: 400px; overflow-y: scroll; width: 100%; border: 1px solid black;">
    {html_table}
</div>
"""

# Display the scrollable table
display(HTML(scrollable_html))

birth_country,laureate_count
USA,295
United Kingdom,91
Germany,80
France,58
,31
Sweden,30
Japan,28
Canada,21
the Netherlands,19
Switzerland,19


In [21]:
d_country_mapping = pd.read_csv('d_country_mapping.csv')
d_country_colours = pd.read_csv('d_country_colours.csv')

In [22]:
import pandas as pd
from IPython.display import display, HTML

def laureates_cumulative(d_laureates, d_country_mapping, categories=None):
    """
    Calculates the cumulative count of Nobel laureates by country (flag) and award year,
    attaches country information for each flag, and includes the total laureates up to each year.
    If birth_country is null, assigns a white flag.
    
    Parameters:
        d_laureates (DataFrame): Data containing laureates information.
        d_country_mapping (DataFrame): Data containing country code, flag, and other information.
        categories (list, optional): List of categories to filter by. If None, includes all categories.
    
    Returns:
        DataFrame: Cumulative count of laureates by flag, year, and including total laureates by year.
    """
    
    # Filter by category if provided
    if categories is not None:
        d_laureates = d_laureates[d_laureates['category'].isin(categories)]
    
    # Merge d_laureates with d_country_mapping on birth_country to get the flag and other country info
    d_laureates = d_laureates.merge(d_country_mapping[['birth_country', 'flag']], on='birth_country', how='left')
    
    # Replace null values in the flag column with a white flag without using inplace
    d_laureates['flag'] = d_laureates['flag'].fillna('🏳️')
    
    # Group by flag and award year, then calculate the number of laureates for each country and year
    laureates_by_flag = (
        d_laureates
        .groupby(['flag', 'awardYear'])
        .size()
        .reset_index(name='laureate_count')
    )
    
    # Calculate the cumulative count of laureates for each flag up to each year
    laureates_by_flag['cumulative_laureates'] = laureates_by_flag.groupby('flag')['laureate_count'].cumsum()
    
    # Calculate the total cumulative laureates up to each year across all flags
    # First, calculate total laureates per year
    year_totals = laureates_by_flag.groupby('awardYear')['laureate_count'].sum().cumsum()
    
    # Join the cumulative total laureates to the main DataFrame
    d_laureates_cumulative = laureates_by_flag.merge(year_totals.rename('total_laureates'), on='awardYear', how='left')

    # # Display the DataFrame as a scrollable HTML table
    # display(HTML(d_laureates_cumulative.to_html(
    #     notebook=True, 
    #     table_id="scrollableTable", 
    #     classes="table table-striped table-hover",
    #     border=0
    # )))
    
    # # CSS for scrollable table
    # display(HTML("""
    # <style>
    #     #scrollableTable {
    #         max-height: 300px;
    #         overflow-y: scroll;
    #         display: block;
    #     }
    #     table.table {
    #         width: 100%;
    #         border-collapse: collapse;
    #     }
    #     table.table thead {
    #         position: sticky;
    #         top: 0;
    #         background-color: #f9f9f9;
    #     }
    # </style>
    # """))

    return d_laureates_cumulative

# Example usage:
# Calculate cumulative laureates for each flag, filtered by specific categories
d_laureates_cumulative = laureates_cumulative(d_laureates, d_country_mapping, categories=None)


In [25]:
def fill_year_gaps(d_laureates_cumulative):
    """
    Ensures that each country (based on flag) has a continuous record of award years from the first
    award year they won, filling in gaps with the previous year's values, and calculates total laureates.
    
    Parameters:
        d_laureates_cumulative (DataFrame): DataFrame containing cumulative laureates data by flag and year.
        
    Returns:
        DataFrame: Updated DataFrame with continuous years for each flag.
    """
    # Get the range of years for all countries
    min_year = d_laureates_cumulative['awardYear'].min()
    max_year = d_laureates_cumulative['awardYear'].max()
    
    # Create a complete range of years
    all_years = pd.DataFrame({'awardYear': range(min_year, max_year + 1)})
    flag_data_filled = []

    for flag in d_laureates_cumulative['flag'].unique():
        # Filter data for the current flag
        flag_data = d_laureates_cumulative[d_laureates_cumulative['flag'] == flag]
        
        # Get the first year the flag won a prize
        first_award_year = flag_data['awardYear'].min()
        
        # Filter the years to only include from the first award year onwards
        flag_years = all_years[all_years['awardYear'] >= first_award_year]
        
        # Merge with the existing data for the flag to ensure all years are present
        flag_data_complete = flag_years.merge(flag_data, on='awardYear', how='left')
        
        # Forward-fill missing values to carry forward the previous year’s cumulative laureates
        flag_data_complete['cumulative_laureates'] = flag_data_complete['cumulative_laureates'].ffill()
        
        # Fill the flag column to ensure it's populated
        flag_data_complete['flag'] = flag_data_complete['flag'].fillna(flag)
        
        # Append to the list
        flag_data_filled.append(flag_data_complete)
    
    # Concatenate all flag data
    d_laureates_cumulative_filled = pd.concat(flag_data_filled, ignore_index=True)
    
    # Calculate total laureates by year cumulatively across all countries
    cumulative_total_by_year = (
        d_laureates_cumulative_filled.groupby('awardYear')['laureate_count']
        .sum()
        .cumsum()
        .rename('total_laureates')
    )
    
    # Merge the cumulative total laureates back into the main DataFrame
    d_laureates_cumulative_filled = d_laureates_cumulative_filled.merge(
        cumulative_total_by_year, on='awardYear', how='left'
    )
    
    # Remove the original total_laureates_x column if it exists
    d_laureates_cumulative_filled = d_laureates_cumulative_filled.drop(columns=['total_laureates_x'], errors='ignore')
    
    # Rename total_laureates_y to total_laureates for clarity
    d_laureates_cumulative_filled = d_laureates_cumulative_filled.rename(columns={'total_laureates_y': 'total_laureates'})

    # # Display the DataFrame as a scrollable HTML table
    # display(HTML(d_laureates_cumulative_filled.to_html(
    #     notebook=True, 
    #     table_id="scrollableTable", 
    #     classes="table table-striped table-hover",
    #     border=0
    # )))
    
    # # CSS for scrollable table
    # display(HTML("""
    # <style>
    #     #scrollableTable {
    #         max-height: 300px;
    #         overflow-y: scroll;
    #         display: block;
    #     }
    #     table.table {
    #         width: 100%;
    #         border-collapse: collapse;
    #     }
    #     table.table thead {
    #         position: sticky;
    #         top: 0;
    #         background-color: #f9f9f9;
    #     }
    # </style>
    # """))
    
    return d_laureates_cumulative_filled

# Example usage:
d_laureates_cumulative_filled = fill_year_gaps(d_laureates_cumulative)

In [23]:
import plotly.graph_objects as go

def plot_cumulative_shares_with_colours(d_laureates_cumulative, d_country_colours, width=900, height=600, highlight_countries=None):
    """
    Plots a 100% stacked bar chart showing the cumulative share of Nobel laureates
    by country flag for each award year, with an option to highlight multiple countries and make others gray.
    
    Parameters:
        d_laureates_cumulative (DataFrame): DataFrame with cumulative laureates and total laureates by award year and flag.
        d_country_colours (DataFrame): DataFrame with country color mapping.
        width (int): Width of the plot. Default is 900.
        height (int): Height of the plot. Default is 600.
        highlight_countries (list): List of flags for countries to highlight. Other countries will be gray.
    """
    # Merge d_country_colours to assign a color to each country
    d_laureates_cumulative = d_laureates_cumulative.merge(d_country_colours, left_on='flag', right_on='flag', how='left')

    # Calculate the cumulative share for each country (flag)
    d_laureates_cumulative['cumulative_share'] = (
        d_laureates_cumulative['cumulative_laureates'] / d_laureates_cumulative['total_laureates']
    )

    # Define a list of European countries
    european_countries = [
        '🇩🇪', '🇳🇱', '🇫🇷', '🇨🇭', '🇬🇧', '🇸🇪', '🇩🇰', '🇳🇴', '🇦🇹', '🇧🇪', '🇪🇸', '🇮🇹', '🇵🇱', '🇵🇹', '🇷🇺', '🇬🇷', '🇭🇺', '🇮🇪', '🇱🇺', '🇮🇸', '🇷🇴', '🇺🇦', '🇧🇬'
    ]
    usa = '🇺🇸'

    # Create a custom sort order: European countries first, then USA, then others
    def custom_sort(flag):
        if flag in european_countries:
            return (0, european_countries.index(flag))  # European countries appear first
        elif flag == usa:
            return (1, 0)  # USA comes after European countries
        else:
            return (2, 0)  # All other countries come after

    # Order flags by the custom sort order
    d_laureates_cumulative['custom_order'] = d_laureates_cumulative['flag'].apply(custom_sort)
    d_laureates_cumulative = d_laureates_cumulative.sort_values('custom_order')

    # Get unique flags in the custom order
    ordered_flags = d_laureates_cumulative['flag'].unique().tolist()

    # Filter the DataFrame to include only years up to the most recent year in d_laureates_cumulative
    years = sorted(d_laureates_cumulative['awardYear'].unique())

    # Prepare the data for the plot
    fig = go.Figure()

    for flag in ordered_flags:
        flag_data = d_laureates_cumulative[d_laureates_cumulative['flag'] == flag]
        
        # Check if the country is in the highlighted list
        if highlight_countries and flag not in highlight_countries:
            country_color = '#d3d3d3'  # Gray color for all other countries
        else:
            country_color = flag_data['colour'].iloc[0] if flag_data['colour'].notnull().any() else '#ffffff'  # Original color

        fig.add_trace(go.Bar(
            x=flag_data['awardYear'],
            y=flag_data['cumulative_share'],
            name=flag,
            marker_color=country_color,
            hovertemplate='%{y:.1%} for ' + flag + ' in %{x}<extra></extra>'
        ))

    # Update layout for transparency, 100% stacked bar chart with no gaps, and custom dimensions
    fig.update_layout(
        barmode='stack',
        title='',
        xaxis=dict(visible=False),
        yaxis=dict(visible=False),
        showlegend=False,
        bargap=0,
        width=width,
        height=height,
        paper_bgcolor='rgba(0,0,0,0)',  # Transparent background
        plot_bgcolor='rgba(0,0,0,0)'    # Transparent plot area
    )

    fig.show()

In [26]:
d_country_mapping = pd.read_csv('d_country_mapping.csv')
d_country_colours = pd.read_csv('d_country_colours.csv')
d_laureates_cumulative = laureates_cumulative(d_laureates, d_country_mapping, categories=None)
d_laureates_cumulative_filled = fill_year_gaps(d_laureates_cumulative)
plot_cumulative_shares_with_colours(d_laureates_cumulative_filled, d_country_colours, width=1200, height=800, highlight_countries=['🇩🇪']) #['🇩🇪', '🇳🇱', '🇫🇷', '🇨🇭', '🇬🇧', '🇸🇪', '🇩🇰', '🇳🇴', '🇦🇹', '🇧🇪', '🇪🇸', '🇮🇹', '🇵🇱', '🇵🇹', '🇷🇺', '🇬🇷', '🇭🇺', '🇮🇪', '🇱🇺', '🇮🇸', '🇷🇴', '🇺🇦', '🇧🇬','🇺🇸'])
plot_cumulative_shares_with_colours(d_laureates_cumulative_filled, d_country_colours, width=1200, height=800, highlight_countries=['🇩🇪', '🇳🇱', '🇫🇷', '🇨🇭', '🇬🇧', '🇸🇪', '🇩🇰', '🇳🇴', '🇦🇹', '🇧🇪', '🇪🇸', '🇮🇹', '🇵🇱', '🇵🇹', '🇷🇺', '🇬🇷', '🇭🇺', '🇮🇪', '🇱🇺', '🇮🇸', '🇷🇴', '🇺🇦', '🇧🇬'])
plot_cumulative_shares_with_colours(d_laureates_cumulative_filled, d_country_colours, width=1200, height=800, highlight_countries=['🇺🇸','🇬🇧','🇩🇪'])

In [40]:
d_prizes_pop = pd.read_csv('d_prizes_pop.csv')
# Calculate prizes per capita
d_prizes_pop['Prizes_per_Capita'] = 1000000 * d_prizes_pop['Number of Prizes'] / d_prizes_pop['Population']

# Sort by 'Prizes_per_Capita' in descending order
d_prizes_pop_sorted = d_prizes_pop.sort_values(by='Prizes_per_Capita', ascending=False)

# Display the sorted DataFrame
d_prizes_pop_sorted[['Country', 'Prizes_per_Capita', 'Number of Prizes']].head(50)


Unnamed: 0,Country,Prizes_per_Capita,Number of Prizes
58,Faroe Islands,18.050542,1
46,Saint Lucia,11.053938,2
4,Sweden,3.220641,34
44,Luxembourg,3.015586,2
8,Switzerland,2.79895,25
9,Austria,2.761694,25
61,Iceland,2.643621,1
16,Norway,2.521425,14
15,Denmark,2.381428,14
20,Ireland,2.179432,11


In [16]:
import plotly.express as px

# Define a list of European countries in your dataset
european_countries = [
    "United Kingdom", "Germany", "France", "Sweden", "Russia/Soviet Union", "Switzerland", "Austria", "Netherlands",
    "Italy", "Hungary", "Poland", "Denmark", "Norway", "Ireland", "Belgium", "Spain", "Czech Republic", "Finland",
    "Romania", "Ukraine", "Belarus", "Croatia", "Lithuania", "Bosnia and Herzegovina", "Greece", "Portugal", 
    "Luxembourg", "Turkey", "Iceland", "Cyprus", "Latvia", "Slovenia", "North Macedonia"
]

# Filter the dataframe for European countries
d_europe = d_prizes_pop[d_prizes_pop['Country'].isin(european_countries)].copy()

# Define the bins and labels for the new ranges
bins = [0, 10, 25, 50, d_europe['Number of Prizes'].max()]
labels = ['0-10', '11-25', '26-50', '50+']

# Create a new column in d_europe with the bucket labels
d_europe.loc[:, 'Prize Bucket'] = pd.cut(d_europe['Number of Prizes'], bins=bins, labels=labels)

# Define a color scale for the buckets
color_scale = {
    '0-10': '#fff5cc',    # 
    '11-25': '#efe04e',   # Yellow
    '26-50': '#e67e5a',   # Orange
    '50+': '#4f1787'      # Dark Red
}

# Map the color scale to the Prize Bucket column
d_europe.loc[:, 'Color'] = d_europe['Prize Bucket'].map(color_scale)

# Create the choropleth map
fig = px.choropleth(
    d_europe,
    locations="Country",
    locationmode="country names",
    color="Prize Bucket",
    color_discrete_map=color_scale,
    title="Nobel Prizes by European Country",
    labels={"Number of Prizes": "Number of Nobel Prizes"},
    scope="europe"  # Focus on Europe
)

# Update layout to remove color bar and improve visualization
fig.update_layout(
    width=1800,  # Set the width of the plot
    height=1200,  # Set the height of the plot
    geo=dict(
        showframe=False,
        showcoastlines=True,
        coastlinecolor="DarkGrey",
        projection_type="mercator",  # Set the projection type
        center=dict(lat=55, lon=15),  # Center the map
        lonaxis=dict(range=[-30.0, 60.0]),  # Adjust the longitude range to make the map wider
        lataxis=dict(range=[30.0, 75.0])    # Adjust the latitude range to keep focus on Europe
    ),
    coloraxis_showscale=False,  # Fully hides the color bar
    showlegend=False  # Remove the legend
)

# Show the plot
fig.show()