In [1]:
# Day5 - Ranking

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px


In [2]:
# Mount Google Drive at /content/gdrive so the workbooks stored under "My Drive"
# can be opened by plain file path in the cells below.
# NOTE(review): google.colab is supplied by the Colab runtime; this cell is not
# expected to work in a local Jupyter kernel — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:

# Workbook path on the mounted Drive -> report year it belongs to.
# Insertion order (oldest first) is the order the files are loaded in.
files = dict([
    ('/content/gdrive/My Drive/DataForFigure21WHR2020.xls', 2020),
    ('/content/gdrive/My Drive/DataForFigure21WHR2021.xls', 2021),
    ('/content/gdrive/My Drive/DataForFigure21WHR2022.xls', 2022),
    ('/content/gdrive/My Drive/DataForFigure21WHR2023.xls', 2023),
    ('/content/gdrive/My Drive/DataForFigure21WHR2024.xls', 2024),
])




# Initialize list to collect cleaned DataFrames
dfs = []

# Header spellings accepted for the two columns we need; the first one present
# in a workbook wins. Extend these lists if a file uses a different header.
possible_country_cols = ['Country name', 'Country', 'country']
possible_score_cols = ['Ladder score', 'Happiness Score', 'Score']


def _pick_column(frame, candidates, source):
    """Return the first name in `candidates` that is a column of `frame`.

    Parameters:
        frame: DataFrame whose columns are searched.
        candidates: column names to try, in priority order.
        source: label (here the file path) used only in the error message.

    Raises:
        ValueError: if none of the candidates is present. The message names
            the source and lists the columns that were actually found, instead
            of the bare StopIteration that next() on an exhausted generator
            would raise.
    """
    for name in candidates:
        if name in frame.columns:
            return name
    raise ValueError(
        f"None of the expected columns {candidates} found in {source!r}; "
        f"available columns: {list(frame.columns)}"
    )


for file, year in files.items():
    df = pd.read_excel(file)

    # Find correct column names for this workbook
    country_col = _pick_column(df, possible_country_cols, file)
    score_col = _pick_column(df, possible_score_cols, file)

    # Keep only the two columns of interest under uniform names, tagged with the year
    df = df[[country_col, score_col]].copy()
    df.columns = ['Country', 'Happiness Score']
    df['Year'] = year

    dfs.append(df)

# Stack the per-year frames into one long table with a fresh 0..n-1 index
combined_df = pd.concat(dfs, ignore_index=True)

# Position of each country within its own year: rank 1 is the highest score,
# and method='first' breaks ties by row order
scores_by_year = combined_df.groupby('Year')['Happiness Score']
combined_df['Rank'] = scores_by_year.rank(method='first', ascending=False)

# Keep rows ranked 1-10 in their year, ordered by year then rank for the bump chart
in_top10 = combined_df['Rank'] <= 10
top10_df = combined_df.loc[in_top10].sort_values(by=['Year', 'Rank'])

# Preview cleaned data
print(top10_df.head(15))


         Country  Happiness Score  Year  Rank
0        Finland           7.8087  2020   1.0
1        Denmark           7.6456  2020   2.0
2    Switzerland           7.5599  2020   3.0
3        Iceland           7.5045  2020   4.0
4         Norway           7.4880  2020   5.0
5    Netherlands           7.4489  2020   6.0
6         Sweden           7.3535  2020   7.0
7    New Zealand           7.2996  2020   8.0
8        Austria           7.2942  2020   9.0
9     Luxembourg           7.2375  2020  10.0
153      Finland           7.8421  2021   1.0
154      Denmark           7.6195  2021   2.0
155  Switzerland           7.5715  2021   3.0
156      Iceland           7.5539  2021   4.0
157  Netherlands           7.4640  2021   5.0


In [10]:
# prompt: create a visualization that illustrates ranking chart

import plotly.graph_objects as go

# One lines+markers trace per country, in order of first appearance in top10_df
rank_traces = [
    go.Scatter(
        x=top10_df.loc[top10_df['Country'] == country_name, 'Year'],
        y=top10_df.loc[top10_df['Country'] == country_name, 'Rank'],
        mode='lines+markers',
        name=country_name,
    )
    for country_name in top10_df['Country'].unique()
]
fig = go.Figure(data=rank_traces)

# Rank 1 is the best position, so the y-axis is reversed to draw it at the top
fig.update_layout(
    title='Happiness Ranking Over Time (Top 10 Countries)',
    xaxis_title='Year',
    yaxis_title='Rank (1 is highest)',
    yaxis_autorange='reversed',
)

fig.show()


In [13]:
# prompt: can you make it more narrow? and what is the code to generate a .png



fig = go.Figure()

# Add each country's (Year, Rank) series as its own line
for country_name in top10_df['Country'].unique():
    is_country = top10_df['Country'] == country_name
    country_rows = top10_df[is_country]
    trace = go.Scatter(
        x=country_rows['Year'],
        y=country_rows['Rank'],
        mode='lines+markers',
        name=country_name,
    )
    fig.add_trace(trace)

# Titles plus a reversed y-axis so that rank 1 sits at the top
layout_options = dict(
    title='Happiness Ranking Over Time (Top 10 Countries)',
    xaxis_title='Year',
    yaxis_title='Rank (1 is highest)',
    yaxis_autorange='reversed',
)
fig.update_layout(**layout_options)

# PNG export is left disabled; uncomment the next line to write the chart to disk
#fig.write_image("happiness_ranking.png")
fig.show()

In [14]:
# prompt: is it possible to do the chart without half year points?


# ... (Your existing code for data loading and preprocessing) ...

countries_in_order = top10_df['Country'].unique()
years_present = top10_df['Year'].unique()

fig = go.Figure()
for country_name in countries_in_order:
    rows = top10_df[top10_df['Country'] == country_name]
    fig.add_trace(
        go.Scatter(
            x=rows['Year'],
            y=rows['Rank'],
            mode='lines+markers',
            name=country_name,
        )
    )

# Reversed y-axis keeps rank 1 at the top of the chart
fig.update_layout(
    title='Happiness Ranking Over Time (Top 10 Countries)',
    xaxis_title='Year',
    yaxis_title='Rank (1 is highest)',
    yaxis_autorange='reversed',
)

# Put x-axis ticks only on the years that occur in the data, so no
# half-year tick labels are drawn between them
fig.update_xaxes(tickvals=years_present)

fig.show()


In [15]:
# prompt: perfect, is it possible to decrease the overall width of the chart and reduce the distance between years on the x-axis?

# Resize the figure built in the previous cell to a width of 800 and place
# one x-axis tick per year, counting from 2020. Titles and the reversed
# y-axis are restated so the layout is complete in a single call.
fig.update_layout(
    width=800,  # Adjust width as needed
    title='Happiness Ranking Over Time (Top 10 Countries)',
    xaxis_title='Year',
    yaxis_title='Rank (1 is highest)',
    yaxis_autorange='reversed',
    xaxis_tickmode='linear',
    xaxis_tick0=2020,
    xaxis_dtick=1,
)


In [16]:
# prompt: can you add a caption in the bottom left below the x-axis label 'Year' that says 'Source :https://worldhappiness.report/'

# Source caption in the bottom-left margin, below the x-axis title.
# The position uses "paper" coordinates (fractions of the plotting area, with
# (0, 0) at its bottom-left corner) instead of data coordinates: on the
# reversed rank axis a data value of y=0 lies above rank 1, i.e. at the top
# of the chart, not under the x-axis.
fig.add_annotation(
    x=0,      # left edge of the plotting area
    y=-0.2,   # negative = inside the bottom margin; adjust if it collides with the axis title
    xref="paper",
    yref="paper",
    text="Source :https://worldhappiness.report/",
    showarrow=False,
    xanchor="left",  # caption starts at x and extends to the right
    yanchor="top"    # caption hangs downward from y
)


In [None]:
# prompt: is there a correlation between the order of the countries in the legend and the current year ranking?

import pandas as pd
import plotly.graph_objects as go

# ... (Your existing code for data loading and processing) ...

# Compare, year by year, the row order of countries with their rank order
def check_legend_ranking_correlation(df):
    """Print, for each year, whether countries appear in rank order.

    For every distinct value in df['Year'] the rows of that year are taken and
    two sequences are built:
      * "legend order"  - the distinct countries in the order their rows occur
      * "ranking order" - the countries after sorting those rows by 'Rank'
    Both sequences are printed, followed by a line saying whether they are
    the same, then a 20-dash separator.

    Parameters:
        df: DataFrame with 'Year', 'Country' and 'Rank' columns.

    Returns:
        None; all results go to standard output.
    """
    separator = "-" * 20

    for yr in df['Year'].unique():
        rows_for_year = df.loc[df['Year'] == yr]
        legend_order = rows_for_year['Country'].unique()
        ranking_order = rows_for_year.sort_values('Rank')['Country'].tolist()

        print(f"Year: {yr}")
        print("Legend Order:", legend_order)
        print("Ranking Order:", ranking_order)

        # Plain element-wise equality of the two sequences
        orders_match = legend_order.tolist() == ranking_order
        message = (
            "Legend order and ranking order are the SAME for this year."
            if orders_match
            else "Legend order and ranking order are DIFFERENT for this year."
        )
        print(message)
        print(separator)

# Run the per-year comparison on the top-10 table; the function prints its
# findings and returns nothing, so this cell produces text output only
check_legend_ranking_correlation(top10_df)


In [9]:
import pandas as pd
import plotly.express as px

# Plot the top-10 subset (top10_df, with 'Year', 'Country' and 'Rank' columns).
# combined_df holds every country from every file, so plotting it would draw
# one line per country and contradict the "Top 10" title.

fig = px.line(top10_df,
              x='Year',
              y='Rank',
              color='Country',
              markers=True,
              title='Top 10 Happiest Countries Over Time')

fig.update_yaxes(autorange='reversed')  # To have rank 1 at the top
fig.show()
