In [29]:
pip install pandas plotly numpy scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [31]:
pip install nbformat statsmodels

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [32]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import nbformat


## Import Data

In [33]:
# Read one CSV per survey year from the local data/ folder, in chronological order.
(df_2015, df_2016, df_2017, df_2018, df_2019) = [
    pd.read_csv(f"data/{year}.csv") for year in (2015, 2016, 2017, 2018, 2019)
]

## Preprocess Data

In [34]:
## Lowercase all country names
# 2015-2017 store the name under 'Country'; 2018-2019 use 'Country or region'.
for frame, name_column in (
    (df_2015, 'Country'),
    (df_2016, 'Country'),
    (df_2017, 'Country'),
    (df_2018, 'Country or region'),
    (df_2019, 'Country or region'),
):
    frame[name_column] = frame[name_column].str.lower()

In [35]:
# Discard every 2018 row that has a missing value in any column (mutates df_2018 in place).
df_2018.dropna(axis=0, how='any', inplace=True)

In [36]:
# Flatten the country names of all five yearly frames into one list (2015 first, 2019 last).
li = [
    name
    for frame, name_column in (
        (df_2015, 'Country'),
        (df_2016, 'Country'),
        (df_2017, 'Country'),
        (df_2018, 'Country or region'),
        (df_2019, 'Country or region'),
    )
    for name in frame[name_column]
]

# countries maps each name to the number of times it occurs across the five frames.
countries = {}
for name in li:
    countries[name] = countries.get(name, 0) + 1

# A name seen exactly five times is kept; anything seen fewer times is a reject.
country_list = [name for name, seen in countries.items() if seen == 5]
rejects = [name for name, seen in countries.items() if seen < 5]
print(len(country_list))


140


In [37]:
## Print the Rejects
# Sort in place, then list each rejected name with the number of times it was seen.
rejects.sort()
for name in rejects:
    label = name.capitalize()
    occurrences = countries[name]
    print(f"{label} ({occurrences})")

Angola (4)
Belize (3)
Central african republic (4)
Comoros (3)
Djibouti (1)
Gambia (1)
Hong kong (4)
Hong kong s.a.r., china (1)
Laos (4)
Lesotho (4)
Macedonia (4)
Mozambique (4)
Namibia (4)
North cyprus (3)
North macedonia (1)
Northern cyprus (2)
Oman (1)
Puerto rico (1)
Somalia (4)
Somaliland region (2)
South sudan (4)
Sudan (4)
Suriname (2)
Swaziland (2)
Taiwan (4)
Taiwan province of china (1)
Trinidad & tobago (2)
Trinidad and tobago (3)
United arab emirates (4)


In [38]:
## Remove countries that are not present in all years
# Per frame: (columns to discard first, name of the column holding the country).
# Both drops mutate the frame in place, in the same order as before:
# columns first, then the rows whose country is in `rejects`.
for frame, extra_columns, name_column in (
    (df_2015, ['Standard Error', 'Dystopia Residual', 'Region'], 'Country'),
    (df_2016, ['Lower Confidence Interval', 'Upper Confidence Interval', 'Dystopia Residual', 'Region'], 'Country'),
    (df_2017, ['Whisker.high', 'Whisker.low', 'Dystopia.Residual'], 'Country'),
    (df_2018, [], 'Country or region'),
    (df_2019, [], 'Country or region'),
):
    if extra_columns:
        frame.drop(columns=extra_columns, inplace=True)
    rejected_rows = frame.index[frame[name_column].isin(rejects)]
    frame.drop(index=rejected_rows, inplace=True)

In [39]:
## ReArrange Columns
# Each selection puts the year's own column names into one shared order:
# country, rank, score, GDP, family/social support, health, freedom, trust, generosity.
# .copy() makes every result an independent frame; a later cell adds a
# 'Z_score_GDP' column to these frames, and writing to a bare column selection
# can raise pandas' SettingWithCopyWarning.
df_2015 = df_2015[['Country', 'Happiness Rank', 'Happiness Score', 'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity']].copy()
df_2016 = df_2016[['Country', 'Happiness Rank', 'Happiness Score', 'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity']].copy()
df_2017 = df_2017[['Country', 'Happiness.Rank', 'Happiness.Score', 'Economy..GDP.per.Capita.', 'Family', 'Health..Life.Expectancy.', 'Freedom', 'Trust..Government.Corruption.', 'Generosity']].copy()
df_2018 = df_2018[['Country or region', 'Overall rank', 'Score', 'GDP per capita', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Perceptions of corruption', 'Generosity']].copy()
df_2019 = df_2019[['Country or region', 'Overall rank', 'Score', 'GDP per capita', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Perceptions of corruption', 'Generosity']].copy()


## Rename Columns
# After the reordering above all five frames share one positional layout,
# so the same list of names applies to each of them.
for df in [df_2015, df_2016, df_2017, df_2018, df_2019]:
    df.columns = ['Country', 'Rank', 'Score', 'GDPPC', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity']

## Reset Index
# Rows were dropped earlier, so renumber them 0..n-1 and discard the old labels.
for df in [df_2015, df_2016, df_2017, df_2018, df_2019]:
    df.reset_index(drop=True, inplace=True)

In [40]:
df_2015.head()

Unnamed: 0,Country,Rank,Score,GDPPC,Family,Health,Freedom,Trust,Generosity
0,switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678
1,iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363
2,denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139
3,norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699
4,canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811


In [41]:
df_2016.head()

Unnamed: 0,Country,Rank,Score,GDPPC,Family,Health,Freedom,Trust,Generosity
0,denmark,1,7.526,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171
1,switzerland,2,7.509,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083
2,iceland,3,7.501,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678
3,norway,4,7.498,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895
4,finland,5,7.413,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492


In [42]:
df_2017.head()

Unnamed: 0,Country,Rank,Score,GDPPC,Family,Health,Freedom,Trust,Generosity
0,norway,1,7.537,1.616463,1.533524,0.796667,0.635423,0.315964,0.362012
1,denmark,2,7.522,1.482383,1.551122,0.792566,0.626007,0.40077,0.35528
2,iceland,3,7.504,1.480633,1.610574,0.833552,0.627163,0.153527,0.47554
3,switzerland,4,7.494,1.56498,1.516912,0.858131,0.620071,0.367007,0.290549
4,finland,5,7.469,1.443572,1.540247,0.809158,0.617951,0.382612,0.245483


In [43]:
df_2018

Unnamed: 0,Country,Rank,Score,GDPPC,Family,Health,Freedom,Trust,Generosity
0,finland,1,7.632,1.305,1.592,0.874,0.681,0.393,0.202
1,norway,2,7.594,1.456,1.582,0.861,0.686,0.340,0.286
2,denmark,3,7.555,1.351,1.590,0.868,0.683,0.408,0.284
3,iceland,4,7.495,1.343,1.644,0.914,0.677,0.138,0.353
4,switzerland,5,7.487,1.420,1.549,0.927,0.660,0.357,0.256
...,...,...,...,...,...,...,...,...,...
135,syria,150,3.462,0.689,0.382,0.539,0.088,0.144,0.376
136,rwanda,151,3.408,0.332,0.896,0.400,0.636,0.444,0.200
137,yemen,152,3.355,0.442,1.073,0.343,0.244,0.064,0.083
138,tanzania,153,3.303,0.455,0.991,0.381,0.481,0.097,0.270


In [44]:
df_2019

Unnamed: 0,Country,Rank,Score,GDPPC,Family,Health,Freedom,Trust,Generosity
0,finland,1,7.769,1.340,1.587,0.986,0.596,0.393,0.153
1,denmark,2,7.600,1.383,1.573,0.996,0.592,0.410,0.252
2,norway,3,7.554,1.488,1.582,1.028,0.603,0.341,0.271
3,iceland,4,7.494,1.380,1.624,1.026,0.591,0.118,0.354
4,netherlands,5,7.488,1.396,1.522,0.999,0.557,0.298,0.322
...,...,...,...,...,...,...,...,...,...
135,malawi,150,3.410,0.191,0.560,0.495,0.443,0.089,0.218
136,yemen,151,3.380,0.287,1.163,0.463,0.143,0.077,0.108
137,rwanda,152,3.334,0.359,0.711,0.614,0.555,0.411,0.217
138,tanzania,153,3.231,0.476,0.885,0.499,0.417,0.147,0.276


In [45]:
# One marker trace per year: country on the x axis, that same year's score on the y axis.
# Country and Score must come from the SAME frame: the row order differs between
# years, so pairing one year's countries with another year's scores would put
# every point on the wrong country.
fig = go.Figure()
for year, frame in ((2015, df_2015), (2016, df_2016), (2017, df_2017), (2018, df_2018), (2019, df_2019)):
    fig.add_trace(go.Scatter(x=list(frame.Country), y=list(frame["Score"]), mode='markers', name=str(year)))
# Last expression of the cell: update_layout returns the figure, which the notebook then displays.
fig.update_layout(title='Happiness Score by Country', xaxis_title='Country', yaxis_title='Happiness Score', width=1400, height=800)

In [46]:
# Year -> cleaned frame lookup
dfs = dict(zip(
    (2015, 2016, 2017, 2018, 2019),
    (df_2015, df_2016, df_2017, df_2018, df_2019),
))

# Stack all years into one long frame, tagging every row with its year
yearly_frames = [frame.assign(Year=year) for year, frame in dfs.items()]
df_new = pd.concat(yearly_frames)

# Use one colour range for every animation frame so colours are comparable across years
score_bounds = [df_new.Score.min(), df_new.Score.max()]

# Animated choropleth: one frame per year, countries matched by name
fig = px.choropleth(
    df_new,
    locations="Country",
    locationmode='country names',
    color="Score",
    hover_name="Country",
    animation_frame="Year",
    color_continuous_scale=["red", "yellow", "green"],
    title='Happiness Score by Country',
    range_color=score_bounds,
)
fig.update_layout(
    width=1400,
    height=700,
    updatemenus=[],
    geo=dict(showocean=True, oceancolor="lightblue"),
)
fig.update_geos(
    projection_type="natural earth",
    showcountries=True,
    showcoastlines=True,
    showland=True,
    landcolor="lightgray",
)
fig.show()

In [47]:
## Correlation Matrix
def getConvoMatrix(df):
    """Return the Pearson correlation matrix of the columns of ``df``.

    Despite the name, this computes correlations (not a convolution), using

        r = (n*sum(xy) - sum(x)*sum(y))
            / sqrt((n*sum(x^2) - sum(x)^2) * (n*sum(y^2) - sum(y)^2))

    for every pair of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns can all be converted to float.

    Returns
    -------
    list[list[float]]
        Square matrix with one row and one column per column of ``df``;
        entry ``[i][j]`` is the correlation between column ``i`` and column
        ``j``. Entries are NaN where the denominator is zero (a constant
        column, or a frame with no rows).
    """
    # All sums are computed once on the underlying array instead of re-reading
    # every cell through .iloc for every (i, j) pair.
    values = df.to_numpy(dtype=float)
    n_rows = values.shape[0]

    col_sums = values.sum(axis=0)    # sum(x) for every column
    cross_sums = values.T @ values   # sum(x*y) for every column pair

    # n*sum(xy) - sum(x)*sum(y) for every pair
    numerator = n_rows * cross_sums - col_sums[:, None] * col_sums[None, :]

    # The diagonal of the numerator is n*sum(x^2) - sum(x)^2 for each column,
    # which is exactly the per-column factor needed in the denominator.
    spread = numerator.diagonal()
    denominator = (spread[:, None] * spread[None, :]) ** 0.5

    return (numerator / denominator).tolist()

In [48]:
# Numeric columns to correlate; the same labels are used for rows and columns.
cols = ['Score', 'GDPPC', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity']

# One labelled correlation frame per year, in chronological order.
df_range = [
    pd.DataFrame(getConvoMatrix(frame[cols]), index=cols, columns=cols)
    for frame in (df_2015, df_2016, df_2017, df_2018, df_2019)
]

In [49]:
fig = make_subplots(rows=2, cols=3, subplot_titles=('2015', '2016', '2017', '2018', '2019'), vertical_spacing=0.1)

# Fill the 2x3 grid left to right, top to bottom: one heatmap per yearly matrix.
for position, corr in enumerate(df_range):
    grid_row, grid_col = divmod(position, 3)
    heatmap = go.Heatmap(
        z=corr,
        x=corr.columns,
        y=corr.columns,
        hoverongaps = False,
        zmin=-0.2,
        zmax=1,
        colorscale='Reds',
    )
    fig.add_trace(heatmap, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(title='Correlation Matrices', height=1000, title_x=0.5)
fig.show()

## Visualize GDP vs Happiness Score

In [50]:
# Keep only the top-left 2x2 corner of every yearly matrix (its first two rows and columns).
# Note: this rebinds df_range, so the full matrices are no longer available under that name.
df_range = [matrix.iloc[:2, :2] for matrix in df_range]

fig = make_subplots(rows=2, cols=3, subplot_titles=('2015', '2016', '2017', '2018', '2019'), vertical_spacing=0.1)

# Fill the 2x3 grid left to right, top to bottom: one heatmap per yearly matrix.
for position, corr in enumerate(df_range):
    grid_row, grid_col = divmod(position, 3)
    heatmap = go.Heatmap(
        z=corr,
        x=corr.columns,
        y=corr.columns,
        hoverongaps = False,
        zmin=0.75,
        zmax=0.85,
        colorscale='Greens',
    )
    fig.add_trace(heatmap, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(title='Correlation Matrices', height=1000, title_x=0.5)
fig.show()

In [51]:
fig = make_subplots(rows=5, cols=1, subplot_titles=('2015', '2016', '2017', '2018', '2019'), shared_xaxes=False, shared_yaxes=True, vertical_spacing=0.05)

# One stacked panel per year: GDPPC of the first 100 rows of that year's frame,
# with the capitalised country name on the x axis.
yearly = (('2015', df_2015), ('2016', df_2016), ('2017', df_2017), ('2018', df_2018), ('2019', df_2019))
for panel, (label, frame) in enumerate(yearly, start=1):
    names = list(frame.Country.iloc[:100].str.capitalize())
    gdp = list(frame.GDPPC.iloc[:100])
    fig.add_trace(go.Scatter(x=names, y=gdp, mode='lines+markers', name=label), row=panel, col=1)

fig.update_layout(title='GDPPC by Country for Top 100 Countries Each Year', height=1000, title_x=0.5)
fig.update_yaxes(title_text='GDPPC', title_standoff=15)
# Tick labels are hidden on every x axis.
fig.update_xaxes(showticklabels=False)
fig.show()

In [52]:
fig = make_subplots(rows=5, cols=1, subplot_titles=('2015', '2016', '2017', '2018', '2019'), shared_xaxes=False, shared_yaxes=True, vertical_spacing=0.03)

# One stacked panel per year: GDPPC against Score for the first 100 rows of that
# year's frame; the country name is attached as the point text.
yearly = (('2015', df_2015), ('2016', df_2016), ('2017', df_2017), ('2018', df_2018), ('2019', df_2019))
for panel, (label, frame) in enumerate(yearly, start=1):
    scores = list(frame.Score.iloc[:100])
    gdp = list(frame.GDPPC.iloc[:100])
    names = list(frame.Country.iloc[:100])
    fig.add_trace(go.Scatter(x=scores, y=gdp, mode='lines+markers', name=label, text=names), row=panel, col=1)

fig.update_layout(title='GDPPC by Score for Top 100 Countries Each Year', height=2000, title_x=0.5)
fig.update_yaxes(title_text='GDPPC', title_standoff=25)
fig.show()

In [53]:
def zscore(df):
    """Standardise a one-dimensional collection of numbers.

    Each value is replaced by ``(value - mean) / sigma`` where ``sigma`` is
    the sample standard deviation (divisor ``count - 1``).

    Parameters
    ----------
    df : pandas.Series or other one-dimensional sequence of numbers
        The values to standardise. (The parameter name is historical; a
        single column is expected, not a whole frame.)

    Returns
    -------
    pandas.DataFrame
        A single column labelled ``0`` holding the z-scores. When ``df`` is a
        Series its index is preserved, so the result aligns with the frame
        the column came from; other inputs get a default 0..n-1 index.
        The scores are NaN when fewer than two values are given or when all
        values are equal (sigma is undefined or zero). Missing values are
        skipped when computing the mean and sigma and stay NaN in the result.
    """
    # pd.Series keeps the index of a Series input; lists get a default index.
    values = pd.Series(df, dtype=float)

    mean = values.mean()
    sigma = values.std(ddof=1)  # sample standard deviation, divisor (count - 1)

    z_scores = (values - mean) / sigma
    return z_scores.to_frame(name=0)

In [54]:
# Visualize outliers

## Threshold for outliers
threshold = 2

# Attach the GDPPC z-score to every yearly frame (adds/overwrites the 'Z_score_GDP' column).
df_list = [df_2015, df_2016, df_2017, df_2018, df_2019]
for df in df_list:
    df['Z_score_GDP'] = zscore(df['GDPPC'])

# Rows whose z-score lies more than `threshold` away from zero in either direction.
outliers_list = [frame[frame['Z_score_GDP'].abs() > threshold] for frame in df_list]

fig = make_subplots(rows=2, cols=3, subplot_titles=('2015', '2016', '2017', '2018', '2019'), vertical_spacing=0.1)

# Fill the 2x3 grid left to right, top to bottom; marker size grows with |z-score|.
for position, outliers in enumerate(outliers_list):
    grid_row, grid_col = divmod(position, 3)
    marker_sizes = outliers['Z_score_GDP'].abs()*10
    points = go.Scatter(
        x=outliers.GDPPC,
        y=outliers.Score,
        text=outliers.Country,
        mode='markers',
        marker=dict(size=marker_sizes),
    )
    fig.add_trace(points, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(title=f'Outliers (Z-score Threshold {threshold})', height=1000, title_x=0.5)
fig.show()

In [55]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# For every year: fit Score ~ GDPPC on an 80/20 train/test split, score the
# model on the held-out rows and plot those rows together with the model's line.
for year, df in zip(range(2015, 2020), [df_2015, df_2016, df_2017, df_2018, df_2019]):
    X = df[['GDPPC']]
    y = df['Score']
    # Fixed random_state keeps the split (and therefore the metrics) reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    regression_model = LinearRegression()
    regression_model.fit(X_train, y_train)
    y_pred = regression_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    fig = px.scatter(x=X_test.squeeze(), y=y_test, title=f"GDP per capita vs Happiness Score {year}\t MSE: {mse:.2f}, R-squared: {r2:.2f}")
    # Draw the line of the model whose MSE / R-squared appear in the title.
    # (A Plotly trendline="ols" would instead refit a separate regression on
    # the test points, so the plotted line would not match the reported metrics.)
    x_test_values = X_test['GDPPC'].to_numpy()
    order = x_test_values.argsort()  # left-to-right so the line is drawn without zig-zags
    fig.add_trace(go.Scatter(x=x_test_values[order], y=y_pred[order], mode='lines', name='LinearRegression fit'))
    fig.show()