In [1]:
import pandas as pd
import pycountry
import numpy as np
import csv

In [2]:
# Locations of the raw input CSV files, given relative to the notebook.
# Historical happiness report (several years per country).
PATH_HAPPY = "../Data/world-happiness-report.csv"
# 2021 edition of the happiness report.
PATH_HAPPY_2021 = "../Data/world-happiness-report-2021.csv"
# Alcohol consumption data.
PATH_ALCOHOL = "../Data/alcohol-consumption.csv"

# Data Processing

In [3]:
# Load the 2021 edition of the report and preview its first rows.
df_happy_2021 = pd.read_csv(PATH_HAPPY_2021)
df_happy_2021.head()

Unnamed: 0,Country name,Regional indicator,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
0,Finland,Western Europe,7.842,0.032,7.904,7.78,10.775,0.954,72.0,0.949,-0.098,0.186,2.43,1.446,1.106,0.741,0.691,0.124,0.481,3.253
1,Denmark,Western Europe,7.62,0.035,7.687,7.552,10.933,0.954,72.7,0.946,0.03,0.179,2.43,1.502,1.108,0.763,0.686,0.208,0.485,2.868
2,Switzerland,Western Europe,7.571,0.036,7.643,7.5,11.117,0.942,74.4,0.919,0.025,0.292,2.43,1.566,1.079,0.816,0.653,0.204,0.413,2.839
3,Iceland,Western Europe,7.554,0.059,7.67,7.438,10.878,0.983,73.0,0.955,0.16,0.673,2.43,1.482,1.172,0.772,0.698,0.293,0.17,2.967
4,Netherlands,Western Europe,7.464,0.027,7.518,7.41,10.932,0.942,72.4,0.913,0.175,0.338,2.43,1.501,1.079,0.753,0.647,0.302,0.384,2.798


In [4]:
# Load the historical report and display it.
df_happy = pd.read_csv(PATH_HAPPY)
df_happy

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.724,7.370,0.451,50.80,0.718,0.168,0.882,0.518,0.258
1,Afghanistan,2009,4.402,7.540,0.552,51.20,0.679,0.190,0.850,0.584,0.237
2,Afghanistan,2010,4.758,7.647,0.539,51.60,0.600,0.121,0.707,0.618,0.275
3,Afghanistan,2011,3.832,7.620,0.521,51.92,0.496,0.162,0.731,0.611,0.267
4,Afghanistan,2012,3.783,7.705,0.521,52.24,0.531,0.236,0.776,0.710,0.268
...,...,...,...,...,...,...,...,...,...,...,...
1944,Zimbabwe,2016,3.735,7.984,0.768,54.40,0.733,-0.095,0.724,0.738,0.209
1945,Zimbabwe,2017,3.638,8.016,0.754,55.00,0.753,-0.098,0.751,0.806,0.224
1946,Zimbabwe,2018,3.616,8.049,0.775,55.60,0.763,-0.068,0.844,0.710,0.212
1947,Zimbabwe,2019,2.694,7.950,0.759,56.20,0.632,-0.064,0.831,0.716,0.235


### Happiness data concatenation

Let's take a look at the range of the historical data.

In [None]:
# Summary statistics of the "year" column; its min and max give the time span.
df_happy["year"].describe()

Historical data goes from 2005 to 2020. 

We will first harmonize the column names of the 2021 set, in order to be able to merge it easily with the historical data.

In [None]:
# Tag every row of the 2021 report with its year, so that it lines up with
# the "year" column of the historical data.
df_happy_2021 = df_happy_2021.assign(year=2021)

In [None]:
# Mapping from the 2021 column labels to the labels used by the historical data.
rename_columns = {
    "Ladder score": "Life Ladder",
    "Logged GDP per capita": "Log GDP per capita",
    "Healthy life expectancy": "Healthy life expectancy at birth",
}

In [None]:
# Apply the label mapping to the column axis; columns without an entry in
# the mapping keep their label.
df_happy_2021 = df_happy_2021.rename(rename_columns, axis="columns")

Let's now concatenate the two sets. For consistency, we keep only the columns that are present in both the historical set and the 2021 set. Note that on the 2021 side this mostly drops columns that were computed directly from the others; on the historical side, `Positive affect` and `Negative affect` are dropped as well, because the 2021 set does not contain them.

In [None]:
# Stack the historical rows and the 2021 rows into a single frame.
# join="inner" keeps only the columns the two frames have in common, and
# ignore_index=True gives the result a fresh 0..n-1 row index.
df = pd.concat([df_happy, df_happy_2021], join="inner", ignore_index=True)

In [None]:
# Display the concatenated frame.
df

### Adding country codes 

To ensure consistency and completeness between both our datasets and external datasets further used for visualization,
we will add country codes following the ISO standards and rename the countries according to those same ISO standards.

In [None]:
# Country names found in the data that are not the standard ones, mapped to
# their standard spelling (kept in alphabetical order of the original name).
names_conversion = {
    "Bolivia": "Bolivia, Plurinational State of",
    "Congo (Brazzaville)": "Congo",
    "Congo (Kinshasa)": "Congo, The Democratic Republic of the",
    "Czech Republic": "Czechia",
    "Hong Kong S.A.R. of China": "Hong Kong",
    "Iran": "Iran, Islamic Republic of",
    "Ivory Coast": "Côte d'Ivoire",
    "Laos": "Lao People's Democratic Republic",
    "Moldova": "Moldova, Republic of",
    "Palestinian Territories": "Palestine, State of",
    "Russia": "Russian Federation",
    "South Korea": "Korea, Republic of",
    "Swaziland": "Eswatini",
    "Syria": "Syrian Arab Republic",
    "Taiwan Province of China": "Taiwan, Province of China",
    "Tanzania": "Tanzania, United Republic of",
    "Venezuela": "Venezuela, Bolivarian Republic of",
    "Vietnam": "Viet Nam",
}

In [None]:
# Swap each non-standard country name for its entry in names_conversion;
# names that are not keys of the mapping are left untouched.
df["Country name"] = df["Country name"].replace(to_replace=names_conversion)

We will use the pycountry library and retrieve the iso_2 and iso_3 codes for country names. 

In [None]:
# Lookup tables from the country name known to pycountry to its
# two-letter and three-letter codes.
countries_2 = {country.name: country.alpha_2 for country in pycountry.countries}
countries_3 = {country.name: country.alpha_3 for country in pycountry.countries}

# Resolve the name of every row. Names pycountry does not know receive the
# sentinel 'Unknown code', so they can be listed and patched afterwards.
input_countries = df["Country name"].tolist()
codes_2 = [countries_2.get(name, 'Unknown code') for name in input_countries]
codes_3 = [countries_3.get(name, 'Unknown code') for name in input_countries]

df["iso_2"] = codes_2
df["iso_3"] = codes_3

In [None]:
# List the rows whose country name could not be resolved to a code.
df[df['iso_3'] == "Unknown code"]

Kosovo and North Cyprus do not enjoy full international recognition and do not have official country codes. 
However, for completeness, we will manually enter their temporary country codes.
Regarding the Somaliland region, since most of its data is NaN, we will simply exclude it from our dataset. 

In [None]:
# Some ISO codes are not defined yet, so the temporary ones are entered by
# hand: country name -> (iso_2, iso_3).
# NOTE(review): the iso_2 value used for North Cyprus is the three-letter
# "CTR" — confirm that consumers of iso_2 accept it.
manual_codes = {
    "Kosovo": ("XK", "XKX"),
    "North Cyprus": ("CTR", "CTR"),
}
for country_name, (alpha_2, alpha_3) in manual_codes.items():
    is_country = df["Country name"] == country_name
    df.loc[is_country, "iso_2"] = alpha_2
    df.loc[is_country, "iso_3"] = alpha_3

In [None]:
# Discard the rows of the Somaliland region; every other row is kept.
is_somaliland = df["Country name"].eq("Somaliland region")
df = df.loc[~is_somaliland]

In [None]:
# Same check as before the manual fixes: rows still carrying the
# 'Unknown code' sentinel would show up here.
df[df['iso_3'] == "Unknown code"]

All our country codes now appear to be filled in. This will greatly facilitate 
join operations with other datasets. 

In [None]:
# Preview the frame, which now carries the iso_2 and iso_3 columns.
df.head()

Let's take a look at the completeness of this dataset. 

In [None]:
# Number of rows (i.e. of years with data) available for each country.
df.groupby("Country name")["Country name"].count()

We clearly see that some years are missing for some countries. Let's solve that by adding rows of NaN values for the missing years. 

In [None]:
# Index the rows by the pair (iso_3, year).
df = df.set_index(["iso_3", "year"])
df

In [None]:
# Build every (iso_3, year) combination from the distinct values of the two
# index levels, then reindex on it: combinations that had no row become
# rows filled with NaN.
all_pairs = pd.MultiIndex.from_product(df.index.levels)
df = df.reindex(index=all_pairs)

### Including alcohol consumption data

In [None]:
# Load the alcohol consumption data.
df_alcohol = pd.read_csv(PATH_ALCOHOL)

In [None]:
# Display the raw alcohol consumption frame.
df_alcohol

In [None]:
# Summary statistics of the "Year" column; its min and max give the time span.
df_alcohol["Year"].describe()

Regarding alcohol consumption, our data ranges from 2005 to 2018. However, a quick look shows that the data is relatively incomplete (Afghanistan, for example, only contains 2010, 2015 and 2018 data). Once our data is merged with the happiness dataframe, we will have NaN values for many years. However, we should be able to easily interpolate them using the built-in `df.interpolate()` function. 

In [None]:
# Rename the key columns to the labels used by the happiness frame, shorten
# the label of the consumption column, then index the rows by those keys.
# The level order here is (year, iso_3); the join below matches the levels
# by name.
alcohol_renames = {
    'Year': 'year',
    'Code': 'iso_3',
    "Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)": "Alcohol consumption",
}
df_alcohol = df_alcohol.rename(columns=alcohol_renames).set_index(['year', 'iso_3'])

In [None]:
# Display the alcohol frame after renaming and indexing.
df_alcohol

In [None]:
# Left join on the iso_3 and year keys: every happiness row is kept, and the
# alcohol columns are attached wherever both keys match (NaN otherwise).
df = pd.merge(left=df, right=df_alcohol, how="left", on=["iso_3", "year"])
df

### Interpolate

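We will interpolate all the possible values linearly, country by country, and fill the remaining gaps with the closest available values. Note that for countries where only NaN values are available for a given column, the values remain NaN, since there is nothing to interpolate from. 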

In [None]:
def _fill_country_gaps(group):
    """Fill the missing values within the rows of a single country.

    Numeric columns are interpolated linearly in both directions, so gaps
    before the first and after the last observation are filled as well.
    Whatever is still missing afterwards (for instance text columns on rows
    created by the reindexing) is forward-filled, then backward-filled.
    A column holding no value at all for the country stays NaN.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one country.

    Returns
    -------
    pandas.DataFrame
        A filled copy of ``group`` with the same index and columns.
    """
    filled = group.copy()
    numeric_cols = filled.select_dtypes(include="number").columns
    if len(numeric_cols) > 0:
        # Interpolate the numeric columns only: interpolating object columns
        # is deprecated in recent pandas releases.
        filled[numeric_cols] = filled[numeric_cols].interpolate(
            method="linear", limit_direction="both")
    # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
    return filled.ffill().bfill()


# group_keys=False keeps the existing row index unchanged; recent pandas
# versions would otherwise prepend the group key as an extra index level,
# which breaks code relying on the position of the index levels.
df = df.groupby("iso_3", group_keys=False).apply(_fill_country_gaps)

### Outliers

In order to identify outliers, we will use an "outlier rate", defined as the ratio `Life Ladder/Log GDP per capita`. The higher the ratio, the happier the country compared to what would be expected from its inhabitants' financial resources alone. 

In [None]:
# Row-wise ratio of the happiness score to the logged GDP per capita.
ladder = df["Life Ladder"]
log_gdp = df["Log GDP per capita"]
df["Happiness/GDP cap."] = ladder.div(log_gdp)
df

As a quick view, our main outliers are shown below. 

In [None]:
# The five rows with the highest ratio, largest first.
df.sort_values(by="Happiness/GDP cap.", ascending=False).head(5)

# Export 

### General data

Now we will export those data year by year in a csv format. The goal is to be easily 
reusable without further computation in our website.

In [None]:
# Output directory for the processed files. The trailing slash matters:
# the export cells build file paths by plain string concatenation.
export_path = "../Data/Processed/"

In [None]:
# NOTE(review): this per-year export is commented out, so this cell writes
# no data_<year>.csv file — confirm whether that is intentional.
# for each year, we put all the information in a different csv file 
# for year in df.index.levels[1]:
    # df[np.in1d(df.index.get_level_values(1), [year])].to_csv(export_path+"data_"+str(year)+".csv") 

### Top 10 data

For the purpose of the first graph (top 10 happiest countries and their evolution over time), we generate a subset of the previous dataframe for every year, including only the top 10 happiest countries and their Life Ladder. 
This will be useful for ensuring quick loading of the data in the website.

In [None]:
# Capture the distinct values of the second index level while the index
# still exists; reset_index() below turns the index levels into ordinary
# columns, so this must run first.
years = df.index.levels[1] # contains all years, taken before resetting the index
df = df.reset_index() 

In [None]:
# Names of every country that made the top 10 in at least one year.
top_countries_ever = set()
for year in years:
    # Rows of this year, happiest first, cut down to the ten best.
    ranked = df[df["year"] == year].sort_values("Life Ladder", ascending = False)
    top_10 = ranked.iloc[:10][["iso_2", "Country name", "Life Ladder"]]
    top_countries_ever.update(top_10["Country name"].tolist())
    # One file per year, without the row index.
    top_10.to_csv(export_path+"top_10_"+str(year)+".csv", index = False)

We also export a list of all country names present for at least one year in the top 10 happiest countries.

In [None]:
# Write the collected country names in sorted order, one per line, with
# neither a header row nor an index column.
names_frame = pd.DataFrame(list(top_countries_ever)).sort_values(0)
names_frame.to_csv(export_path+"countries_in_top"+".csv", index = False, header = False)