## Data Wrangling
The goal is to unify all yearly datasets, keep only the relevant features, and classify the countries by region.

In [80]:
import pandas as pd 
import pycountry

### Read Data
Dataset: World Happiness 2015 to 2022

Collected from [Kaggle](https://www.kaggle.com/datasets/mathurinache/world-happiness-report?select=2022.csv)

Features of interest: real GDP per capita, social support, healthy life expectancy, freedom to make life choices, generosity, perceptions of corruption

In [81]:
# Load the yearly reports 2015-2021, which all use the default CSV settings
df_2015, df_2016, df_2017, df_2018, df_2019, df_2020, df_2021 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw/HappinessScore2015.csv',
        '../data/raw/HappinessScore2016.csv',
        '../data/raw/HappinessScore2017.csv',
        '../data/raw/HappinessScore2018.csv',
        '../data/raw/HappinessScore2019.csv',
        '../data/raw/HappinessScore2020.csv',
        '../data/raw/HappinessScore2021.csv',
    )
)
# The 2022 file writes decimals with a comma, so it needs its own call
df_2022 = pd.read_csv('../data/raw/HappinessScore2022.csv', decimal=",")

### Wrangle Data

##### Explore Data

In [82]:
# Explore Data: print the year, the number of rows and the sorted column names of every report
yearly_reports = {
    2015: df_2015,
    2016: df_2016,
    2017: df_2017,
    2018: df_2018,
    2019: df_2019,
    2020: df_2020,
    2021: df_2021,
    2022: df_2022,
}
for report_year, report in yearly_reports.items():
    print(report_year, len(report), sorted(report.columns))

2015 158 ['Country', 'Dystopia Residual', 'Economy (GDP per Capita)', 'Family', 'Freedom', 'Generosity', 'Happiness Rank', 'Happiness Score', 'Health (Life Expectancy)', 'Region', 'Standard Error', 'Trust (Government Corruption)']
2016 157 ['Country', 'Dystopia Residual', 'Economy (GDP per Capita)', 'Family', 'Freedom', 'Generosity', 'Happiness Rank', 'Happiness Score', 'Health (Life Expectancy)', 'Lower Confidence Interval', 'Region', 'Trust (Government Corruption)', 'Upper Confidence Interval']
2017 155 ['Country', 'Dystopia.Residual', 'Economy..GDP.per.Capita.', 'Family', 'Freedom', 'Generosity', 'Happiness.Rank', 'Happiness.Score', 'Health..Life.Expectancy.', 'Trust..Government.Corruption.', 'Whisker.high', 'Whisker.low']
2018 156 ['Country or region', 'Freedom to make life choices', 'GDP per capita', 'Generosity', 'Healthy life expectancy', 'Overall rank', 'Perceptions of corruption', 'Score', 'Social support']
2019 156 ['Country or region', 'Freedom to make life choices', 'GDP pe

In [83]:
# Peek at the first three rows of the 2015 report to get an overall picture of the data
df_2015.head(n=3)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204


In [84]:
# First three rows of the 2016 report
df_2016.head(n=3)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Lower Confidence Interval,Upper Confidence Interval,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Denmark,Western Europe,1,7.526,7.46,7.592,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939
1,Switzerland,Western Europe,2,7.509,7.428,7.59,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463
2,Iceland,Western Europe,3,7.501,7.333,7.669,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137


In [85]:
# First three rows of the 2020 report
df_2020.head(n=3)

Unnamed: 0,Country name,Regional indicator,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
0,Finland,Western Europe,7.8087,0.031156,7.869766,7.747634,10.639267,0.95433,71.900825,0.949172,-0.059482,0.195445,1.972317,1.28519,1.499526,0.961271,0.662317,0.15967,0.477857,2.762835
1,Denmark,Western Europe,7.6456,0.033492,7.711245,7.579955,10.774001,0.955991,72.402504,0.951444,0.066202,0.168489,1.972317,1.326949,1.503449,0.979333,0.66504,0.242793,0.49526,2.432741
2,Switzerland,Western Europe,7.5599,0.035014,7.628528,7.491272,10.979933,0.942847,74.102448,0.921337,0.105911,0.303728,1.972317,1.390774,1.472403,1.040533,0.628954,0.269056,0.407946,2.350267


In [86]:
# First three rows of the 2021 report
df_2021.head(n=3)

Unnamed: 0,Country name,Regional indicator,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
0,Finland,Western Europe,7.842,0.032,7.904,7.78,10.775,0.954,72.0,0.949,-0.098,0.186,2.43,1.446,1.106,0.741,0.691,0.124,0.481,3.253
1,Denmark,Western Europe,7.62,0.035,7.687,7.552,10.933,0.954,72.7,0.946,0.03,0.179,2.43,1.502,1.108,0.763,0.686,0.208,0.485,2.868
2,Switzerland,Western Europe,7.571,0.036,7.643,7.5,11.117,0.942,74.4,0.919,0.025,0.292,2.43,1.566,1.079,0.816,0.653,0.204,0.413,2.839


In [87]:
# First three rows of the 2022 report
df_2022.head(n=3)

Unnamed: 0,RANK,Country,Happiness score,Whisker-high,Whisker-low,Dystopia (1.83) + residual,Explained by: GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption
0,1,Finland,7.821,7.886,7.756,2.518,1.892,1.258,0.775,0.736,0.109,0.534
1,2,Denmark,7.636,7.71,7.563,2.226,1.953,1.243,0.777,0.719,0.188,0.532
2,3,Iceland,7.557,7.651,7.464,2.32,1.936,1.32,0.803,0.718,0.27,0.191


In [88]:
# Sample the first ten values of the rank / score / dystopia columns to see
# how 'Overall rank', 'Score', 'Happiness Score' and 'Dystopia Residual' relate
sampled_columns = [
    (df_2017, 'Dystopia.Residual'),
    (df_2017, 'Happiness.Rank'),
    (df_2017, 'Happiness.Score'),
    (df_2018, 'Overall rank'),
    (df_2018, 'Score'),
]
for report, column in sampled_columns:
    print(list(report[column].head(10)))

# For 'Ladder score in Dystopia' only the distinct values are shown
print(list(df_2021['Ladder score in Dystopia'].unique()[:10]))

[2.27702665328979, 2.31370735168457, 2.32271528244019, 2.2767162322998, 2.4301815032959, 2.29480409622192, 2.18726444244385, 2.0464563369751, 2.09753799438477, 2.06521081924438]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[7.53700017929077, 7.52199983596802, 7.50400018692017, 7.49399995803833, 7.4689998626709, 7.3769998550415, 7.31599998474121, 7.31400012969971, 7.28399991989136, 7.28399991989136]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[7.632, 7.594, 7.555, 7.495, 7.487, 7.441, 7.328, 7.324, 7.314, 7.272]
[2.43]


*Remarks*
- Relationship between datasets:
    - It seems that from 2020 the columns 'Explained by...' are the ones equivalent to the previous datasets. 
    - Happiness Score = Score = Ladder Score. 
    - Happiness Rank = Overall Rank.
- The 2020 and 2021 datasets have no 'Happiness Rank' column; it is added manually.
- The 2015 to 2017 datasets have no 'Social support' column; it will be interpolated.
- The 2018 and 2019 datasets have no 'Dystopia Residual' column; it will be interpolated.

##### Organize and Clean Data

In [89]:
# Add rank to 2020 & 2021. These reports have no rank column; the records are
# already sorted by happiness, so the rank is the 1-based row position.
df_2020['Happiness Rank'] = df_2020.index + 1
df_2021['Happiness Rank'] = df_2021.index + 1

# Avoid future repeated columns when renaming: the 'Explained by: ...' columns are
# renamed to these same names in the next step, so the raw versions are dropped here.
unnecesary_columns = ['Generosity', 'Perceptions of corruption', 'Social support', 'Healthy life expectancy']
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise a KeyError.
df_2020 = df_2020.drop(columns=unnecesary_columns, errors='ignore')
df_2021 = df_2021.drop(columns=unnecesary_columns, errors='ignore')

In [90]:
# Bring every yearly report to one shared column vocabulary.
#
# Target names (columns marked * do not exist for every year):
#   Country - Dystopia Residual* - GDP per capita - Freedom - Generosity - Healthy life expectancy
#   - Perceptions of corruption - Social support* - Happiness Rank - Happiness Score
#
# Source naming families:
#   2015/2016: 'Economy (GDP per Capita)', 'Health (Life Expectancy)', 'Trust (Government Corruption)'
#   2017     : same as above, with dots instead of spaces/brackets ('Economy..GDP.per.Capita.', ...)
#   2018/2019: 'Country or region', 'Overall rank', 'Score', 'Freedom to make life choices'
#   2020/2021: 'Country name', 'Ladder score', 'Explained by: ...', 'Dystopia + residual'
#   2022     : 'RANK', 'Happiness score', 'Explained by: ...', 'Dystopia (1.83) + residual'
#
# NOTE(review): the 2015-2017 'Family' column is left untouched; it may be the same
# measure as 'Social support' in later years - confirm before relying on it.

rename_2015_2016 = {
    'Economy (GDP per Capita)': 'GDP per capita',
    'Health (Life Expectancy)': 'Healthy life expectancy',
    'Trust (Government Corruption)': 'Perceptions of corruption',
}

rename_2017 = {
    'Dystopia.Residual': 'Dystopia Residual',
    'Economy..GDP.per.Capita.': 'GDP per capita',
    'Health..Life.Expectancy.': 'Healthy life expectancy',
    'Trust..Government.Corruption.': 'Perceptions of corruption',
    'Happiness.Rank': 'Happiness Rank',
    'Happiness.Score': 'Happiness Score',
}

rename_2018_2019 = {
    'Country or region': 'Country',
    'Freedom to make life choices': 'Freedom',
    'Overall rank': 'Happiness Rank',
    'Score': 'Happiness Score',
}

rename_2020_2021 = {
    'Country name': 'Country',
    'Dystopia + residual': 'Dystopia Residual',
    'Explained by: Log GDP per capita': 'GDP per capita',
    'Explained by: Freedom to make life choices': 'Freedom',
    'Explained by: Generosity': 'Generosity',
    'Explained by: Healthy life expectancy': 'Healthy life expectancy',
    'Explained by: Perceptions of corruption': 'Perceptions of corruption',
    'Explained by: Social support': 'Social support',
    'Ladder score': 'Happiness Score',
}

rename_2022 = {
    'Dystopia (1.83) + residual': 'Dystopia Residual',
    'Explained by: GDP per capita': 'GDP per capita',
    'Explained by: Freedom to make life choices': 'Freedom',
    'Explained by: Generosity': 'Generosity',
    'Explained by: Healthy life expectancy': 'Healthy life expectancy',
    'Explained by: Perceptions of corruption': 'Perceptions of corruption',
    'Explained by: Social support': 'Social support',
    'RANK': 'Happiness Rank',
    'Happiness score': 'Happiness Score',
}

df_2015r = df_2015.rename(columns=rename_2015_2016)
df_2016r = df_2016.rename(columns=rename_2015_2016)
df_2017r = df_2017.rename(columns=rename_2017)
df_2018r = df_2018.rename(columns=rename_2018_2019)
df_2019r = df_2019.rename(columns=rename_2018_2019)
df_2020r = df_2020.rename(columns=rename_2020_2021)
df_2021r = df_2021.rename(columns=rename_2020_2021)
df_2022r = df_2022.rename(columns=rename_2022)

##### Add Features

In [91]:
# Tag every renamed report with the year it belongs to
renamed_reports = (df_2015r, df_2016r, df_2017r, df_2018r, df_2019r, df_2020r, df_2021r, df_2022r)
for report_year, renamed_report in zip(range(2015, 2023), renamed_reports):
    renamed_report['Year'] = report_year

##### Extract Relevant Features 

In [92]:
# Column glossary (original 2015-2017 names):
# Country: Name of the country
# Happiness Rank: Rank of the country based on the Happiness Score
# Happiness Score: A metric measured by asking the sampled people the question: "How would you rate your happiness on a scale of 0 to 10 where 10 is the happiest"
# Whisker High: Upper Confidence Interval of the Happiness Score
# Whisker Low: Lower Confidence Interval of the Happiness Score
# Economy (GDP per Capita): The extent to which GDP contributes to the calculation of the Happiness Score
# Family: The extent to which Family contributes to the calculation of the Happiness Score
# Health (Life Expectancy): The extent to which Life expectancy contributed to the calculation of the Happiness Score
# Freedom: The extent to which Freedom contributed to the calculation of the Happiness Score
# Trust (Government Corruption): The extent to which Perception of Corruption contributes to Happiness Score
# Generosity: The extent to which Generosity contributed to the calculation of the Happiness Score
# Dystopia Residual: The extent to which Dystopia Residual contributed to the calculation of the Happiness Score

# Keep only the column names present in every renamed report.
# Expected: Country - GDP per capita - Freedom - Generosity - Healthy life expectancy
# - Perceptions of corruption - Happiness Rank - Happiness Score - Year
# ('Dystopia Residual' and 'Social support' exist only for some years, so they drop out here.)
shared_columns = set(df_2015r.columns)
for renamed_report in (df_2016r, df_2017r, df_2018r, df_2019r, df_2020r, df_2021r, df_2022r):
    shared_columns = shared_columns & set(renamed_report.columns)
common_features = list(shared_columns)
common_features

['Generosity',
 'Perceptions of corruption',
 'Year',
 'Country',
 'Healthy life expectancy',
 'Freedom',
 'Happiness Score',
 'GDP per capita',
 'Happiness Rank']

In [93]:
# Keep the shared columns plus whichever of the two partially available columns each year has
with_dystopia = common_features + ['Dystopia Residual']                  # 2015-2017
with_social = common_features + ['Social support']                       # 2018-2019
with_both = common_features + ['Social support', 'Dystopia Residual']    # 2020-2022

df_2015f = df_2015r[with_dystopia]
df_2016f = df_2016r[with_dystopia]
df_2017f = df_2017r[with_dystopia]
df_2018f = df_2018r[with_social]
df_2019f = df_2019r[with_social]
df_2020f = df_2020r[with_both]
df_2021f = df_2021r[with_both]
df_2022f = df_2022r[with_both]

#### Final Dataset

In [94]:
# Stack the yearly frames vertically; a column missing in a given year is filled with NaN
filtered_reports = [df_2015f, df_2016f, df_2017f, df_2018f, df_2019f, df_2020f, df_2021f, df_2022f]
stacked = pd.concat(filtered_reports, ignore_index=True, sort=False)

# Fix the column order of the final dataset
column_order = [
    'Year', 'Country', 'Happiness Rank', 'Happiness Score',
    'Freedom', 'Generosity', 'Healthy life expectancy', 'Perceptions of corruption',
    'GDP per capita', 'Social support', 'Dystopia Residual',
]
df = stacked[column_order]
df

Unnamed: 0,Year,Country,Happiness Rank,Happiness Score,Freedom,Generosity,Healthy life expectancy,Perceptions of corruption,GDP per capita,Social support,Dystopia Residual
0,2015,Switzerland,1,7.587,0.66557,0.29678,0.94143,0.41978,1.39651,,2.51738
1,2015,Iceland,2,7.561,0.62877,0.43630,0.94784,0.14145,1.30232,,2.70201
2,2015,Denmark,3,7.527,0.64938,0.34139,0.87464,0.48357,1.32548,,2.49204
3,2015,Norway,4,7.522,0.66973,0.34699,0.88521,0.36503,1.45900,,2.46531
4,2015,Canada,5,7.427,0.63297,0.45811,0.90563,0.32957,1.32629,,2.45176
...,...,...,...,...,...,...,...,...,...,...,...
1226,2022,Rwanda*,143,3.268,0.62100,0.18700,0.46200,0.54400,0.78500,0.133,0.53600
1227,2022,Zimbabwe,144,2.995,0.32900,0.10600,0.27000,0.10500,0.94700,0.690,0.54800
1228,2022,Lebanon,145,2.955,0.10300,0.08200,0.63100,0.03400,1.39200,0.498,0.21600
1229,2022,Afghanistan,146,2.404,0.00000,0.08900,0.28900,0.00500,0.75800,0.000,1.26300


##### Clean Data

Validate Country names

In [95]:
# Validate the country names against the names known to pycountry
world_countries = [entry.name for entry in pycountry.countries]

unrecognized_rows = df[~df.Country.isin(world_countries)]
# Number of unrecognized names per year: shows which datasets are the problem
print(list(unrecognized_rows.Year.value_counts()))
# pycountry misses several valid names, but the remaining set is small enough to review by eye
print(set(unrecognized_rows.Country))

[35, 21, 20, 20, 20, 20, 19, 18]
{'Botswana*', 'Palestinian Territories', 'North Cyprus', 'South Korea', 'Ivory Coast', 'Comoros*', 'Guatemala*', 'North Cyprus*', 'Libya*', 'Rwanda*', 'Lesotho*', 'Madagascar*', 'Bolivia', 'Kosovo', 'Liberia*', 'Mauritania*', 'Somaliland region', 'Russia', 'Luxembourg*', 'Laos', 'Trinidad & Tobago', 'Niger*', 'Congo (Brazzaville)', 'Syria', 'Congo (Kinshasa)', 'Tanzania', 'Belarus*', 'Vietnam', 'Moldova', 'Northern Cyprus', 'Eswatini, Kingdom of*', 'Chad*', 'Gambia*', 'Macedonia', 'Kuwait*', 'Venezuela', 'Somaliland Region', 'Taiwan', 'Hong Kong S.A.R., China', 'Czech Republic', 'Taiwan Province of China', 'Palestinian Territories*', 'Yemen*', 'xx', 'Hong Kong S.A.R. of China', 'Swaziland', 'Turkmenistan*', 'Azerbaijan*', 'Iran'}


In [96]:
# Remove the '*' marker that some reports append to country names.
# regex=False treats '*' as a literal character: it avoids the warning about the
# changing regex default and works on pandas versions where the pattern would
# otherwise be parsed as a (here invalid) regular expression.
df['Country'] = df['Country'].str.replace('*', '', regex=False)

# Names still not recognized by pycountry once the markers are gone
not_recognized = set(df[~df.Country.isin(world_countries)].Country)
not_recognized

  df['Country'] = df['Country'].str.replace('*', '')


{'Bolivia',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Czech Republic',
 'Eswatini, Kingdom of',
 'Hong Kong S.A.R. of China',
 'Hong Kong S.A.R., China',
 'Iran',
 'Ivory Coast',
 'Kosovo',
 'Laos',
 'Macedonia',
 'Moldova',
 'North Cyprus',
 'Northern Cyprus',
 'Palestinian Territories',
 'Russia',
 'Somaliland Region',
 'Somaliland region',
 'South Korea',
 'Swaziland',
 'Syria',
 'Taiwan',
 'Taiwan Province of China',
 'Tanzania',
 'Trinidad & Tobago',
 'Venezuela',
 'Vietnam',
 'xx'}

In [97]:
# Review, across all years, the spellings used for the countries flagged above
# (names that are certainly right were left out of the list)

review_list = ['Czech', 'Macedonia', 'Congo','China','Hong Kong','Taiwan','Cyprus','Eswatini','Swaziland','Somaliland']

for name_fragment in review_list:
    matching_names = df.loc[df['Country'].str.contains(name_fragment), 'Country']
    print(name_fragment, ': ', matching_names, '\n')

Czech :  30      Czech Republic
184     Czech Republic
337     Czech Republic
490     Czech Republic
645     Czech Republic
800     Czech Republic
952     Czech Republic
1101           Czechia
Name: Country, dtype: object 

Macedonia :  92            Macedonia
252           Macedonia
406           Macedonia
558           Macedonia
709     North Macedonia
871           Macedonia
1028    North Macedonia
1172    North Macedonia
Name: Country, dtype: object 

Congo :  119        Congo (Kinshasa)
138     Congo (Brazzaville)
282        Congo (Kinshasa)
284     Congo (Brazzaville)
438     Congo (Brazzaville)
440        Congo (Kinshasa)
583     Congo (Brazzaville)
601        Congo (Kinshasa)
728     Congo (Brazzaville)
752        Congo (Kinshasa)
869     Congo (Brazzaville)
912        Congo (Kinshasa)
1017    Congo (Brazzaville)
1182                  Congo
Name: Country, dtype: object 

China :  83                          China
240                         China
347      Taiwan Province of Chi

In [98]:
# Unify and fix errors in country names, newer names preferred.

# 'xx' is a placeholder row, not a country
df = df.drop(df[df.Country == 'xx'].index)

countries_name_map = {
    'Macedonia' : 'North Macedonia',
    # The only bare 'Congo' entry is the 2022 one. It continues the Congo (Brazzaville)
    # series (the 2021 report lists Brazzaville only), so it must not be merged into Kinshasa.
    'Congo' : 'Congo (Brazzaville)',
    'Taiwan' : 'Taiwan Province of China',
    'Swaziland' : 'Eswatini, Kingdom of',
    'Northern Cyprus' : 'North Cyprus',
    'Hong Kong' : 'Hong Kong S.A.R. of China',
    'Hong Kong S.A.R., China' : 'Hong Kong S.A.R. of China',
    'Czech Republic' : 'Czechia',
    'Trinidad & Tobago': 'Trinidad and Tobago'
}

df['Country'] = df['Country'].replace(countries_name_map)

# The reports spell Somaliland with two different casings. This is unified on df only:
# countries_name_map is reused later on the region lookup, where both spellings exist
# as separate rows and collapsing them would create a duplicated lookup key.
df['Country'] = df['Country'].replace({'Somaliland region': 'Somaliland Region'})
df

Unnamed: 0,Year,Country,Happiness Rank,Happiness Score,Freedom,Generosity,Healthy life expectancy,Perceptions of corruption,GDP per capita,Social support,Dystopia Residual
0,2015,Switzerland,1,7.587,0.66557,0.29678,0.94143,0.41978,1.39651,,2.51738
1,2015,Iceland,2,7.561,0.62877,0.43630,0.94784,0.14145,1.30232,,2.70201
2,2015,Denmark,3,7.527,0.64938,0.34139,0.87464,0.48357,1.32548,,2.49204
3,2015,Norway,4,7.522,0.66973,0.34699,0.88521,0.36503,1.45900,,2.46531
4,2015,Canada,5,7.427,0.63297,0.45811,0.90563,0.32957,1.32629,,2.45176
...,...,...,...,...,...,...,...,...,...,...,...
1225,2022,Botswana,142,3.471,0.57100,0.01200,0.28000,0.10200,1.50300,0.815,0.18700
1226,2022,Rwanda,143,3.268,0.62100,0.18700,0.46200,0.54400,0.78500,0.133,0.53600
1227,2022,Zimbabwe,144,2.995,0.32900,0.10600,0.27000,0.10500,0.94700,0.690,0.54800
1228,2022,Lebanon,145,2.955,0.10300,0.08200,0.63100,0.03400,1.39200,0.498,0.21600


#### Add region

In [99]:
# Only the 2015 and 2016 reports carry a 'Region' column; build one Country -> Region lookup from both
regions_2015 = df_2015[['Country', 'Region']]
regions_2016 = df_2016[['Country', 'Region']]
regions_df = regions_2015.merge(regions_2016, how='outer', on='Country')

# Take the 2015 region and fall back to the 2016 one where 2015 has none
regions_df['Region'] = regions_df['Region_x'].fillna(regions_df['Region_y'])
regions_df = regions_df.drop(columns=['Region_x', 'Region_y'])

# Apply the same country-name mapping that was used on the main dataset
regions_df['Country'] = regions_df['Country'].replace(countries_name_map)
regions_df

Unnamed: 0,Country,Region
0,Switzerland,Western Europe
1,Iceland,Western Europe
2,Denmark,Western Europe
3,Norway,Western Europe
4,Canada,North America
...,...,...
159,Belize,Latin America and Caribbean
160,Somalia,Sub-Saharan Africa
161,Somaliland Region,Sub-Saharan Africa
162,Namibia,Sub-Saharan Africa


In [100]:
# Attach the region to every record.
# - Dropping a 'Region' column left over from a previous run keeps the cell re-runnable
#   (otherwise the merge would produce 'Region_x' / 'Region_y'). After re-running this
#   cell, the manual region assignments further down must be re-run as well.
# - validate='many_to_one' makes pandas raise if regions_df lists a country more than
#   once, instead of silently duplicating that country's records.
df = (
    df.drop(columns='Region', errors='ignore')
      .merge(regions_df, on='Country', how='left', validate='many_to_one')
)
df

Unnamed: 0,Year,Country,Happiness Rank,Happiness Score,Freedom,Generosity,Healthy life expectancy,Perceptions of corruption,GDP per capita,Social support,Dystopia Residual,Region
0,2015,Switzerland,1,7.587,0.66557,0.29678,0.94143,0.41978,1.39651,,2.51738,Western Europe
1,2015,Iceland,2,7.561,0.62877,0.43630,0.94784,0.14145,1.30232,,2.70201,Western Europe
2,2015,Denmark,3,7.527,0.64938,0.34139,0.87464,0.48357,1.32548,,2.49204,Western Europe
3,2015,Norway,4,7.522,0.66973,0.34699,0.88521,0.36503,1.45900,,2.46531,Western Europe
4,2015,Canada,5,7.427,0.63297,0.45811,0.90563,0.32957,1.32629,,2.45176,North America
...,...,...,...,...,...,...,...,...,...,...,...,...
1225,2022,Botswana,142,3.471,0.57100,0.01200,0.28000,0.10200,1.50300,0.815,0.18700,Sub-Saharan Africa
1226,2022,Rwanda,143,3.268,0.62100,0.18700,0.46200,0.54400,0.78500,0.133,0.53600,Sub-Saharan Africa
1227,2022,Zimbabwe,144,2.995,0.32900,0.10600,0.27000,0.10500,0.94700,0.690,0.54800,Sub-Saharan Africa
1228,2022,Lebanon,145,2.955,0.10300,0.08200,0.63100,0.03400,1.39200,0.498,0.21600,Middle East and Northern Africa


In [101]:
# Spot-check a renamed country: every 'Trinidad ...' record should share one name and have a region
is_trinidad = df['Country'].str.contains('Trinidad')
df[is_trinidad]

Unnamed: 0,Year,Country,Happiness Rank,Happiness Score,Freedom,Generosity,Healthy life expectancy,Perceptions of corruption,GDP per capita,Social support,Dystopia Residual,Region
40,2015,Trinidad and Tobago,41,6.168,0.55884,0.31844,0.61483,0.0114,1.21183,,2.26882,Latin America and Caribbean
200,2016,Trinidad and Tobago,43,6.168,0.48453,0.31935,0.52608,0.01241,1.32572,,2.51394,Latin America and Caribbean
352,2017,Trinidad and Tobago,38,6.168,0.518631,0.325296,0.519983,0.008965,1.361356,,2.053247,Latin America and Caribbean
507,2018,Trinidad and Tobago,38,6.192,0.575,0.171,0.564,0.019,1.223,1.492,,Latin America and Caribbean
664,2019,Trinidad and Tobago,39,6.192,0.489,0.185,0.713,0.016,1.231,1.477,,Latin America and Caribbean
823,2020,Trinidad and Tobago,42,6.1919,0.552931,0.199358,0.658829,0.015078,1.167642,1.407326,2.190696,Latin America and Caribbean


In [102]:
# Countries that did not get a region from the lookup
df.loc[df['Region'].isna(), 'Country'].unique()

array(['Gambia', 'Maldives'], dtype=object)

In [103]:
# List the region labels in use, to pick from when filling the gaps by hand
df['Region'].unique()

array(['Western Europe', 'North America', 'Australia and New Zealand',
       'Middle East and Northern Africa', 'Latin America and Caribbean',
       'Southeastern Asia', 'Central and Eastern Europe', 'Eastern Asia',
       'Sub-Saharan Africa', 'Southern Asia', nan], dtype=object)

In [108]:
# Add the regions for the countries that appear in neither the 2015 nor the 2016 lookup.
# The labels must be ones already used in the 'Region' column.
# The Maldives lie in South Asia and the Gambia in West Africa, hence:
df.loc[df['Country'] == 'Maldives', 'Region'] = 'Southern Asia'
df.loc[df['Country'] == 'Gambia', 'Region'] = 'Sub-Saharan Africa'

##### Validate Data Types

In [109]:
# Check the data type and the non-null count of every column before saving
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1230 entries, 0 to 1229
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Year                       1230 non-null   int64  
 1   Country                    1230 non-null   object 
 2   Happiness Rank             1230 non-null   int64  
 3   Happiness Score            1230 non-null   float64
 4   Freedom                    1230 non-null   float64
 5   Generosity                 1230 non-null   float64
 6   Healthy life expectancy    1230 non-null   float64
 7   Perceptions of corruption  1229 non-null   float64
 8   GDP per capita             1230 non-null   float64
 9   Social support             760 non-null    float64
 10  Dystopia Residual          918 non-null    float64
 11  Region                     1230 non-null   object 
dtypes: float64(8), int64(2), object(2)
memory usage: 124.9+ KB


In [111]:
# Make sure every column except the two text columns holds numeric values
text_columns = ['Country', 'Region']
num_cols = df.drop(columns=text_columns).columns
df[num_cols] = df[num_cols].apply(lambda column: pd.to_numeric(column))

# Re-check the resulting dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1230 entries, 0 to 1229
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Year                       1230 non-null   int64  
 1   Country                    1230 non-null   object 
 2   Happiness Rank             1230 non-null   int64  
 3   Happiness Score            1230 non-null   float64
 4   Freedom                    1230 non-null   float64
 5   Generosity                 1230 non-null   float64
 6   Healthy life expectancy    1230 non-null   float64
 7   Perceptions of corruption  1229 non-null   float64
 8   GDP per capita             1230 non-null   float64
 9   Social support             760 non-null    float64
 10  Dystopia Residual          918 non-null    float64
 11  Region                     1230 non-null   object 
dtypes: float64(8), int64(2), object(2)
memory usage: 124.9+ KB


##### Save Data

In [114]:
# Save the processed dataset and the Country -> Region lookup as CSV files (row index not written).
# NOTE(review): the 'WorldHappinees' spelling is kept as-is; other notebooks may read this exact path - confirm before renaming.
output_targets = [
    (df, '../data/processed/WorldHappinees.csv'),
    (regions_df, '../data/final/regions.csv'),
]
for frame, target_path in output_targets:
    frame.to_csv(target_path, index=False)