In [None]:
import os
from functools import reduce

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Process

I create two csv files here.

The first includes the happiness score from the World Happiness Report for each country, along with web-scraped data on each country's GDP, Social Progress, and Life Expectancy.

The second is of the World Happiness Report's weighting of 'GDP per capita', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Perceptions of corruption', and 'Generosity'. (Kaggle data)

I decided to focus mainly on GDP, Social Progress, and Life Expectancy for the report. I am sure the other categories will have something interesting to say in my exploration.

I then added extra information that I think may be relevant, such as official languages, population, lat/lng, and more.



# Finding Happiness (Score)

In [None]:
# Keep only rank, score and country name from the 2019 report,
# and shorten the country column's header to "Country".
score_columns = ["Overall rank", "Score", "Country or region"]
df_2019 = (
    pd.read_csv("2019.csv")[score_columns]
    .rename(columns={"Country or region": "Country"})
)

# Web Scraping for:
- GDP per capita
- Social Progress
- Life Expectancy

In [None]:
# GDP source: IMF World Economic Outlook database, October 2021 report
# (the query requests the NGDPD series). Author's note: 2019 is hidden below 2021.
# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops here on an HTTP error instead of letting the next
# cell parse an error page as if it were the data table.
response = requests.get("https://www.imf.org/en/Publications/WEO/weo-database/2021/October/weo-report?c=512,914,612,171,614,311,213,911,314,193,122,912,313,419,513,316,913,124,339,638,514,218,963,616,223,516,918,748,618,624,522,622,156,626,628,228,924,233,632,636,634,238,662,960,423,935,128,611,321,243,248,469,253,642,643,939,734,644,819,172,132,646,648,915,134,652,174,328,258,656,654,336,263,268,532,944,176,534,536,429,433,178,436,136,343,158,439,916,664,826,542,967,443,917,544,941,446,666,668,672,946,137,546,674,676,548,556,678,181,867,682,684,273,868,921,948,943,686,688,518,728,836,558,138,196,278,692,694,962,142,449,564,565,283,853,288,293,566,964,182,359,453,968,922,714,862,135,716,456,722,942,718,724,576,936,961,813,726,199,733,184,524,361,362,364,732,366,144,146,463,528,923,738,578,537,742,866,369,744,186,925,869,746,926,466,112,111,298,927,846,299,582,487,474,754,698,&s=NGDPD,&sy=2021&ey=2021&ssm=0&scsm=1&scc=0&ssd=1&ssc=0&sic=0&sort=country&ds=.&br=1", timeout=30)
response.raise_for_status()

In [None]:
soup = BeautifulSoup(response.content, "html.parser")
rows = soup.find_all("tr")

# Rows 198..393 of the page are the ones the author identified as the 2019 figures.
# In each of them the first cell is the country name and the second-to-last cell
# is the GDP figure (in billions), both kept as the raw cell text.
data = [
    {"Country": tds[0].text, "GDP (Billions)": tds[-2].text}
    for tds in (tr.find_all("td") for tr in rows[198:394])
]

# Inner join on the country name: only countries present on both sides are kept.
df_2019 = df_2019.merge(pd.DataFrame(data), on="Country")

In [None]:
# Social Progress Index (Overall Score, Basic Human Needs, Foundation of Wellbeing, Opportunity)
# The timeout avoids an indefinite hang; raise_for_status() fails fast on an HTTP
# error so an error page is never handed to the table parser.
response = requests.get("https://en.wikipedia.org/wiki/Social_Progress_Index", timeout=30)
response.raise_for_status()

In [None]:
soup = BeautifulSoup(response.content, "html.parser")
table = soup.find_all("table")
# The second table on the page is read; its first two rows are skipped.
rows = table[1].find_all("tr")

# Output column names, in the order of the cells they are read from (cells 2..5).
score_fields = (
    "Social Score",
    "Basic Human Needs",
    "Foundation of Wellbeing",
    "Opportunity",
)

data = []
for tr in rows[2:]:
    tds = tr.find_all("td")
    # The country name is the text of the link inside the first cell.
    record = {"Country": tds[0].find("a").text}
    for cell_index, field in enumerate(score_fields, start=2):
        record[field] = float(tds[cell_index].text)
    data.append(record)

# Inner join on the country name: only countries present on both sides are kept.
df_2019 = df_2019.merge(pd.DataFrame(data), on="Country")

In [None]:
# Life Expectancy
# The timeout avoids an indefinite hang; raise_for_status() fails fast on an HTTP
# error so an error page is never handed to the table parser.
response = requests.get("https://worldpopulationreview.com/countries/life-expectancy", timeout=30)
response.raise_for_status()

In [None]:
soup = BeautifulSoup(response.content, "html.parser")
table = soup.find_all("tbody")

# Only the first <tbody> on the page is read.
rows = table[0].find_all("tr")

# First cell: link whose text is the country name; second cell: life expectancy.
data = [
    {"Country": tds[0].find("a").text, "Life Expectancy": float(tds[1].text)}
    for tds in (tr.find_all("td") for tr in rows)
]

# Inner join on the country name: only countries present on both sides are kept.
df_2019 = df_2019.merge(pd.DataFrame(data), on="Country")

In [None]:
# Perceptions of Corruption - ignored due to small impact.
# The page is requested but the reply is not parsed in this cell.
corruption_url = "https://www.transparency.org/en/cpi/2020"
response = requests.get(corruption_url)

In [None]:
# Preview the first five rows of the merged 2019 frame (head() defaults to 5).
df_2019.head()

Unnamed: 0,Overall rank,Score,Country,GDP (Billions),Social Score,Basic Human Needs,Foundation of Wellbeing,Opportunity,Life Expectancy
0,1,7.769,Finland,296.016,91.89,96.22,91.29,88.15,82.312
1,2,7.600,Denmark,396.666,92.11,96.11,91.58,88.66,81.256
2,3,7.554,Norway,445.507,92.73,96.85,93.39,87.95,82.788
3,4,7.494,Iceland,25.476,91.09,98.07,92.81,82.39,83.370
4,5,7.488,Netherlands,1007.562,91.06,96.48,91.18,85.53,82.636
...,...,...,...,...,...,...,...,...,...
129,152,3.334,Rwanda,10.395,54.13,54.52,62.65,45.21,69.688
130,153,3.231,Tanzania,69.238,56.20,56.26,62.91,49.42,66.080
131,154,3.203,Afghanistan,,42.29,52.90,39.50,34.47,65.632
132,155,3.083,Central African Republic,2.587,31.62,21.31,38.68,34.85,54.022


# Cleaning Kaggle Data

In [None]:
# One Kaggle file per report year, named "<year>.csv".
k_2015, k_2016, k_2017, k_2018, k_2019 = (
    pd.read_csv(f"{year}.csv") for year in range(2015, 2020)
)

# Target schema shared by all years after cleaning (the 2019 layout):
#   Overall rank, Country or region, Score, GDP per capita, Social support,
#   Healthy life expectancy, Freedom to make life choices, Generosity,
#   Perceptions of corruption

# Per-year flag, 2015 through 2019: does the file contain any missing value?
tuple(
    frame.isnull().values.any()
    for frame in (k_2015, k_2016, k_2017, k_2018, k_2019)
)

(False, False, False, True, False)

In [None]:
# Bring every yearly Kaggle frame onto the 2019 column names and tag each row
# with the year it came from.

# clean 2019
k_2019 = k_2019.rename(columns={"Country or region": "Country"})
k_2019["Year"] = 2019

# clean 2018 - same columns as 2019 but has NaN values in perceptions of corruption.
# Only that column is imputed (with its own mean). A frame-wide fillna would also
# write the corruption mean into any other column that happened to contain a gap.
k_2018 = k_2018.rename(columns={"Country or region": "Country"})
corruption_mean_2018 = k_2018["Perceptions of corruption"].mean()
k_2018["Perceptions of corruption"] = k_2018["Perceptions of corruption"].fillna(corruption_mean_2018)
k_2018["Year"] = 2018

# clean 2017 - match columns to 2019
# The rename is positional: it relies on the column order left after the drop.
k_2017 = k_2017.drop(columns=["Whisker.high", "Whisker.low", "Dystopia.Residual"])
k_2017.columns = ['Country', 'Overall rank', 'Score', 'GDP per capita',
       'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption']
k_2017["Year"] = 2017

# 2015 and 2016 are renamed to the same list. Unlike the 2017 list above,
# 'Perceptions of corruption' comes before 'Generosity' here.
columns_2015_2016 = ['Country', 'Overall rank', 'Score', 'GDP per capita',
       'Social support', 'Healthy life expectancy',
       'Freedom to make life choices',
       'Perceptions of corruption', 'Generosity']

# clean 2016 - match columns to 2019
# Remember each country's region before the Region column is dropped.
country_region = dict(zip(k_2016["Country"], k_2016["Region"]))
k_2016 = k_2016.drop(columns=["Lower Confidence Interval", "Upper Confidence Interval", "Dystopia Residual", "Region"])
k_2016.columns = list(columns_2015_2016)
k_2016["Year"] = 2016

# clean 2015 - match columns to 2019
k_2015 = k_2015.drop(columns=["Standard Error", "Dystopia Residual", "Region"])
k_2015.columns = list(columns_2015_2016)
k_2015["Year"] = 2015

In [None]:
# Stack all years (2019 first) into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; ignore_index=True replaces the reset_index/drop pair.
df_countries = pd.concat([k_2019, k_2018, k_2017, k_2016, k_2015], ignore_index=True)
df_countries["Region"] = df_countries["Country"].map(country_region)

# fill missing regions: countries that have no entry in country_region
nan_region = {"Trinidad & Tobago" : "Latin America and Caribbean",
              "Northern Cyprus" : "Middle East and Northern Africa",
              "North Macedonia" : "Western Europe",
              "Gambia" : "Sub-Saharan Africa",
              "Mozambique" : "Sub-Saharan Africa",
              "Swaziland" : "Sub-Saharan Africa",
              "Lesotho" : "Sub-Saharan Africa",
              "Central African Republic" : "Sub-Saharan Africa",
              "Taiwan Province of China" : "Eastern Asia",
              "Hong Kong S.A.R., China" : "Eastern Asia",
              "Oman" : "Middle East and Northern Africa",
              "Somaliland region" : "Sub-Saharan Africa",
              "Djibouti" : "Sub-Saharan Africa"}

# Rows that already have a region keep it; the rest take the manual value for their
# country. Both sides share df_countries' index, so the fill is aligned row by row.
df_countries["Region"] = df_countries["Region"].fillna(df_countries["Country"].map(nan_region))

In [None]:
# matching country names for step below
# One exact-match mapping applied in a single pass; names not listed are left as-is.
country_renames = {
  "Taiwan Province of China": "Taiwan",
  "Swaziland": "Eswatini",
  # NOTE(review): the next two fold the Northern Cyprus rows into "Cyprus", and the
  # Somaliland entries further down are folded into "Somalia". If Cyprus or Somalia
  # also appear in the same year this yields duplicate Country/Year rows - confirm
  # that this is intended.
  "Northern Cyprus": "Cyprus",
  "North Cyprus": "Cyprus",
  # Congo (Brazzaville) is the Republic of the Congo, a different country from the
  # Democratic Republic of the Congo (Kinshasa); it used to be renamed to the latter.
  # Presumably "Republic of the Congo" is also the name RestCountries uses - verify.
  "Congo (Brazzaville)": "Republic of the Congo",
  "Congo (Kinshasa)": "Democratic Republic of the Congo",
  "Macedonia": "North Macedonia",
  "Palestinian Territories": "Palestine",
  "Hong Kong S.A.R., China": "Hong Kong",
  "Somaliland Region": "Somalia",
  "Somaliland region": "Somalia",
  "Trinidad & Tobago": "Trinidad and Tobago",
}
df_countries["Country"] = df_countries["Country"].replace(country_renames)

In [None]:
# Display the combined multi-year frame.
# NOTE(review): the saved output under this cell already shows language, population,
# lat, lng, independent, landlocked and cca3, which are only added by a later cell -
# the output looks like it came from an out-of-order run; re-run top to bottom.
df_countries

Unnamed: 0,Overall rank,Country,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Year,Region,language,population,lat,lng,independent,landlocked,cca3
0,1,Finland,7.769,1.34000,1.58700,0.98600,0.59600,0.15300,0.39300,2019,Western Europe,"{'fin': 'Finnish', 'swe': 'Swedish'}",5530719,64.0,26.000000,True,False,FIN
1,2,Denmark,7.600,1.38300,1.57300,0.99600,0.59200,0.25200,0.41000,2019,Western Europe,{'dan': 'Danish'},5831404,56.0,10.000000,True,False,DNK
2,3,Norway,7.554,1.48800,1.58200,1.02800,0.60300,0.27100,0.34100,2019,Western Europe,"{'nno': 'Norwegian Nynorsk', 'nob': 'Norwegian...",5379475,62.0,10.000000,True,False,NOR
3,4,Iceland,7.494,1.38000,1.62400,1.02600,0.59100,0.35400,0.11800,2019,Western Europe,{'isl': 'Icelandic'},366425,65.0,-18.000000,True,False,ISL
4,5,Netherlands,7.488,1.39600,1.52200,0.99900,0.55700,0.32200,0.29800,2019,Western Europe,{'nld': 'Dutch'},16655799,52.5,5.750000,True,False,NLD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
777,154,Rwanda,3.465,0.22208,0.77370,0.42864,0.59201,0.22628,0.55191,2015,Sub-Saharan Africa,"{'eng': 'English', 'fra': 'French', 'kin': 'Ki...",12952209,-2.0,30.000000,True,True,RWA
778,155,Benin,3.340,0.28665,0.35386,0.31910,0.48450,0.18260,0.08010,2015,Sub-Saharan Africa,{'fra': 'French'},12123198,9.5,2.250000,True,False,BEN
779,156,Syria,3.006,0.66320,0.47489,0.72193,0.15684,0.47179,0.18906,2015,Middle East and Northern Africa,{'ara': 'Arabic'},17500657,35.0,38.000000,True,False,SYR
780,157,Burundi,2.905,0.01530,0.41587,0.22396,0.11850,0.19727,0.10062,2015,Sub-Saharan Africa,"{'fra': 'French', 'run': 'Kirundi'}",11890781,-3.5,30.000000,True,True,BDI


# Adding additional data from RestCountries API

- Official Language
- Population
- Lat, Lng
- Is independent?
- Is landlocked?
- cca3 (ex. Mexico -> MEX) for country borders

https://restcountries.com/#api-endpoints-v3-language

In [None]:
# Credentials must not be stored in the notebook, so the key is read from the
# environment (None when the variable is not set).
# NOTE(review): nothing in this cell uses API_KEY - the request below is sent
# without it - confirm whether the key is still needed at all.
API_KEY = os.environ.get("API_KEY")

# Country names to look up, as they appear in the combined frame.
countries = df_countries["Country"].unique()

# Fetch every country record. The timeout avoids an indefinite hang and
# raise_for_status() surfaces an HTTP error before the body is decoded.
rest_reply = requests.get("https://restcountries.com/v3.1/all", timeout=30)
rest_reply.raise_for_status()
response = rest_reply.json()

# Parallel lists: entry i of each list describes the same matched country.
temp_countries = []
language = []
population = []
lat = []
lng = []
independent = []
landlocked = []
cca3 = []

# Safe lookup of one attribute on a country record
def check(c, attribute):
  """Return ``c[attribute]`` when the key is present, otherwise ``None``."""
  return c.get(attribute)

# Names we are looking for, as a set for direct membership tests.
wanted_names = set(countries)

for c in response:
  # Match on the common name first and fall back to the official name;
  # records matching neither are skipped.
  if c["name"]["common"] in wanted_names:
    matched_name = c["name"]["common"]
  elif c["name"]["official"] in wanted_names:
    matched_name = c["name"]["official"]
  else:
    continue

  # A record without "latlng" (or with fewer than two values) yields None for
  # the missing coordinate instead of raising while indexing.
  coords = check(c, "latlng") or []

  temp_countries.append(matched_name)
  language.append(check(c, "languages"))
  population.append(check(c, "population"))
  lat.append(coords[0] if len(coords) > 0 else None)
  lng.append(coords[1] if len(coords) > 1 else None)
  independent.append(check(c, "independent"))
  landlocked.append(check(c, "landlocked"))
  cca3.append(check(c, "cca3"))

# From here on `countries` holds only the names that were matched, in the same
# order as the attribute lists.
countries = temp_countries

dict_language = dict(zip(countries, language))
dict_population = dict(zip(countries, population))
dict_lat = dict(zip(countries, lat))
dict_lng = dict(zip(countries, lng))
dict_independent = dict(zip(countries, independent))
dict_landlocked = dict(zip(countries, landlocked))
dict_cca3 = dict(zip(countries, cca3))

# New column name -> lookup keyed by country name (insertion order = column order).
metadata_lookups = {
  "language": dict_language,
  "population": dict_population,
  "lat": dict_lat,
  "lng": dict_lng,
  "independent": dict_independent,
  "landlocked": dict_landlocked,
  "cca3": dict_cca3,
}

# Add the same metadata columns to both frames; countries without a match get NaN.
for frame in (df_countries, df_2019):
  for column_name, lookup in metadata_lookups.items():
    frame[column_name] = frame["Country"].map(lookup)

In [None]:
# Finding last nan data in df_countries
# Kosovo declared independence from Serbia in 2008, so its missing flag is set to True.
# Only Kosovo rows are touched: any other row still missing a value stays NaN instead of
# being silently marked as independent.
kosovo_missing = (df_countries["Country"] == "Kosovo") & df_countries["independent"].isna()
df_countries.loc[kosovo_missing, "independent"] = True

In [None]:
# Filling in missing values manually

# GDP (billions) for countries whose scraped value is missing. Rows are located by
# country name rather than by a hard-coded row label, so the patch still lands on
# the right rows if the merges above return a different number or order of rows.
manual_gdp = {
  "Pakistan": "263.7",
  "Lebanon": "33.38",
  "Afghanistan": "19.81",
}
for country_name, gdp_billions in manual_gdp.items():
  df_2019.loc[df_2019["Country"] == country_name, "GDP (Billions)"] = gdp_billions

# Strip thousands separators and convert to float. Going through str first means
# values that are already numeric (for example when this cell is re-run) convert
# cleanly too; text that is not a number still raises.
df_2019["GDP (Billions)"] = pd.to_numeric(
  df_2019["GDP (Billions)"].astype(str).str.replace(",", "", regex=False)
)

# GDP per Capita
# The GDP column is in billions, so it is scaled by 1e9 to give a per-person value
# in whole currency units rather than billions per person.
df_2019["GDP per Capita"] = (df_2019["GDP (Billions)"] * 1e9).divide(df_2019["population"])

# Adding Region: the mapping taken from the 2016 file first, then the manual
# fallback for countries it does not cover.
df_2019["Region"] = df_2019["Country"].map(country_region)
df_2019["Region"] = df_2019["Region"].fillna(df_2019["Country"].map(nan_region))

In [None]:
# Write both datasets to CSV and, when running on Google Colab, offer each file as
# a browser download. The import is guarded so that the CSVs are still written
# when the notebook runs outside Colab.
try:
  from google.colab import files
except ImportError:
  files = None

exports = (
  (df_countries, 'world_happiness_report_data.csv'),
  (df_2019, "happiness_real_data.csv"),
)
for frame, csv_name in exports:
  frame.to_csv(csv_name, encoding = 'utf-8-sig')
  if files is not None:
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Display the final 2019 frame (scraped indicators plus country metadata).
df_2019

Unnamed: 0,Overall rank,Score,Country,GDP (Billions),Social Score,Basic Human Needs,Foundation of Wellbeing,Opportunity,Life Expectancy,language,population,lat,lng,independent,landlocked,cca3,GDP per Capita,Region
0,1,7.769,Finland,296.016,91.89,96.22,91.29,88.15,82.312,"{'fin': 'Finnish', 'swe': 'Swedish'}",5530719,64.0,26.00,True,False,FIN,5.352216e-05,Western Europe
1,2,7.600,Denmark,396.666,92.11,96.11,91.58,88.66,81.256,{'dan': 'Danish'},5831404,56.0,10.00,True,False,DNK,6.802238e-05,Western Europe
2,3,7.554,Norway,445.507,92.73,96.85,93.39,87.95,82.788,"{'nno': 'Norwegian Nynorsk', 'nob': 'Norwegian...",5379475,62.0,10.00,True,False,NOR,8.281607e-05,Western Europe
3,4,7.494,Iceland,25.476,91.09,98.07,92.81,82.39,83.370,{'isl': 'Icelandic'},366425,65.0,-18.00,True,False,ISL,6.952582e-05,Western Europe
4,5,7.488,Netherlands,1007.562,91.06,96.48,91.18,85.53,82.636,{'nld': 'Dutch'},16655799,52.5,5.75,True,False,NLD,6.049317e-05,Western Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,152,3.334,Rwanda,10.395,54.13,54.52,62.65,45.21,69.688,"{'eng': 'English', 'fra': 'French', 'kin': 'Ki...",12952209,-2.0,30.00,True,True,RWA,8.025658e-07,Sub-Saharan Africa
130,153,3.231,Tanzania,69.238,56.20,56.26,62.91,49.42,66.080,"{'eng': 'English', 'swa': 'Swahili'}",59734213,-6.0,35.00,True,False,TZA,1.159101e-06,Sub-Saharan Africa
131,154,3.203,Afghanistan,19.810,42.29,52.90,39.50,34.47,65.632,"{'prs': 'Dari', 'pus': 'Pashto', 'tuk': 'Turkm...",40218234,33.0,65.00,True,True,AFG,4.925627e-07,Southern Asia
132,155,3.083,Central African Republic,2.587,31.62,21.31,38.68,34.85,54.022,"{'fra': 'French', 'sag': 'Sango'}",4829764,7.0,21.00,True,True,CAF,5.356369e-07,Sub-Saharan Africa
