In [1]:
!pip install seaborn



In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import linregress
import seaborn as sn
import os
import glob


In [3]:
def plot_linear_regression(x_values, y_values, title, text_coordinates):
    (slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
    regress_values = x_values * slope + intercept
    line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
    
    plt.scatter(x_values,y_values)
    plt.plot(x_values,regress_values,"r-")
    plt.annotate(line_eq,text_coordinates,fontsize=14,color="red")
    plt.xlabel("Latitude")
    plt.ylabel(title)
    print(f"The r-squared is: {rvalue}")
    plt.show()

In [4]:

file = "csv_data/2015.csv"
file1 = "csv_data/2016.csv"
file2 = "csv_data/2017.csv"
file3 = "csv_data/2018.csv"
file4 = "csv_data/2019.csv"

In [5]:
# Dataframes by year

df_2015 = pd.read_csv(file)
df_2016 = pd.read_csv(file1)
df_2017 = pd.read_csv(file2)
df_2018 = pd.read_csv(file3)
df_2019 = pd.read_csv(file4)

In [6]:
df_2015.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [7]:
# Add column to each Dataframe to identify each year

# df_2015['Year'] = '2015'
# df_2016['Year'] = '2016'
# df_2017['Year'] = '2017'
# df_2018['Year'] = '2018'
# df_2019['Year'] = '2019'


In [8]:
# 2019 clean up
df_2019.isna().sum()

Overall rank                    0
Country or region               0
Score                           0
GDP per capita                  0
Social support                  0
Healthy life expectancy         0
Freedom to make life choices    0
Generosity                      0
Perceptions of corruption       0
dtype: int64

In [9]:
df_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     156 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB


In [10]:
df_2015.columns

Index(['Country', 'Region', 'Happiness Rank', 'Happiness Score',
       'Standard Error', 'Economy (GDP per Capita)', 'Family',
       'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)',
       'Generosity', 'Dystopia Residual'],
      dtype='object')

In [11]:
# rename columns in 2019df
df_2019.columns = df_2019.columns.str.lower()
df_2019.rename(columns = {"overall rank": "rank", "country or region": "country", 
                          "freedom to make life choices": "freedom", 'perceptions of corruption': 'corruption',
                          'gdp per capita': 'gdp', 'healthy life expectancy': 'health', 'social support': 'family'}, inplace=True)
df_2019.columns = df_2019.columns.str.replace(" ", "_")
df_2019.columns
                          

Index(['rank', 'country', 'score', 'gdp', 'family', 'health', 'freedom',
       'generosity', 'corruption'],
      dtype='object')

In [12]:
df_2019.sample(5)

Unnamed: 0,rank,country,score,gdp,family,health,freedom,generosity,corruption
71,72,Libya,5.525,1.044,1.303,0.673,0.416,0.133,0.152
116,117,Iran,4.548,1.1,0.842,0.785,0.305,0.27,0.125
143,144,Lesotho,3.802,0.489,1.169,0.168,0.359,0.107,0.093
115,116,Armenia,4.559,0.85,1.055,0.815,0.283,0.095,0.064
14,15,United Kingdom,7.054,1.333,1.538,0.996,0.45,0.348,0.278


In [13]:
# 2018 df clean up
df_2018.isna().sum()

Overall rank                    0
Country or region               0
Score                           0
GDP per capita                  0
Social support                  0
Healthy life expectancy         0
Freedom to make life choices    0
Generosity                      0
Perceptions of corruption       1
dtype: int64

In [14]:
# we have a NaN value


df_2018[df_2018.isna().values]

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
19,20,United Arab Emirates,6.774,2.096,0.776,0.67,0.284,0.186,


In [15]:
df_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     155 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB


In [16]:
df_2018.columns = df_2018.columns.str.lower()
df_2018.rename(columns = {'overall rank': 'rank', 'country or region': 'country',
                          'freedom to make life choices': 'freedom', 'perceptions of corruption': 'corruption',
                          'gdp per capita': 'gdp', 'healthy life expectancy': 'health',
                          'social support': 'family'}, inplace=True)

df_2018.columns =  df_2018.columns.str.replace(" ", "_")

In [17]:
df_2018.sample(5)

Unnamed: 0,rank,country,score,gdp,family,health,freedom,generosity,corruption
80,81,Montenegro,5.347,1.017,1.279,0.729,0.259,0.111,0.081
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408
24,25,Chile,6.476,1.131,1.331,0.808,0.431,0.197,0.061
76,77,Portugal,5.41,1.188,1.429,0.884,0.562,0.055,0.017
120,121,Burkina Faso,4.424,0.314,1.097,0.254,0.312,0.175,0.128


In [18]:
# 2017 cleanup
df_2017.isna().sum()

Country                          0
Happiness.Rank                   0
Happiness.Score                  0
Whisker.high                     0
Whisker.low                      0
Economy..GDP.per.Capita.         0
Family                           0
Health..Life.Expectancy.         0
Freedom                          0
Generosity                       0
Trust..Government.Corruption.    0
Dystopia.Residual                0
dtype: int64

In [19]:
df_2017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        155 non-null    object 
 1   Happiness.Rank                 155 non-null    int64  
 2   Happiness.Score                155 non-null    float64
 3   Whisker.high                   155 non-null    float64
 4   Whisker.low                    155 non-null    float64
 5   Economy..GDP.per.Capita.       155 non-null    float64
 6   Family                         155 non-null    float64
 7   Health..Life.Expectancy.       155 non-null    float64
 8   Freedom                        155 non-null    float64
 9   Generosity                     155 non-null    float64
 10  Trust..Government.Corruption.  155 non-null    float64
 11  Dystopia.Residual              155 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 

In [20]:
# remove the dots
df_2017.columns = df_2017.columns.str.replace('.', '')

df_2017.columns = df_2017.columns.str.lower()
df_2017.rename(columns = {'happinessrank': 'rank', 'happinessscore': 'score', 'healthlifeexpectancy': 'health',
                          'economygdppercapita': 'gdp', 'trustgovernmentcorruption':'corruption',
                          'dystopiaresidual': 'dystopia'}, inplace=True)

df_2017.columns


Index(['country', 'rank', 'score', 'whiskerhigh', 'whiskerlow', 'gdp',
       'family', 'health', 'freedom', 'generosity', 'corruption', 'dystopia'],
      dtype='object')

In [21]:
df_2017.drop(columns = ['whiskerhigh', 'whiskerlow'], inplace=True)


In [22]:
df_2017.columns

Index(['country', 'rank', 'score', 'gdp', 'family', 'health', 'freedom',
       'generosity', 'corruption', 'dystopia'],
      dtype='object')

In [23]:
df_2017.sample(5)

Unnamed: 0,country,rank,score,gdp,family,health,freedom,generosity,corruption,dystopia
48,Russia,49,5.963,1.281778,1.469282,0.547349,0.373783,0.052264,0.032963,2.205607
58,Turkmenistan,59,5.822,1.130777,1.493149,0.437726,0.418272,0.249925,0.25927,1.83291
123,Congo (Brazzaville),124,4.291,0.808964,0.832044,0.289957,0.435026,0.120852,0.079618,1.724136
87,Lebanon,88,5.225,1.074988,1.129624,0.735081,0.288516,0.264451,0.037514,1.695074
43,Ecuador,44,6.008,1.00082,1.286169,0.685636,0.455198,0.150112,0.140135,2.290353


In [24]:
# 2016 cleanup
df_2016.isna().sum()

Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Lower Confidence Interval        0
Upper Confidence Interval        0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
dtype: int64

In [25]:
df_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        157 non-null    object 
 1   Region                         157 non-null    object 
 2   Happiness Rank                 157 non-null    int64  
 3   Happiness Score                157 non-null    float64
 4   Lower Confidence Interval      157 non-null    float64
 5   Upper Confidence Interval      157 non-null    float64
 6   Economy (GDP per Capita)       157 non-null    float64
 7   Family                         157 non-null    float64
 8   Health (Life Expectancy)       157 non-null    float64
 9   Freedom                        157 non-null    float64
 10  Trust (Government Corruption)  157 non-null    float64
 11  Generosity                     157 non-null    float64
 12  Dystopia Residual              157 non-null    flo

In [26]:
df_2016.columns = df_2016.columns.str.lower()
df_2016.rename(columns={'happiness rank': 'rank', 'happiness score': 'score', 'economy (gdp per capita)':'gdp',
                        'health (life expectancy)': 'health', 'trust (government corruption)': 'corruption',
                        'dystopia residual':  'dystopia'}, inplace=True)

df_2016.columns

Index(['country', 'region', 'rank', 'score', 'lower confidence interval',
       'upper confidence interval', 'gdp', 'family', 'health', 'freedom',
       'corruption', 'generosity', 'dystopia'],
      dtype='object')

In [27]:
df_2016.drop(columns= ['lower confidence interval', 'upper confidence interval','region'], inplace=True)

df_2016.sample(5)

Unnamed: 0,country,rank,score,gdp,family,health,freedom,corruption,generosity,dystopia
143,Chad,144,3.763,0.42214,0.63178,0.03824,0.12807,0.04952,0.18667,2.30637
111,Iraq,112,4.575,1.07474,0.59205,0.51076,0.24856,0.13636,0.19589,1.81657
151,Rwanda,152,3.515,0.32846,0.61586,0.31865,0.5432,0.50521,0.23552,0.96819
92,Lebanon,93,5.129,1.12268,0.64184,0.76171,0.26228,0.03061,0.23693,2.07339
99,Tajikistan,100,4.996,0.48835,0.75602,0.53119,0.43408,0.13509,0.25998,2.39106


In [28]:
# 2015 cleanup

df_2015.isna().sum()


Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Standard Error                   0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
dtype: int64

In [29]:
df_2015.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        158 non-null    object 
 1   Region                         158 non-null    object 
 2   Happiness Rank                 158 non-null    int64  
 3   Happiness Score                158 non-null    float64
 4   Standard Error                 158 non-null    float64
 5   Economy (GDP per Capita)       158 non-null    float64
 6   Family                         158 non-null    float64
 7   Health (Life Expectancy)       158 non-null    float64
 8   Freedom                        158 non-null    float64
 9   Trust (Government Corruption)  158 non-null    float64
 10  Generosity                     158 non-null    float64
 11  Dystopia Residual              158 non-null    float64
dtypes: float64(9), int64(1), object(2)
memory usage: 1

In [30]:
df_2015.rename(columns={'Happiness Rank':'rank', 'Happiness Score':'score', 'Economy (GDP per Capita)': 'gdp',
                        'Health (Life Expectancy)':'health', 'Trust (Government Corruption)': 'corruption',
                       'Dystopia Residual':'dystopia'}, inplace=True)

df_2015.columns = df_2015.columns.str.lower()
df_2015.columns

Index(['country', 'region', 'rank', 'score', 'standard error', 'gdp', 'family',
       'health', 'freedom', 'corruption', 'generosity', 'dystopia'],
      dtype='object')

In [31]:
df_2015.drop(columns= ['region','standard error'], inplace=True)
df_2015.sample(5)

Unnamed: 0,country,rank,score,gdp,family,health,freedom,corruption,generosity,dystopia
67,Algeria,68,5.605,0.93929,1.07772,0.61766,0.28579,0.17383,0.07822,2.43209
89,Philippines,90,5.073,0.70532,1.03516,0.58114,0.62545,0.12279,0.24991,1.7536
85,Romania,86,5.124,1.04345,0.88588,0.7689,0.35068,0.00649,0.13748,1.93129
56,Nicaragua,57,5.828,0.59325,1.14184,0.74314,0.55475,0.19317,0.27815,2.32407
99,Mongolia,100,4.874,0.82819,1.3006,0.60268,0.43626,0.02666,0.3323,1.34759


In [32]:
def print_all_dataframe_columns():
    print("2015:\n", df_2015.columns.to_list())
    print("\n2016:\n", df_2016.columns.to_list())
    print("\n2017:\n", df_2017.columns.to_list())
    print("\n2018:\n", df_2018.columns.to_list())
    print("\n2019:\n", df_2019.columns.to_list())
    
print_all_dataframe_columns()

2015:
 ['country', 'rank', 'score', 'gdp', 'family', 'health', 'freedom', 'corruption', 'generosity', 'dystopia']

2016:
 ['country', 'rank', 'score', 'gdp', 'family', 'health', 'freedom', 'corruption', 'generosity', 'dystopia']

2017:
 ['country', 'rank', 'score', 'gdp', 'family', 'health', 'freedom', 'generosity', 'corruption', 'dystopia']

2018:
 ['rank', 'country', 'score', 'gdp', 'family', 'health', 'freedom', 'generosity', 'corruption']

2019:
 ['rank', 'country', 'score', 'gdp', 'family', 'health', 'freedom', 'generosity', 'corruption']


In [33]:
# add dystopia column to data frame
df_2018['dystopia'] = 0.0
df_2019['dystopia'] = 0.0

In [34]:
print_all_dataframe_columns()

2015:
 ['country', 'rank', 'score', 'gdp', 'family', 'health', 'freedom', 'corruption', 'generosity', 'dystopia']

2016:
 ['country', 'rank', 'score', 'gdp', 'family', 'health', 'freedom', 'corruption', 'generosity', 'dystopia']

2017:
 ['country', 'rank', 'score', 'gdp', 'family', 'health', 'freedom', 'generosity', 'corruption', 'dystopia']

2018:
 ['rank', 'country', 'score', 'gdp', 'family', 'health', 'freedom', 'generosity', 'corruption', 'dystopia']

2019:
 ['rank', 'country', 'score', 'gdp', 'family', 'health', 'freedom', 'generosity', 'corruption', 'dystopia']


In [35]:
# add year columns to all data frame
df_2015['Year'] = '2015'
df_2016['Year'] = '2016'
df_2017['Year'] = '2017'
df_2018['Year'] = '2018'
df_2019['Year'] = '2019'

In [36]:
print_all_dataframe_columns()

2015:
 ['country', 'rank', 'score', 'gdp', 'family', 'health', 'freedom', 'corruption', 'generosity', 'dystopia', 'Year']

2016:
 ['country', 'rank', 'score', 'gdp', 'family', 'health', 'freedom', 'corruption', 'generosity', 'dystopia', 'Year']

2017:
 ['country', 'rank', 'score', 'gdp', 'family', 'health', 'freedom', 'generosity', 'corruption', 'dystopia', 'Year']

2018:
 ['rank', 'country', 'score', 'gdp', 'family', 'health', 'freedom', 'generosity', 'corruption', 'dystopia', 'Year']

2019:
 ['rank', 'country', 'score', 'gdp', 'family', 'health', 'freedom', 'generosity', 'corruption', 'dystopia', 'Year']


In [37]:
# combine dataframes
main_df = pd.concat([df_2015, df_2016, df_2017, df_2018, df_2019])


In [39]:
main_df
main_df.country = main_df.country.str.lower().str.replace(' ','_')

In [40]:
main_df.isna().sum()

country       0
rank          0
score         0
gdp           0
family        0
health        0
freedom       0
corruption    1
generosity    0
dystopia      0
Year          0
dtype: int64

In [41]:
main_df[main_df.isna().values]

Unnamed: 0,country,rank,score,gdp,family,health,freedom,corruption,generosity,dystopia,Year
19,united_arab_emirates,20,6.774,2.096,0.776,0.67,0.284,,0.186,0.0,2018


In [43]:
main_df.loc[main_df.country == 'united_arab_emirates']

Unnamed: 0,country,rank,score,gdp,family,health,freedom,corruption,generosity,dystopia,Year
19,united_arab_emirates,20,6.901,1.42727,1.12575,0.80925,0.64157,0.38583,0.26428,2.24743,2015
27,united_arab_emirates,28,6.573,1.57352,0.87114,0.72993,0.56215,0.35561,0.26591,2.21507,2016
20,united_arab_emirates,21,6.648,1.626343,1.26641,0.726798,0.608345,0.32449,0.360942,1.734704,2017
19,united_arab_emirates,20,6.774,2.096,0.776,0.67,0.284,,0.186,0.0,2018
20,united_arab_emirates,21,6.825,1.503,1.31,0.825,0.598,0.182,0.262,0.0,2019


In [45]:
avg_corr = main_df.loc[main_df.country == 'united_arab_emirates', 'corruption'].mean()

In [46]:
main_df.corruption.fillna(avg_corr, inplace=True)

In [47]:
main_df.isna().sum().any()

False

In [None]:
# stack data frames
#  df.to_csv('results.csv')

In [None]:
# df = pd.concat([df_2015, df_2016, df_2017, df_2018, df_2019],
#       axis=1,
# #     join="outer",
# #     ignore_index=False,
# #     keys=None,
# #     levels=None,
# #     names=None,
# #     verify_integrity=False,
# #     copy=True,
#       inplace=True,)
# df.head()