In [2]:
import pandas as pd
import numpy as np
import csv
from scipy.stats import pearsonr

In [3]:
# Load the cleaned yearly datasets, one DataFrame per year (2015 through 2019),
# then preview the first rows of the 2019 data.
cleaned_dir = "../cleaned_data"
df_2015, df_2016, df_2017, df_2018, df_2019 = (
    pd.read_csv(f"{cleaned_dir}/{year}.csv") for year in range(2015, 2020)
)
df_2019.head()

Unnamed: 0,Country,Region,Happiness_Rank,Happiness_Score,Economy,Family,Health,Freedom,Trust,Generosity,Dystopia_Residual
0,Afghanistan,Southern Asia,154,3.203,0.35,0.517,0.361,0.0,0.025,0.158,1.792
1,Albania,Central and Eastern Europe,107,4.719,0.947,0.848,0.874,0.383,0.027,0.178,1.462
2,Algeria,Middle East and Northern Africa,88,5.211,1.002,1.16,0.785,0.086,0.114,0.073,1.991
3,Argentina,Latin America and Caribbean,47,6.086,1.092,1.432,0.881,0.471,0.05,0.066,2.094
4,Armenia,Central and Eastern Europe,116,4.559,0.85,1.055,0.815,0.283,0.064,0.095,1.397


In [4]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the
# 2019 Happiness_Score column
scores_2019 = df_2019["Happiness_Score"]
scores_2019.describe()

count    156.000000
mean       5.407096
std        1.113120
min        2.853000
25%        4.544500
50%        5.379500
75%        6.184500
max        7.769000
Name: Happiness_Score, dtype: float64

In [5]:
# Group by Region and average every numeric column within each region.
# numeric_only=True keeps the text "Country" column out of the aggregation:
# pandas >= 2.0 raises a TypeError when asked to average it, whereas older
# releases silently dropped it. Being explicit gives the same table on both.
regions_2019 = df_2019.groupby("Region").mean(numeric_only=True)
regions_2019

Unnamed: 0_level_0,Happiness_Rank,Happiness_Score,Economy,Family,Health,Freedom,Trust,Generosity,Dystopia_Residual
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Australia and New Zealand,9.5,7.2675,1.3375,1.5525,1.031,0.571,0.335,0.331,2.1095
Central and Eastern Europe,70.965517,5.561517,1.021069,1.338793,0.809517,0.357517,0.061759,0.142724,1.830138
Eastern Asia,64.833333,5.688833,1.235167,1.3335,0.953333,0.372167,0.119667,0.172833,1.502167
Latin America and Caribbean,51.380952,5.954429,0.924286,1.339714,0.811857,0.440905,0.072048,0.144619,2.221
Middle East and Northern Africa,84.894737,5.237,1.059053,1.148684,0.751053,0.317947,0.105,0.153474,1.701789
North America,14.0,7.085,1.399,1.481,0.9565,0.519,0.218,0.2825,2.229
Southeastern Asia,85.111111,5.273667,0.93,1.246444,0.745333,0.543778,0.123444,0.301778,1.382889
Southern Asia,115.857143,4.526857,0.650286,0.986857,0.617,0.386286,0.093429,0.234714,1.558286
Sub-Saharan Africa,124.8,4.30005,0.44375,0.916675,0.40755,0.33735,0.0945,0.187725,1.9125
Western Europe,23.380952,6.84219,1.357333,1.472857,1.01519,0.479429,0.218381,0.219619,2.079381


In [6]:
# Descriptive statistics of the 2019 Health factor, computed over the
# per-region means (one value per region), not over individual countries
health_by_region = regions_2019.loc[:, "Health"]
health_by_region.describe()

count    10.000000
mean      0.809833
std       0.194315
min       0.407550
25%       0.746763
50%       0.810687
75%       0.955708
max       1.031000
Name: Health, dtype: float64

In [7]:
# Keep only the columns at positions 3 through 10 of the 2019 frame
# (all rows), then preview the result
column_span = slice(3, 11)
df2_2019 = df_2019.iloc[:, column_span]
df2_2019.head()

Unnamed: 0,Happiness_Score,Economy,Family,Health,Freedom,Trust,Generosity,Dystopia_Residual
0,3.203,0.35,0.517,0.361,0.0,0.025,0.158,1.792
1,4.719,0.947,0.848,0.874,0.383,0.027,0.178,1.462
2,5.211,1.002,1.16,0.785,0.086,0.114,0.073,1.991
3,6.086,1.092,1.432,0.881,0.471,0.05,0.066,2.094
4,4.559,0.85,1.055,0.815,0.283,0.064,0.095,1.397


In [45]:
# Build one Pearson correlation matrix per survey year and export it to CSV.
years = [2015, 2016, 2017, 2018, 2019]

# Positional span of the columns to correlate (positions 3 through 10).
# NOTE(review): assumes every cleaned yearly file shares the same column
# order, so this span selects the same factors each year -- confirm.
FACTOR_COLUMNS = slice(3, 11)

# For each year
for year in years:

    # Read input file
    old_df = pd.read_csv(f"../cleaned_data/{year}.csv")

    # Slice the df down to the columns being compared
    new_df = old_df.iloc[:, FACTOR_COLUMNS]

    # Pairwise Pearson correlation of every column against every other one.
    # DataFrame.corr computes the whole labelled matrix in a single call,
    # replacing the hand-rolled nested loop of scipy pearsonr calls, and it
    # skips missing values pair by pair.
    corr_df = new_df.corr(method="pearson")

    # Export to CSV. The default index/header output gives the same layout
    # as before: a blank top-left cell followed by the factor names in the
    # first row, and each factor name in the first column of its row.
    corr_df.to_csv(f"correlations_{year}.csv")