In [None]:
import pandas as pd
import numpy as np
import requests
import time
from scipy.stats import linregress
import matplotlib.pyplot as plt

In [None]:
# The path to our Happiness Score CSV file
hs_file = "Data/WorldHappinessAll.csv"

# Read our Happiness Score data into pandas
hs_df = pd.read_csv(hs_file)
hs_df.head()

In [None]:
# The path to our Social Progress CSV file
sp_file = "Data/2011-2020-Social-Progress-Index.csv"

# Read our Social Progress data into pandas
sp_df = pd.read_csv(sp_file, encoding = "ISO-8859-1")
sp_df.head()

In [None]:
# Check or NaN values in Happiness Score df
hs_df.count()

In [None]:
# Check for NaN values in Social Progress df
sp_df.count()

In [None]:
# Drop NaN values for Social Progress
nonan_sp_df = sp_df.dropna()
nonan_sp_df.count()

In [7]:
# Note which columns come from which df for post merge file
suffix_hs_df = hs_df.add_suffix("_HS")
suffix_nonan_sp_df = nonan_sp_df.add_suffix("_SP")

In [8]:
# Create target column to merge dataframes on year and country
suffix_nonan_sp_df["Country & Year"] = suffix_nonan_sp_df["SPI year_SP"].astype(str) + suffix_nonan_sp_df["Country_SP"]
suffix_nonan_sp_df.head()

Unnamed: 0,SPI Rank_SP,Country_SP,SPI country code_SP,SPI year_SP,Status_SP,Social Progress Index_SP,Basic Human Needs_SP,Foundations of Wellbeing_SP,Opportunity_SP,Nutrition and Basic Medical Care_SP,...,Equality of political power by socioeconomic position (0=unequal power; 4=equal power)_SP,Equality of political power by social group (0=unequal power; 4=equal power)_SP,Equality of political power by gender (0=unequal power; 4=equal power)_SP,Discrimination and violence against minorities (0=low; 10=high)_SP,Acceptance of gays and lesbians (0=low; 100=high)_SP,Quality weighted universities (points)_SP,Citable documents_SP,Women with advanced education (%)_SP,Years of tertiary schooling_SP,Country & Year
20,54.0,Albania,ALB,2020,Ranked,75.41,86.92,82.71,56.6,92.99,...,0.725,2.568,2.313,4.4,0.08,4.4,0.1717,0.5269,2.7763,2020Albania
21,51.0,Albania,ALB,2019,Ranked,75.7,86.69,82.82,57.58,92.74,...,1.283,2.454,2.323,4.3,0.1,4.4,0.1587,0.5195,2.8688,2019Albania
22,51.0,Albania,ALB,2018,Ranked,75.32,86.2,81.54,58.2,92.38,...,1.671,2.456,2.168,4.2,0.13,4.4,0.1291,0.5051,2.9191,2018Albania
23,52.0,Albania,ALB,2017,Ranked,75.38,86.19,81.53,58.43,92.14,...,1.662,2.579,2.175,4.5,0.11,4.4,0.1391,0.4913,3.1004,2017Albania
24,54.0,Albania,ALB,2016,Ranked,74.69,85.39,80.36,58.31,92.15,...,1.662,2.579,2.175,4.7,0.12,4.4,0.1618,0.4777,3.2892,2016Albania


In [9]:
# Create target column to merge dataframes on year and country
suffix_hs_df["Country & Year"] = suffix_hs_df["Year_HS"].astype(str) + suffix_hs_df["Country or region_HS"]
suffix_hs_df.head()

Unnamed: 0,Year_HS,Overall rank_HS,Country or region_HS,Score_HS,GDP per capita_HS,Social support_HS,Healthy life expectancy_HS,Freedom to make life choices_HS,Generosity_HS,Perceptions of corruption_HS,Dystopia.Residual_HS,Country & Year
0,2019,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393,4.054,2019Finland
1,2019,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41,3.777,2019Denmark
2,2019,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341,3.729,2019Norway
3,2019,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118,3.781,2019Iceland
4,2019,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298,3.79,2019Netherlands


In [10]:
# Merge data to only include countries that have data in both Social Progress and Happiness Score dfs
merge_df = pd.merge(suffix_hs_df, suffix_nonan_sp_df, on="Country & Year")
merge_df

Unnamed: 0,Year_HS,Overall rank_HS,Country or region_HS,Score_HS,GDP per capita_HS,Social support_HS,Healthy life expectancy_HS,Freedom to make life choices_HS,Generosity_HS,Perceptions of corruption_HS,...,Satisfied demand for contraception (% of women)_SP,Equality of political power by socioeconomic position (0=unequal power; 4=equal power)_SP,Equality of political power by social group (0=unequal power; 4=equal power)_SP,Equality of political power by gender (0=unequal power; 4=equal power)_SP,Discrimination and violence against minorities (0=low; 10=high)_SP,Acceptance of gays and lesbians (0=low; 100=high)_SP,Quality weighted universities (points)_SP,Citable documents_SP,Women with advanced education (%)_SP,Years of tertiary schooling_SP
0,2019,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393,...,90.3,3.195,3.428,2.764,1.2,0.79,42.4,3.5186,0.8919,4.2892
1,2019,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41,...,87.6,3.544,3.75,3.318,4.3,0.88,29.8,4.5731,0.9135,4.1745
2,2019,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341,...,88.5,3.384,3.396,3.363,3.3,0.9,31.4,4.107,0.9434,4.0184
3,2019,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298,...,88.5,3.113,2.772,2.746,4.2,0.9,62.2,3.2448,0.9065,4.1465
4,2019,6,Switzerland,7.48,1.452,1.526,1.052,0.572,0.263,0.343,...,88.7,2.958,3.449,2.866,3.3,0.79,49.0,5.1351,0.9103,2.8077


In [11]:
# Export file as a CSV, without the Pandas index, but with the header
merge_df.to_csv("Data/merge.csv", index=False, header=True)

In [12]:
country_compare_df = merge_df[['Country or region_HS', 'Country_SP']]
country_compare_df.count()

Country or region_HS    371
Country_SP              371
dtype: int64

In [14]:
#We assume "Healthy Life Expectancy" is the greatest Happiness indicator that predicts Social Progress Score. 
#in order to do that we are goint to do correlation test, visualization test, 
#create an organized data frame of Helathy life expectancy, happiness score and social progress score
Healthylife_HS_SP_df = merge_df[["Healthy life expectancy_HS", "Score_HS", "Social Progress Index_SP"]]
Healthylife_HS_SP_df.head(20)

Unnamed: 0,Healthy life expectancy_HS,Score_HS,Social Progress Index_SP
0,0.986,7.769,91.94
1,0.996,7.6,92.08
2,1.028,7.554,93.08
3,0.999,7.488,91.16
4,1.052,7.48,91.52
5,1.009,7.343,91.32
6,1.026,7.307,91.62
7,1.039,7.278,91.26
8,1.016,7.246,89.38
9,1.036,7.228,91.25


In [None]:
# compare mean value for healthy_life_expectancy_HS
Mean_Healthylife_HS_SP = Healthylife_HS_SP_df["Healthy life expectancy_HS"].mean()
Mean_Healthylife_HS_SP



In [None]:
Mean_Score_HS_SP= Healthylife_HS_SP_df["Score_HS"].mean()
Mean_Score_HS_SP

In [15]:
Healthylife_HS_SP_df.describe()


Unnamed: 0,Healthy life expectancy_HS,Score_HS,Social Progress Index_SP
count,371.0,371.0,371.0
mean,0.70371,5.807809,75.776388
std,0.19308,1.059369,12.182156
min,0.0,3.231,47.41
25%,0.6115,5.123,68.71
50%,0.726,5.813,75.61
75%,0.836254,6.6425,86.585
max,1.062,7.769,93.08


In [17]:
X = Healthylife_HS_SP_df[['Healthy life expectancy_HS']]
y = Healthylife_HS_SP_df[['Social Progress Index_SP']]
regr = linear_model.LinearRegression()
regr.fit(X, y)


NameError: name 'linear_model' is not defined

In [None]:
den_file = "Data/population_density.csv"

# Read our Social Progress data into pandas
density_df = pd.read_csv(den_file, encoding = "ISO-8859-1")
density_df

In [None]:
country_df = density_df.loc[density_df['Type'] == 'Country/Area']
rename_country = country_df.rename(columns={'Region, subregion, country or area *':'Country'})

In [None]:
clean_country = rename_country[['Country','2015', '2016', '2017', '2018', '2019']]
clean_country['Country'] = clean_country['Country'].replace({'United States of America':'United States'})
clean_country['Country'] = clean_country['Country'].replace({'United Republic of Tanzania':'Tanzania'})
clean_country['Country'] = clean_country['Country'].replace({'Russian Federation':'Russia'})
density_2015 = clean_country[['Country', '2015']]
density_2016 = clean_country[['Country', '2016']]
density_2017 = clean_country[['Country', '2017']]
density_2018 = clean_country[['Country', '2018']]
density_2019 = clean_country[['Country', '2019']]
density_2019['Country']

In [None]:
density_2015 = density_2015.rename(columns={'2015':'Population Density'})
density_2016 = density_2016.rename(columns={'2016':'Population Density'})
density_2017 = density_2017.rename(columns={'2017':'Population Density'})
density_2018 = density_2018.rename(columns={'2018':'Population Density'})
density_2019 = density_2019.rename(columns={'2019':'Population Density'})
density_2019.loc[density_2019['Country'] == 'Russia']

In [None]:
density_2015['Country & Year'] = ('2015')+density_2015['Country']
density_2016['Country & Year'] = ('2016')+density_2016['Country']
density_2017['Country & Year'] = ('2017')+density_2017['Country']
density_2018['Country & Year'] = ('2018')+density_2018['Country']
density_2019['Country & Year'] = ('2019')+density_2019['Country']
density_2015.loc[(density_2015['Country'] == 'Finland') | 
                 (density_2015['Country'] == 'Denmark') |
                 (density_2015['Country'] == 'Norway')  |
                 (density_2015['Country'] == 'Iceland')]

In [None]:
v1 = density_2015.append(density_2016)

In [None]:
v2 = v1.append(density_2017)

In [None]:
v3 = v2.append(density_2018)

In [None]:
v4 = v3.append(density_2019)

In [None]:
v4

In [None]:
new_merge = merge_df
new_merge['Country & Year'] = new_merge["Year_HS"].astype(str) + new_merge["Country or region_HS"]

In [None]:
combined_merge = pd.merge(v4, new_merge, on="Country & Year", how='right')
combined_merge

In [None]:
pop_den = combined_merge['Population Density'].astype('float64')
hap_score = combined_merge['Score_HS'].astype('float64')
sp_index = combined_merge['Social Progress Index_SP'].astype('float64')

In [None]:
slope, intercept, rvalue, pvalue, stderr = linregress(pop_den, hap_score)
regress_values = pop_den * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(pop_den, hap_score, facecolors = 'lightblue', edgecolors = 'black', s = 40)
plt.plot(pop_den,regress_values,"r-")
plt.annotate(line_eq,(150,4.5),fontsize=15,color="red")
plt.title("Population Density Vs Happiness Score")
plt.xlabel("Population Density (persons per square km)")
plt.ylabel("Happiness Score")
print(f" The R Value is:{rvalue}")
plt.show()

In [None]:
slope, intercept, rvalue, pvalue, stderr = linregress(pop_den, sp_index)
regress_values = pop_den * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(pop_den, sp_index, facecolors = 'lightblue', edgecolors = 'black', s = 40)
plt.plot(pop_den,regress_values,"r-")
plt.annotate(line_eq,(300,60),fontsize=15,color="red")
plt.title("Population Density Vs Social Progress Score")
plt.xlabel("Population Density (persons per square km)")
plt.ylabel("Social Progress Score")
print(f" The R Value is:{rvalue}")
plt.show()