In [None]:
import os
import csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress
import requests
import json

In [None]:
# Load the cleaned, merged CSV (path is relative to the notebook's working
# directory) and preview the first rows.
project_df_path = "Data/merge.csv"
project_df = pd.read_csv(filepath_or_buffer=project_df_path)
project_df.head(n=5)

In [None]:
#see dataframe column as a list
#project_df.columns.tolist()

In [None]:
# Distinct values of the "Country or region_HS" column, in order of first
# appearance (this shows the names themselves, not a count).
pd.unique(project_df["Country or region_HS"])

In [None]:
# Silence every Python warning for the remainder of the kernel session.
# NOTE(review): this is a blanket filter, so pandas/matplotlib deprecation and
# chained-assignment warnings raised by later cells are hidden as well.
import warnings
warnings.simplefilter(action="ignore")

In [None]:
# Ranking and score table used by the ranking and regression cells.
# Built as one chained expression that yields a brand-new frame: the previous
# version renamed and assigned columns in place on a slice of project_df, which
# is the chained-assignment pattern pandas warns about and is not safely
# re-runnable.
totals_df = (
    project_df[['Year_HS', 'Country_SP', 'Overall rank_HS', 'Score_HS', 'SPI Rank_SP', 'Social Progress Index_SP']]
    # Shorter, space-free names for the year, country and rank columns
    .rename(columns={
        "Year_HS": "Year",
        "Country_SP": "Country",
        "Overall rank_HS": "Rank_HS",
        "SPI Rank_SP": "Rank_SPI",
    })
    # Store the SPI rank as a whole number (no decimal part)
    .astype({"Rank_SPI": int})
)

In [None]:
#Top Rankings of Happiness Score
#ranking_HS = totals_df.sort_values(["Year","Rank_HS"])
#ranking_HS

In [None]:
#Top Rankings of Social Progress Index
#ranking_SP = totals_df.sort_values(["Year","Rank_SPI"])
#ranking_SP

In [None]:
# Keep only rows ranked 10 or better in BOTH indices.
# (Combine the masks with | instead of & to keep rows in EITHER top 10.)
in_top10_hs = totals_df["Rank_HS"].le(10)
in_top10_spi = totals_df["Rank_SPI"].le(10)
ranking_both_top10 = totals_df[in_top10_hs & in_top10_spi]
ranking_both_top10

In [None]:
# Top-10-in-both rows ordered by happiness rank (best first)
ranking_HS = ranking_both_top10.sort_values(by="Rank_HS", ascending=True)

In [None]:
# Top-10-in-both rows ordered by Social Progress rank (best first)
ranking_SPI = ranking_both_top10.sort_values(by="Rank_SPI", ascending=True)
#ranking_SPI

In [None]:
# Top-10-in-both rows ordered alphabetically by country
ranking_country = ranking_both_top10.sort_values(by="Country", ascending=True)
ranking_country

In [None]:
# Group by country and the pair of ranks, then count the rows in each group
ranking_top10 = ranking_both_top10.groupby(by=["Country", "Rank_HS", "Rank_SPI"])
ranking_top10.count()

In [None]:
# Number of rows for each (Country, Rank_HS, Rank_SPI) combination
spi_rank_groups = ranking_top10["Rank_SPI"]
count10 = spi_rank_groups.count()
count10

In [None]:
# Scatter-plot inputs across all years: happiness rank on x, SPI rank on y
x_values, y_values = ranking_both_top10["Rank_HS"], ranking_both_top10["Rank_SPI"]

In [None]:
# Plain scatter of happiness rank vs. Social Progress rank for the top-10 rows
axes = plt.gca()
axes.scatter(x_values, y_values, color="black", marker="o")

axes.set_title("Top 10 Social Progress and Happiness Rankings (2015-2019)")
axes.set_ylabel(" Social Progress Rank")
axes.set_xlabel("Happiness Rank")
axes.set_ylim(0, 11)
axes.set_xlim(0, 11)
axes.grid(True, linestyle="-", which="major", color="gray", alpha=0.25)

plt.show()


In [None]:
#plot_country = ranking_country["Country"].to_list()
#plot_HS_rank = ranking_country["Rank_HS"].to_list()
#plot_SPI_rank = ranking_country["Rank_SPI"].to_list()
#plot_HS_rank
#plot_SPI_rank
#plot_country

In [None]:
# Plain Python lists of the two ranks and the country name, taken from the
# country-sorted table so all three lists share the same row order
plot_HS_rank = ranking_country["Rank_HS"].tolist()
plot_SPI_rank = ranking_country["Rank_SPI"].tolist()
country = ranking_country["Country"].tolist()


In [None]:
# Marker-size input: add the two ranks for each row, then take the reciprocal so
# that better (smaller) combined ranks produce larger dots.
#https://www.geeksforgeeks.org/python-adding-two-list-elements/
display_dot = [
    float(hs_rank) + float(spi_rank)
    for hs_rank, spi_rank in zip(plot_HS_rank, plot_SPI_rank)
]

#Find reciprocal
#https://www.geeksforgeeks.org/numpy-reciprocal
display_dot_inv = np.reciprocal(display_dot)

In [None]:
#Bubble chart of the two rankings for countries that rank 1-10 in both.
#https://stackoverflow.com/questions/26139423/plot-different-color-for-different-categorical-levels-using-matplotlib
#https://python-graph-gallery.com/270-basic-bubble-plot/

df = pd.DataFrame(dict(plot_HS_rank=plot_HS_rank, plot_SPI_rank=plot_SPI_rank, country=country, display_dot_inv=display_dot_inv))

# The default figure size must be set BEFORE the figure is created; setting it
# after plt.subplots() has no effect on this figure.
plt.rcParams["figure.figsize"] = (7,7)
fig, ax = plt.subplots()

colors = {'Australia':'crimson', 'Canada':'darkviolet', 'Denmark':'green', 'Finland':'deepskyblue', 'Netherlands':'gold', 'New Zealand':'orange', 'Norway':'mediumblue', 'Sweden':'teal', 'Switzerland':'yellowgreen'}

# One bubble per row; size grows as the combined rank gets better (smaller).
scatter = ax.scatter(df['plot_HS_rank'], df['plot_SPI_rank'], s=df['display_dot_inv']*2500, c=[colors[name] for name in df['country']])

# scatter.legend_elements() only works when `c` holds numeric, colormapped
# values; with colour names it returns no handles and the legend is empty.
# Add one empty, labelled proxy scatter per country so the legend is populated.
for name in sorted(df['country'].unique()):
    ax.scatter([], [], s=80, color=colors[name], label=name)
legend1 = ax.legend(loc="lower right", title="Countries")

plt.ylim(-1,11)
plt.xlim(-1,11)
plt.title("Countries that Rank in the Top 10 in Both \n Social Progress and Happiness Rankings (2015-2019)", fontsize=15)
plt.ylabel(" Social Progress Rank", fontsize=12)
plt.xlabel("Happiness Rank", fontsize=12)
plt.grid(True, linestyle="-", which="major", color="gray", alpha=0.25)

#plt.savefig("Data/TopTenRanking.png")
plt.show()

In [None]:
# Alternate view of the top-10-in-both rows: one fixed-size marker per row,
# grouped and labelled by country.

# Figure defaults are set before anything is drawn so they apply to this figure.
plt.rcParams["figure.figsize"] = (7,7)
plt.rcParams["legend.markerscale"] = 0.4

groups = ranking_country.groupby("Country")
colors = {'Australia':'crimson', 'Canada':'darkviolet', 'Denmark':'green', 'Finland':'deepskyblue', 'Netherlands':'gold', 'New Zealand':'orange', 'Norway':'mediumblue', 'Sweden':'teal', 'Switzerland':'yellowgreen'}

for name, group in groups:
    # Use the country palette (it was defined but never applied before);
    # a country missing from the palette falls back to matplotlib's colour cycle.
    plt.plot(group["Rank_HS"], group["Rank_SPI"], marker="o", markersize=20, linestyle="", label=name, color=colors.get(name))

plt.ylim(-1,11)
plt.xlim(-1,11)
plt.title("Countries that Rank in the Top 10 in Both \n Social Progress and Happiness Rankings (2015-2019)", fontsize=15)
plt.ylabel(" Social Progress Rank", fontsize=12)
plt.xlabel("Happiness Rank", fontsize=12)
plt.grid(True, linestyle="-", which="major", color="gray", alpha=0.25)
plt.legend()

#plt.savefig("Data/TopTenRanking-alternate.png")
plt.show()

In [None]:
# Scatter plot and linear regression of Social Progress Index against
# Happiness Score, using every row of totals_df (all years).
x_values = totals_df["Score_HS"]
y_values = totals_df["Social Progress Index_SP"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = slope * x_values + intercept

# Write the intercept with an explicit sign. The previous label left out the
# "+", so a positive intercept was rendered like "y =4.5x 12.3".
intercept_sign = "+" if intercept >= 0 else "-"
line_eq = f"y = {round(slope, 2)}x {intercept_sign} {abs(round(intercept, 2))}"

plt.rcParams["figure.figsize"] = (7,7)

plt.title("Social Progress Index and Happiness Scores \n for All Countries (2015-2019)",  fontsize=15)
plt.ylabel(" Social Progress Index",  fontsize=12)
plt.xlabel("Happiness Score",  fontsize=12)
plt.ylim(20,100)
plt.xlim(0,10)
plt.grid(True, linestyle="-", which="major", color="gray", alpha=0.25)
plt.annotate(line_eq, (2,30), fontsize=12, color="red")

plt.scatter(x_values, y_values, color="black", marker="o")
plt.plot(x_values, regress_values, "r-")

#plt.savefig("Data/HP_SPI_scatter.png")

plt.show()

#print r squared
print(f"R squared: {rvalue**2}")

In [None]:
# Group the totals table by country and show the per-country mean of each column
grouped_totals_df = totals_df.groupby(by=["Country"])
grouped_totals_df.mean()

In [None]:
# Per-country means of the rank and score columns.
# Several columns must be selected from a GroupBy with a LIST; the bare tuple
# form used before was deprecated and is rejected by pandas 2.x. The result of
# .mean() is already a DataFrame, so no extra pd.DataFrame() wrapper is needed.
averages_df = grouped_totals_df[["Rank_HS", "Score_HS", "Rank_SPI", "Social Progress Index_SP"]].mean()

In [None]:
# Scatter plot and linear regression of the per-country averages:
# mean Social Progress Index against mean Happiness Score.
x_values = averages_df["Score_HS"]
y_values = averages_df["Social Progress Index_SP"]

plt.rcParams["figure.figsize"] = (7,7)

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = slope * x_values + intercept

# Write the intercept with an explicit sign. The previous label left out the
# "+", so a positive intercept was rendered like "y =4.5x 12.3".
intercept_sign = "+" if intercept >= 0 else "-"
line_eq = f"y = {round(slope, 2)}x {intercept_sign} {abs(round(intercept, 2))}"

plt.title("5-year Average Social Progress Index \n and Happiness Score for Each Country",  fontsize=15)
plt.ylabel(" Social Progress Index",  fontsize=12)
plt.xlabel("Happiness Score",  fontsize=12)
plt.ylim(20,100)
plt.xlim(0,10)
plt.grid(True, linestyle="-", which="major", color="gray", alpha=0.25)
plt.annotate(line_eq, (2,30), fontsize=12, color="red")

plt.scatter(x_values, y_values, color="black", marker="o")
plt.plot(x_values, regress_values, "r-")

#plt.savefig("Data/HP_SPI_5average_scatter.png")

plt.show()

#print r squared
print(f"R squared: {rvalue**2}")

# Analysis of Relationships between Social Progress Indicators and Happiness Score

### Basic Human Needs vs. Happiness Linear Regression

In [None]:
# Regress Happiness Score (y) on the Basic Human Needs score (x) and draw the
# scatter, fitted line and equation label.
x_values = project_df['Basic Human Needs_SP']
y_values = project_df['Score_HS']
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

axes = plt.gca()
axes.scatter(x_values, y_values, edgecolors="black")
axes.plot(x_values, regress_values, "r-")
axes.annotate(line_eq, (40, 6), fontsize=15, color="red")

# Titles and axis labels
axes.set_title('Basic Human Needs vs. Happiness Linear Regression')
axes.set_ylabel('Hapiness Score')
axes.set_xlabel('Basic Human Needs Score')
axes.grid()
plt.show()

In [None]:
# Describe, in words, the direction and strength of the correlation held in
# `rvalue` (set by the most recent linregress call).
score = f"The r value is: {rvalue}"

# Direction of the relationship
if rvalue == 0:
    relationship = "none existant"
elif rvalue > 0:
    relationship = "positive"
else:
    relationship = "negative"

# Strength of the relationship. Every magnitude now receives a label:
# previously 0 < |r| < 0.3 assigned nothing, so `strength` was undefined on a
# fresh kernel or silently reused a stale value from an earlier cell.
magnitude = abs(rvalue)
if magnitude >= .7:
    strength = "strong"
elif magnitude >= .5:
    strength = "moderate"
elif magnitude >= .3:
    strength = "weak"
elif magnitude > 0:
    strength = "very weak"
else:
    strength = ""

print(f"The r value is: {rvalue}.  This is a {strength} {relationship} relationship")

### Foundations of Wellbeing vs Happiness Linear Regression

In [None]:
# Regress Happiness Score (y) on the Foundations of Wellbeing score (x) and
# draw the scatter, fitted line and equation label.
x_values = project_df['Foundations of Wellbeing_SP']
y_values = project_df['Score_HS']
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

axes = plt.gca()
axes.scatter(x_values, y_values, edgecolors="black")
axes.plot(x_values, regress_values, "r-")
axes.annotate(line_eq, (70, 4), fontsize=15, color="red")

# Titles and axis labels
axes.set_title('Foundations of Wellbeing  vs. Happiness Linear Regression')
axes.set_ylabel('Hapiness Score')
axes.set_xlabel('Foundations of Wellbeing Score')
axes.grid()
plt.show()

In [None]:
# Describe, in words, the direction and strength of the correlation held in
# `rvalue` (set by the most recent linregress call).
score = f"The r value is: {rvalue}"

# Direction of the relationship
if rvalue == 0:
    relationship = "none existant"
elif rvalue > 0:
    relationship = "positive"
else:
    relationship = "negative"

# Strength of the relationship. Every magnitude now receives a label:
# previously 0 < |r| < 0.3 assigned nothing, so `strength` was undefined on a
# fresh kernel or silently reused a stale value from an earlier cell.
magnitude = abs(rvalue)
if magnitude >= .7:
    strength = "strong"
elif magnitude >= .5:
    strength = "moderate"
elif magnitude >= .3:
    strength = "weak"
elif magnitude > 0:
    strength = "very weak"
else:
    strength = ""

print(f"The r value is: {rvalue}.  This is a {strength} {relationship} relationship")

### Opportunity vs. Happiness Linear Regression

In [None]:
# Regress Happiness Score (y) on the Opportunity score (x) and draw the
# scatter, fitted line and equation label.
y_values = project_df['Score_HS']
x_values = project_df['Opportunity_SP']
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_values,y_values, edgecolors= "black")
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(60,3.5),fontsize=15,color="red")

# Labels
plt.title('Opportunity  vs. Happiness Linear Regression')
plt.ylabel('Hapiness Score')
# The x axis shows the Opportunity column; the label had been copy-pasted from
# the Foundations of Wellbeing plot.
plt.xlabel('Opportunity Score')
plt.grid()
plt.show()

In [None]:
# Describe, in words, the direction and strength of the correlation held in
# `rvalue` (set by the most recent linregress call).
score = f"The r value is: {rvalue}"

# Direction of the relationship
if rvalue == 0:
    relationship = "none existant"
elif rvalue > 0:
    relationship = "positive"
else:
    relationship = "negative"

# Strength of the relationship. Every magnitude now receives a label:
# previously 0 < |r| < 0.3 assigned nothing, so `strength` was undefined on a
# fresh kernel or silently reused a stale value from an earlier cell.
magnitude = abs(rvalue)
if magnitude >= .7:
    strength = "strong"
elif magnitude >= .5:
    strength = "moderate"
elif magnitude >= .3:
    strength = "weak"
elif magnitude > 0:
    strength = "very weak"
else:
    strength = ""

print(f"The r value is: {rvalue}.  This is a {strength} {relationship} relationship")

### Happiness Linear Regression Analysis

Each of the r-values found in this analysis showed a strong positive correlation of around .73-.74.  

This shows us that Basic Human Needs, Foundations of Wellbeing, and Opportunity (as defined by the Social Progress Index) all increase as happiness increases.  A further analysis to find more interesting correlations in the data was conducted below by looking at all Social Progress Indicators instead of just the high-level categories examined here.

# Finding Social Progress Indicator Sub-metrics with Strongest Correlation to Happiness Score

In [None]:
# Correlate each selected Social Progress column with the Happiness Score and
# collect the r values in a results table.

# Columns 22 through 83 (by position) are treated as the Social Progress indicators
submetric_df = project_df.iloc[:, 22:84]

# The response variable is the same for every regression
y_values = project_df['Score_HS']

sub_metric_list = []
rvalue_list = []
for column in submetric_df:
    x_values = project_df[column]
    (slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
    # Label = column name minus its last three characters
    # (presumably the "_SP" suffix; verify against the column names)
    sub_metric_list.append(column[:-3])
    rvalue_list.append(rvalue)

# Assemble the labels and r values into a two-column frame
submetric_results_dict = dict([
    ('Sub Metric', sub_metric_list),
    ('R Value', rvalue_list),
])
submetric_results_df = pd.DataFrame(data=submetric_results_dict)

### Positive Relationship Results

In [None]:
# Sub-metrics with a non-negative r value, sorted both ways:
# ascending puts the weakest first, descending puts the strongest first
has_positive_r = submetric_results_df['R Value'] >= 0
positive_results_df = submetric_results_df.loc[has_positive_r]
weak_pos_results_df = positive_results_df.sort_values(by='R Value', ascending=True)
strong_pos_results_df = positive_results_df.sort_values(by='R Value', ascending=False)

<b>Strongest Positive Indicators of Happiness</b>

In [None]:
# Five largest positive r values
strong_pos_results_df.head(n=5)

<b>Weakest Positive Indicators of Happiness</b>

In [None]:
# Five smallest non-negative r values
weak_pos_results_df.head(n=5)

### Negative Relationship Results

In [None]:
# Sub-metrics with a negative r value, sorted both ways:
# ascending puts the most negative (strongest) first, descending the weakest
has_negative_r = submetric_results_df['R Value'] < 0
negative_results_df = submetric_results_df.loc[has_negative_r]
strong_neg_results_df = negative_results_df.sort_values(by='R Value', ascending=True)
weak_neg_results_df = negative_results_df.sort_values(by='R Value', ascending=False)

<b>Strongest Negative Indicators of Happiness</b>

In [None]:
# Five most negative r values
strong_neg_results_df.head(n=5)

<b>Weakest Negative Indicators of Happiness</b>

In [None]:
# Five negative r values closest to zero
weak_neg_results_df.head(n=5)

### Analysis of Sub-metric Linear Regressions

The 3 metrics with the strongest positive correlation with happiness scores are:
- Life expectancy at 60
- Access to information and communications
- Overall Health and Wellness of the country

The 3 metrics with the strongest negative correlation with happiness scores are:
- Household air pollution attributable deaths
- Populations (%) using unsafe or unimproved sanitation
- Vulnerable employment (% of employees)

The 3 metrics with the weakest positive correlation with happiness scores are:
- Greenhouse gas emissions
- Biome protection
- Quality universities

The 3 metrics with the weakest negative correlation with happiness scores are:
- Homicide rate
- Perceived criminality
- Early marriage (% of women)

From this, our final step was to take the 3 strongest metrics and see how well they could be used to predict happiness score in a single variable linear regression.

In [None]:
# Print r squared (squared Pearson correlation) between Happiness Score and
# each of three selected indicator columns.
y_values = project_df['Score_HS']

# (column in project_df, wording used in the printed sentence)
for column_name, label in [
    ('Life expectancy at 60 (years)_SP',
     'Life Expectency and Happiness Score'),
    ('Access to Information and Communications_SP',
     'Access to Information and Communications and Happiness Score'),
    ('Household air pollution attributable deaths (deaths/100,000)_SP',
     'Household air pollution attributable deaths and Happiness Score'),
]:
    x_values = project_df[column_name]
    # Off-diagonal entry of the 2x2 correlation matrix is r between x and y
    correlation_matrix = np.corrcoef(x_values, y_values)
    correlation_xy = correlation_matrix[0,1]
    r_squared = correlation_xy**2
    print(f"The r_squared for {label} is {r_squared}")
    print("")

The above shows each of these strongly correlated values has an r-squared of above .57. 

This means that each of these variables, taken on its own in a single-variable linear regression model, explains at least 57% of the variance in the Happiness Score, with the remaining 43% (or less) attributable to other factors.

# Analysis of Relationships between Happiness Indicators and Social Progress Score

In [None]:
# Regress GDP per capita (y) on the Social Progress Index (x) and draw the
# scatter, fitted line and equation label.
x_values = project_df['Social Progress Index_SP']
y_values = project_df['GDP per capita_HS']
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

axes = plt.gca()
axes.scatter(x_values, y_values, edgecolors="black")
axes.plot(x_values, regress_values, "r-")
axes.annotate(line_eq, (60, .3), fontsize=15, color="red")

# Titles and axis labels
axes.set_title('GDP per Capita vs. Social Progress Index Score')
axes.set_xlabel('Social Progress Index Score')
axes.set_ylabel('GDP per Capita')
axes.grid()
plt.show()

In [None]:
# Describe, in words, the direction and strength of the correlation held in
# `rvalue` (set by the most recent linregress call).
score = f"The r value is: {rvalue}"

# Direction of the relationship
if rvalue == 0:
    relationship = "none existant"
elif rvalue > 0:
    relationship = "positive"
else:
    relationship = "negative"

# Strength of the relationship. Every magnitude now receives a label:
# previously 0 < |r| < 0.3 assigned nothing, so `strength` was undefined on a
# fresh kernel or silently reused a stale value from an earlier cell.
magnitude = abs(rvalue)
if magnitude >= .7:
    strength = "strong"
elif magnitude >= .5:
    strength = "moderate"
elif magnitude >= .3:
    strength = "weak"
elif magnitude > 0:
    strength = "very weak"
else:
    strength = ""

print(f"The r value is: {rvalue}.  This is a {strength} {relationship} relationship")

# Population Density vs Social Progress & Happiness Score

In [None]:
# Read the population density CSV into pandas (the file is decoded as
# ISO-8859-1) and display it
den_file = "Data/population_density.csv"
density_df = pd.read_csv(filepath_or_buffer=den_file, encoding="ISO-8859-1")
density_df

In [None]:
# Keep only the rows typed 'Country/Area', then give the long name column the
# short header 'Country'
is_country_row = density_df['Type'] == 'Country/Area'
country_df = density_df.loc[is_country_row]
rename_country = country_df.rename(
    columns={'Region, subregion, country or area *': 'Country'}
)

In [None]:
# Country name plus the 2015-2019 density columns. .copy() makes this an
# independent frame, so the column assignment below is not a write to a slice of
# rename_country (the chained-assignment pattern pandas warns about).
clean_country = rename_country[['Country','2015', '2016', '2017', '2018', '2019']].copy()

# Replace three long-form country names with shorter ones, in a single pass
clean_country['Country'] = clean_country['Country'].replace({
    'United States of America': 'United States',
    'United Republic of Tanzania': 'Tanzania',
    'Russian Federation': 'Russia',
})

# One two-column frame (country, density) per year
density_2015 = clean_country[['Country', '2015']]
density_2016 = clean_country[['Country', '2016']]
density_2017 = clean_country[['Country', '2017']]
density_2018 = clean_country[['Country', '2018']]
density_2019 = clean_country[['Country', '2019']]
density_2019['Country']

In [None]:
# Give every per-year frame the same value-column name, 'Population Density'
density_2015 = density_2015.rename({'2015': 'Population Density'}, axis="columns")
density_2016 = density_2016.rename({'2016': 'Population Density'}, axis="columns")
density_2017 = density_2017.rename({'2017': 'Population Density'}, axis="columns")
density_2018 = density_2018.rename({'2018': 'Population Density'}, axis="columns")
density_2019 = density_2019.rename({'2019': 'Population Density'}, axis="columns")

# Spot check: the 2019 row(s) for Russia
density_2019[density_2019['Country'] == 'Russia']

In [None]:
# Build the join key "<year><country>" on each per-year frame
density_2015['Country & Year'] = '2015' + density_2015['Country']
density_2016['Country & Year'] = '2016' + density_2016['Country']
density_2017['Country & Year'] = '2017' + density_2017['Country']
density_2018['Country & Year'] = '2018' + density_2018['Country']
density_2019['Country & Year'] = '2019' + density_2019['Country']

# Spot check: 2015 rows for four Nordic countries
nordic_names = ['Finland', 'Denmark', 'Norway', 'Iceland']
density_2015.loc[density_2015['Country'].isin(nordic_names)]

In [None]:
# Stack 2015 and 2016. DataFrame.append was removed in pandas 2.0;
# pd.concat gives the same row-wise result and keeps the original index labels.
v1 = pd.concat([density_2015, density_2016])

In [None]:
# Add the 2017 rows. DataFrame.append was removed in pandas 2.0;
# pd.concat gives the same row-wise result and keeps the original index labels.
v2 = pd.concat([v1, density_2017])

In [None]:
# Add the 2018 rows. DataFrame.append was removed in pandas 2.0;
# pd.concat gives the same row-wise result and keeps the original index labels.
v3 = pd.concat([v2, density_2018])

In [None]:
# Add the 2019 rows. DataFrame.append was removed in pandas 2.0;
# pd.concat gives the same row-wise result and keeps the original index labels.
v4 = pd.concat([v3, density_2019])

In [None]:
# Display the stacked density table (the 2015 through 2019 frames, one after another)
v4

In [None]:
# Work on a copy: `new_merge = project_df` only created a second name for the
# same object, so adding the key column silently modified project_df as well.
new_merge = project_df.copy()

# Join key "<year><country>", the same format as the key on the density frames
new_merge['Country & Year'] = new_merge["Year_HS"].astype(str) + new_merge["Country or region_HS"]

In [None]:
# Attach population density to the project data on the "<year><country>" key.
# A right join keeps every row of new_merge, matched or not.
combined_merge = v4.merge(new_merge, how='right', on="Country & Year")
combined_merge

In [None]:
# Float series for the two density regressions.
# The right join leaves NaN density for rows with no matching density record,
# and linregress returns NaN for everything if any input is NaN. Incomplete
# rows are dropped from all three columns together so the series stay aligned.
regression_columns = ['Population Density', 'Score_HS', 'Social Progress Index_SP']
regression_data = combined_merge[regression_columns].astype('float64').dropna()

pop_den = regression_data['Population Density']
hap_score = regression_data['Score_HS']
sp_index = regression_data['Social Progress Index_SP']

In [None]:
# Regress Happiness Score on population density; print r, then draw the
# scatter with the fitted line and its equation.
(slope, intercept, rvalue, pvalue, stderr) = linregress(pop_den, hap_score)
regress_values = pop_den * slope + intercept
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

axes = plt.gca()
axes.scatter(pop_den, hap_score, facecolors='lightblue', edgecolors='black', s=40)
axes.plot(pop_den, regress_values, "r-")
axes.annotate(line_eq, (150, 4.5), fontsize=15, color="red")
axes.set_title("Population Density Vs Happiness Score")
axes.set_xlabel("Population Density (persons per square km)")
axes.set_ylabel("Happiness Score")

print(f" The R Value is:{rvalue}")
plt.show()

In [None]:
# Regress the Social Progress Index on population density; print r, then draw
# the scatter with the fitted line and its equation.
(slope, intercept, rvalue, pvalue, stderr) = linregress(pop_den, sp_index)
regress_values = pop_den * slope + intercept
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

axes = plt.gca()
axes.scatter(pop_den, sp_index, facecolors='lightblue', edgecolors='black', s=40)
axes.plot(pop_den, regress_values, "r-")
axes.annotate(line_eq, (300, 60), fontsize=15, color="red")
axes.set_title("Population Density Vs Social Progress Score")
axes.set_xlabel("Population Density (persons per square km)")
axes.set_ylabel("Social Progress Score")

print(f" The R Value is:{rvalue}")
plt.show()