In [None]:
#Import Libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Locations of the two input CSV files, relative to the working directory
happiness_path = "Resources/world-happiness-report-2019 kaggle.csv"
pop_density_path = "Resources/world population density data 2019.csv"

# Read both files into DataFrames (happiness first, then population density)
happiness_df, pop_density_df = map(pd.read_csv, (happiness_path, pop_density_path))

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" to the output.
happiness_df.info()
print()  # blank line separating the two summaries
pop_density_df.info()

In [None]:
# Preview the first five rows of the happiness data
happiness_df.head(n=5)

In [None]:
# Preview the first five rows of the population-density data
pop_density_df.head(n=5)

In [None]:
# Clean the happiness data: use "Country" as the key column name, drop every
# row that has at least one missing value, then order by Ladder with a fresh index.
happiness_df = (
    happiness_df
    .rename(columns={"Country (region)": "Country"})
    .dropna(how="any")
    .sort_values(by='Ladder')
    .reset_index(drop=True)
)

# Clean the population data the same way (key column renamed, incomplete rows removed).
pop_density_df = (
    pop_density_df
    .rename(columns={"name": "Country"})
    .dropna(how="any")
)

# Join the two datasets on "Country" and order the result by Ladder.
merged_df = (
    happiness_df
    .merge(pop_density_df, on="Country")
    .sort_values(by='Ladder')
    .reset_index(drop=True)
)
merged_df.head()

In [None]:
# Print the column names, non-null counts and dtypes of the merged frame
merged_df.info()

In [None]:
# Keep only the columns used for the density / population analysis
density_columns = ['Country', 'Ladder', 'density', 'pop2019', 'area']
density_df = merged_df.reindex(columns=density_columns)
density_df.head()

In [None]:
# Largest density value within each (Country, pop2019) group
country_pop_groups = merged_df.groupby(by=['Country', 'pop2019'])
max_df = country_pop_groups['density'].agg('max')
max_df.head()


In [None]:
# Largest Ladder value within each (Country, density) group, shown as a one-column frame
country_density_groups = merged_df.groupby(by=['Country', 'density'])
happiness_score_df = country_density_groups['Ladder'].agg('max')
happiness_chart_df = happiness_score_df.to_frame()
happiness_chart_df.head()

In [None]:
# R^2 (coefficient of determination) of a straight-line fit of density on Ladder
xlist = merged_df['Ladder'].tolist()
ylist = merged_df['density'].tolist()

# Least-squares line through the (Ladder, density) points
slope, intercept, r_value, p_value, std_err = stats.linregress(xlist, ylist)

# Value of the fitted line at every observed Ladder
ypred = [slope * x + intercept for x in xlist]

r2_score(ylist, ypred)

In [None]:
# Fit a least-squares line predicting density from Ladder.
# Both inputs are passed as single-column arrays of shape (n, 1).
lm = LinearRegression()
ladder_column = merged_df[['Ladder']].to_numpy()
target_column = merged_df[['density']].to_numpy()
model_fit = lm.fit(ladder_column, target_column)

f'slope: {model_fit.coef_},intercept: {model_fit.intercept_}'

In [None]:
#Fitted line: model prediction at every observed Ladder value.
# ravel() flattens the prediction array to 1-D so it can be plotted directly.
preds = model_fit.predict(merged_df['Ladder'].values.reshape(-1,1)).ravel()

plt.scatter(merged_df['Ladder'], merged_df['density'])
plt.grid()
plt.title("Density vs Ladder")
plt.ylabel("Density")
plt.xlabel("Ladder")
# Pass Ladder explicitly as x: plt.plot(preds) on its own draws the line against
# the positional index 0..n-1, which does not line up with the scattered points.
plt.plot(merged_df['Ladder'], preds, c='r')
plt.show()

In [None]:
# Plain scatter of density against Ladder (no regression line)
plt.scatter(x=merged_df["Ladder"], y=merged_df["density"], edgecolors="black")

plt.title("Density vs Ladder")
plt.xlabel("Ladder")
plt.ylabel("density")
plt.grid()
plt.show()

In [None]:
# R^2 (coefficient of determination) of a straight-line fit of pop2019 on Ladder
xlist = merged_df['Ladder'].tolist()
ylist = merged_df['pop2019'].tolist()

# Least-squares line through the (Ladder, pop2019) points
slope, intercept, r_value, p_value, std_err = stats.linregress(xlist, ylist)

# Value of the fitted line at every observed Ladder
ypred = [slope * x + intercept for x in xlist]

r2_score(ylist, ypred)

In [None]:
# Fit a least-squares line predicting pop2019 from Ladder.
# Both inputs are passed as single-column arrays of shape (n, 1).
lm = LinearRegression()
ladder_column = merged_df[['Ladder']].to_numpy()
target_column = merged_df[['pop2019']].to_numpy()
model_fit = lm.fit(ladder_column, target_column)

f'slope: {model_fit.coef_},intercept: {model_fit.intercept_}'

In [None]:
#Fitted line: model prediction at every observed Ladder value.
# ravel() flattens the prediction array to 1-D so it can be plotted directly.
preds = model_fit.predict(merged_df['Ladder'].values.reshape(-1,1)).ravel()

plt.scatter(merged_df['Ladder'], merged_df['pop2019'])
plt.grid()
plt.title("Population 2019 vs Ladder")
plt.ylabel("Population 2019")
plt.xlabel("Ladder")
# Pass Ladder explicitly as x: plt.plot(preds) on its own draws the line against
# the positional index 0..n-1, which does not line up with the scattered points.
plt.plot(merged_df['Ladder'], preds, c='r')
plt.show()

In [None]:
# Plain scatter of pop2019 against Ladder (no regression line)
plt.scatter(x=merged_df["Ladder"], y=merged_df["pop2019"], edgecolors="black")

plt.title("Population 2019 vs Ladder")
plt.xlabel("Ladder")
plt.ylabel("pop2019")
plt.grid()
plt.show()

In [None]:
# R^2 (coefficient of determination) of a straight-line fit of Freedom on Ladder
xlist = happiness_df['Ladder'].tolist()
ylist = happiness_df['Freedom'].tolist()

# Least-squares line through the (Ladder, Freedom) points
slope, intercept, r_value, p_value, std_err = stats.linregress(xlist, ylist)

# Value of the fitted line at every observed Ladder
ypred = [slope * x + intercept for x in xlist]

r2_score(ylist, ypred)

In [None]:
# Fit a least-squares line predicting Freedom from Ladder.
# Both inputs are passed as single-column arrays of shape (n, 1).
lm = LinearRegression()
ladder_column = happiness_df[['Ladder']].to_numpy()
target_column = happiness_df[['Freedom']].to_numpy()
model_fit = lm.fit(ladder_column, target_column)

f'slope: {model_fit.coef_},intercept: {model_fit.intercept_}'

In [None]:
#Fitted line: model prediction at every observed Ladder value.
# ravel() flattens the prediction array to 1-D so it can be plotted directly.
preds = model_fit.predict(happiness_df['Ladder'].values.reshape(-1,1)).ravel()

plt.scatter(happiness_df['Ladder'], happiness_df['Freedom'])
plt.grid()
plt.title("Freedom vs Ladder")
plt.ylabel("Freedom")
plt.xlabel("Ladder")
# Pass Ladder explicitly as x: plt.plot(preds) on its own draws the line against
# the positional index 0..n-1, which does not line up with the scattered points.
plt.plot(happiness_df['Ladder'], preds, c='r')
plt.show()

In [None]:
# Plain scatter of Freedom against Ladder (no regression line)
plt.scatter(x=happiness_df["Ladder"], y=happiness_df["Freedom"], edgecolors="black")

plt.title("Freedom vs Ladder")
plt.xlabel("Ladder")
plt.ylabel("Freedom")
plt.grid()
plt.show()

In [None]:
# R^2 (coefficient of determination) of a straight-line fit of Social support on Ladder
xlist = happiness_df['Ladder'].tolist()
ylist = happiness_df['Social support'].tolist()

# Least-squares line through the (Ladder, Social support) points
slope, intercept, r_value, p_value, std_err = stats.linregress(xlist, ylist)

# Value of the fitted line at every observed Ladder
ypred = [slope * x + intercept for x in xlist]

r2_score(ylist, ypred)

In [None]:
# Fit a least-squares line predicting Social support from Ladder.
# Both inputs are passed as single-column arrays of shape (n, 1).
lm = LinearRegression()
ladder_column = happiness_df[['Ladder']].to_numpy()
target_column = happiness_df[['Social support']].to_numpy()
model_fit = lm.fit(ladder_column, target_column)

f'slope: {model_fit.coef_},intercept: {model_fit.intercept_}'

In [None]:
#Fitted line: model prediction at every observed Ladder value.
# ravel() flattens the prediction array to 1-D so it can be plotted directly.
preds = model_fit.predict(happiness_df['Ladder'].values.reshape(-1,1)).ravel()

plt.scatter(happiness_df['Ladder'], happiness_df['Social support'])
plt.grid()
plt.title("Social Support vs Ladder")
plt.ylabel("Social Support")
plt.xlabel("Ladder")
# Pass Ladder explicitly as x: plt.plot(preds) on its own draws the line against
# the positional index 0..n-1, which does not line up with the scattered points.
plt.plot(happiness_df['Ladder'], preds, c='r')
plt.show()

In [None]:
# Plain scatter of Social support against Ladder (no regression line)
plt.scatter(x=happiness_df["Ladder"], y=happiness_df["Social support"], edgecolors="black")

plt.title("Social support vs Ladder")
plt.xlabel("Ladder")
plt.ylabel("Social support")
plt.grid()
plt.show()

In [None]:
# R^2 (coefficient of determination) of a straight-line fit of healthy life expectancy on Ladder
xlist = happiness_df['Ladder'].tolist()
ylist = happiness_df['Healthy life\nexpectancy'].tolist()

# Least-squares line through the (Ladder, healthy life expectancy) points
slope, intercept, r_value, p_value, std_err = stats.linregress(xlist, ylist)

# Value of the fitted line at every observed Ladder
ypred = [slope * x + intercept for x in xlist]

r2_score(ylist, ypred)

In [None]:
# Fit a least-squares line predicting healthy life expectancy from Ladder.
# Both inputs are passed as single-column arrays of shape (n, 1).
lm = LinearRegression()
ladder_column = happiness_df[['Ladder']].to_numpy()
target_column = happiness_df[['Healthy life\nexpectancy']].to_numpy()
model_fit = lm.fit(ladder_column, target_column)

f'slope: {model_fit.coef_},intercept: {model_fit.intercept_}'

In [None]:
#Fitted line: model prediction at every observed Ladder value.
# ravel() flattens the prediction array to 1-D so it can be plotted directly.
preds = model_fit.predict(happiness_df['Ladder'].values.reshape(-1,1)).ravel()

plt.scatter(happiness_df['Ladder'], happiness_df['Healthy life\nexpectancy'])
plt.grid()
plt.title("Healthy Life Expectancy vs Ladder")
plt.ylabel("Healthy Life Expectancy")
plt.xlabel("Ladder")
# Pass Ladder explicitly as x: plt.plot(preds) on its own draws the line against
# the positional index 0..n-1, which does not line up with the scattered points.
plt.plot(happiness_df['Ladder'], preds, c='r')
plt.show()

In [None]:
# Plain scatter of healthy life expectancy against Ladder (no regression line).
# The title and y-label name the plotted column ("Healthy life\nexpectancy") in full;
# the previous "Health Expectancy" wording did not match the column being shown.
plt.scatter(happiness_df["Ladder"], happiness_df["Healthy life\nexpectancy"], edgecolors="black")
plt.grid()

plt.title("Healthy Life Expectancy vs Ladder")
plt.ylabel("Healthy Life Expectancy")
plt.xlabel("Ladder")
plt.show()

In [None]:
# R^2 (coefficient of determination) of a straight-line fit of Corruption on Ladder
xlist = happiness_df['Ladder'].tolist()
ylist = happiness_df['Corruption'].tolist()

# Least-squares line through the (Ladder, Corruption) points
slope, intercept, r_value, p_value, std_err = stats.linregress(xlist, ylist)

# Value of the fitted line at every observed Ladder
ypred = [slope * x + intercept for x in xlist]

r2_score(ylist, ypred)

In [None]:
# Fit a least-squares line predicting Corruption from Ladder.
# Both inputs are passed as single-column arrays of shape (n, 1).
lm = LinearRegression()
ladder_column = happiness_df[['Ladder']].to_numpy()
target_column = happiness_df[['Corruption']].to_numpy()
model_fit = lm.fit(ladder_column, target_column)

f'slope: {model_fit.coef_},intercept: {model_fit.intercept_}'

In [None]:
#Fitted line: model prediction at every observed Ladder value.
# ravel() flattens the prediction array to 1-D so it can be plotted directly.
preds = model_fit.predict(happiness_df['Ladder'].values.reshape(-1,1)).ravel()

plt.scatter(happiness_df['Ladder'], happiness_df['Corruption'])
plt.grid()
plt.title("Corruption vs Ladder")
plt.ylabel("Corruption")
plt.xlabel("Ladder")
# Pass Ladder explicitly as x: plt.plot(preds) on its own draws the line against
# the positional index 0..n-1, which does not line up with the scattered points.
plt.plot(happiness_df['Ladder'], preds, c='r')
plt.show()

In [None]:
# Plain scatter of Corruption against Ladder (no regression line)
plt.scatter(x=happiness_df["Ladder"], y=happiness_df["Corruption"], edgecolors="black")

plt.title("Corruption vs Ladder")
plt.xlabel("Ladder")
plt.ylabel("Corruption")
plt.grid()
plt.show()

In [None]:
# R^2 (coefficient of determination) of a straight-line fit of log GDP per capita on Ladder
xlist = happiness_df['Ladder'].tolist()
ylist = happiness_df['Log of GDP\nper capita'].tolist()

# Least-squares line through the (Ladder, log GDP per capita) points
slope, intercept, r_value, p_value, std_err = stats.linregress(xlist, ylist)

# Value of the fitted line at every observed Ladder
ypred = [slope * x + intercept for x in xlist]

r2_score(ylist, ypred)

In [None]:
# Fit a least-squares line predicting log GDP per capita from Ladder.
# Both inputs are passed as single-column arrays of shape (n, 1).
lm = LinearRegression()
ladder_column = happiness_df[['Ladder']].to_numpy()
target_column = happiness_df[['Log of GDP\nper capita']].to_numpy()
model_fit = lm.fit(ladder_column, target_column)

f'slope: {model_fit.coef_},intercept: {model_fit.intercept_}'


In [None]:
#Fitted line: model prediction at every observed Ladder value.
# ravel() flattens the prediction array to 1-D so it can be plotted directly.
preds = model_fit.predict(happiness_df['Ladder'].values.reshape(-1,1)).ravel()

plt.scatter(happiness_df['Ladder'], happiness_df['Log of GDP\nper capita'])
plt.grid()
# Single-line title; the raw column name contains a newline that would break it in two.
plt.title("Log of GDP per capita vs Ladder")
plt.ylabel("Log of GDP\nper capita")
plt.xlabel("Ladder")
# Pass Ladder explicitly as x: plt.plot(preds) on its own draws the line against
# the positional index 0..n-1, which does not line up with the scattered points.
plt.plot(happiness_df['Ladder'], preds, c='r')
plt.show()


In [None]:
# Plain scatter of log GDP per capita against Ladder (no regression line)
plt.scatter(x=happiness_df["Ladder"], y=happiness_df["Log of GDP\nper capita"], edgecolors="black")

plt.title("Log of GDP per capita vs Ladder")
plt.xlabel("Ladder")
plt.ylabel("Log of GDP\nper capita")
plt.grid()
plt.show()

In [None]:
# R^2 (coefficient of determination) of a straight-line fit of Generosity on Ladder
xlist = happiness_df['Ladder'].tolist()
ylist = happiness_df['Generosity'].tolist()

# Least-squares line through the (Ladder, Generosity) points
slope, intercept, r_value, p_value, std_err = stats.linregress(xlist, ylist)

# Value of the fitted line at every observed Ladder
ypred = [slope * x + intercept for x in xlist]

r2_score(ylist, ypred)

In [None]:
# Fit a least-squares line predicting Generosity from Ladder.
# Both inputs are passed as single-column arrays of shape (n, 1).
lm = LinearRegression()
ladder_column = happiness_df[['Ladder']].to_numpy()
target_column = happiness_df[['Generosity']].to_numpy()
model_fit = lm.fit(ladder_column, target_column)

f'slope: {model_fit.coef_},intercept: {model_fit.intercept_}'

In [None]:
#Fitted line: model prediction at every observed Ladder value.
# ravel() flattens the prediction array to 1-D so it can be plotted directly.
preds = model_fit.predict(happiness_df['Ladder'].values.reshape(-1,1)).ravel()

plt.scatter(happiness_df['Ladder'], happiness_df['Generosity'])
plt.grid()
plt.title("Generosity vs Ladder")
plt.ylabel("Generosity")
plt.xlabel("Ladder")
# Pass Ladder explicitly as x: plt.plot(preds) on its own draws the line against
# the positional index 0..n-1, which does not line up with the scattered points.
plt.plot(happiness_df['Ladder'], preds, c='r')
plt.show()

In [None]:
# Plain scatter of Generosity against Ladder (no regression line)
plt.scatter(x=happiness_df["Ladder"], y=happiness_df["Generosity"], edgecolors="black")

plt.title("Generosity vs Ladder")
plt.xlabel("Ladder")
plt.ylabel("Generosity")
plt.grid()
plt.show()