In [18]:
# Import required libraries
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression

In [19]:
# Create SQLite connection
happiness_path = Path('../Resources/HappinessIndexScore.sqlite')
engine = create_engine(f'sqlite:///{happiness_path}')

In [20]:
# Create df
df_healthy_life = pd.read_sql('SELECT * FROM final_output', con=engine)
df_healthy_life.head()

Unnamed: 0,country,region,ladder_score,logged_GPD_per_capita,social_support,healthy_life_expectancy,freedom_life_choices,generosity,perceptions_corruption,population_density,unemployment_rate,median_age,gini_coefficient,avg_temperature,lt_alcohol_per_capita
0,Finland,Europe,7.804,10.792,0.969,71.15,0.961,-0.019,0.182,16.6,7.16,43.2,27.7,3.24,8.23
1,Denmark,Europe,7.586,10.962,0.954,71.25,0.934,0.134,0.196,138.0,5.14,42.2,27.7,9.77,9.16
2,Iceland,Europe,7.53,10.896,0.983,72.05,0.936,0.211,0.668,3.5,3.56,37.8,26.1,2.11,7.72
3,Israel,Middle East,7.473,10.639,0.943,72.697,0.809,-0.023,0.708,412.24,3.39,30.1,38.6,20.23,3.07
4,Netherlands,Europe,7.403,10.942,0.93,71.55,0.887,0.213,0.379,420.38,3.56,42.2,29.2,11.72,8.23


In [21]:
# Create a scatter plot of health versus the ladder score information
healthy_plot = df_healthy_life.hvplot.scatter(
    x="healthy_life_expectancy",
    y="ladder_score",
    title="Ladder_score vs. health expectancy"
)
healthy_plot

In [22]:
# Reformat data of the independent variable X as a single-column array
X = df_healthy_life["healthy_life_expectancy"].values.reshape(-1, 1)

# Display sample data
X[:5]

array([[71.15 ],
       [71.25 ],
       [72.05 ],
       [72.697],
       [71.55 ]])

In [23]:
# The shape of X is 30 samples, with a single feature (column)
X.shape

(123, 1)

In [24]:
# Create an array for the dependent variable y
y = df_healthy_life["ladder_score"]

In [25]:
# Create a model with scikit-learn
model = LinearRegression()

In [26]:
# Fit the data into the model
model.fit(X, y)

In [27]:
# Display the slope
print(f"Model's slope: {model.coef_}")

Model's slope: [0.15798996]


In [28]:
# Display the model's best fit line formula
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = -4.6983416747446975 + 0.1579899559184014X


In [29]:

print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}")

y_7 = model.intercept_ + model.coef_[0] 

# Display the prediction
print(f"{y_7:.2f}")

Model's formula: y = -4.6983416747446975 + 0.1579899559184014
-4.54


In [30]:
# Make predictions using the X set
predicted_y_values = model.predict(X)

In [31]:
# Create a copy of the original data
df_healthy_life_predicted = df_healthy_life.copy()

# Add a column with the predicted ladder score values
df_healthy_life_predicted["ladder_score_predicted"] = predicted_y_values

# Display sample data
df_healthy_life_predicted.head()

Unnamed: 0,country,region,ladder_score,logged_GPD_per_capita,social_support,healthy_life_expectancy,freedom_life_choices,generosity,perceptions_corruption,population_density,unemployment_rate,median_age,gini_coefficient,avg_temperature,lt_alcohol_per_capita,ladder_score_predicted
0,Finland,Europe,7.804,10.792,0.969,71.15,0.961,-0.019,0.182,16.6,7.16,43.2,27.7,3.24,8.23,6.542644
1,Denmark,Europe,7.586,10.962,0.954,71.25,0.934,0.134,0.196,138.0,5.14,42.2,27.7,9.77,9.16,6.558443
2,Iceland,Europe,7.53,10.896,0.983,72.05,0.936,0.211,0.668,3.5,3.56,37.8,26.1,2.11,7.72,6.684835
3,Israel,Middle East,7.473,10.639,0.943,72.697,0.809,-0.023,0.708,412.24,3.39,30.1,38.6,20.23,3.07,6.787054
4,Netherlands,Europe,7.403,10.942,0.93,71.55,0.887,0.213,0.379,420.38,3.56,42.2,29.2,11.72,8.23,6.60584


In [32]:
# Create a line plot of health versus the predicted ladder score
best_fit_line = df_healthy_life_predicted.hvplot.line(
    x = "healthy_life_expectancy",
    y = "ladder_score_predicted",
    color = "red"
)
best_fit_line

In [33]:
healthy_plot * best_fit_line

In [34]:
# Import relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [35]:
# Compute metrics for the linear regression model: score, r2, mse, rmse, std
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.595724187609998.
The r2 is 0.595724187609998.
The mean squared error is 0.5343669792884174.
The root mean squared error is 0.7310040897891183.
The standard deviation is 1.1496904595668735.
