In [1]:
# Import required libraries
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression

In [2]:
# Create SQLite connection
happiness_path = Path('../Resources/HappinessIndexScore.sqlite')
engine = create_engine(f'sqlite:///{happiness_path}')

In [3]:
# Create df
df_alc_consumption = pd.read_sql('SELECT * FROM final_output', con=engine)
df_alc_consumption.head()

Unnamed: 0,country,region,ladder_score,logged_GPD_per_capita,social_support,healthy_life_expectancy,freedom_life_choices,generosity,perceptions_corruption,population_density,unemployment_rate,median_age,gini_coefficient,avg_temperature,lt_alcohol_per_capita
0,Finland,Europe,7.804,10.792,0.969,71.15,0.961,-0.019,0.182,16.6,7.16,43.2,27.7,3.24,8.23
1,Denmark,Europe,7.586,10.962,0.954,71.25,0.934,0.134,0.196,138.0,5.14,42.2,27.7,9.77,9.16
2,Iceland,Europe,7.53,10.896,0.983,72.05,0.936,0.211,0.668,3.5,3.56,37.8,26.1,2.11,7.72
3,Israel,Middle East,7.473,10.639,0.943,72.697,0.809,-0.023,0.708,412.24,3.39,30.1,38.6,20.23,3.07
4,Netherlands,Europe,7.403,10.942,0.93,71.55,0.887,0.213,0.379,420.38,3.56,42.2,29.2,11.72,8.23


In [4]:
# Create a scatter plot of ladder score versus the alcohol information
alc_plot = df_alc_consumption.hvplot.scatter(
    x="lt_alcohol_per_capita",
    y="ladder_score",
    title="Ladder Score vs. Alcohol"
)
alc_plot

In [5]:
# Reformat data of the independent variable X as a single-column array
X = df_alc_consumption["lt_alcohol_per_capita"].values.reshape(-1, 1)

# Display sample data
X[:5]

array([[8.23],
       [9.16],
       [7.72],
       [3.07],
       [8.23]])

In [6]:
# The shape of X is 30 samples, with a single feature (column)
X.shape

(123, 1)

In [7]:
# Create an array for the dependent variable y
y = df_alc_consumption["ladder_score"]

In [8]:
# Create a model with scikit-learn
model = LinearRegression()

In [9]:
# Fit the data into the model
model.fit(X, y)

In [10]:
# Display the slope
print(f"Model's slope: {model.coef_}")

Model's slope: [0.18805635]


In [11]:
# Display the y-intercept
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 4.601169265206735


In [12]:

print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}")

y_7 = model.intercept_ + model.coef_[0] 

# Display the prediction
print(f"15L of alcohol = happiness score of: {y_7:.2f}")


Model's formula: y = 4.601169265206735 + 0.18805635066058934
15L of alcohol = happiness score of: 4.79


In [13]:
# Make predictions using the X set
predicted_y_values = model.predict(X)

In [14]:
# Create a copy of the original data
df_alc_predicted = df_alc_consumption.copy()

# Add a column with the predicted ladder score values
df_alc_consumption["ladder_score_predicted"] = predicted_y_values

# Display sample data
df_alc_consumption.head()

Unnamed: 0,country,region,ladder_score,logged_GPD_per_capita,social_support,healthy_life_expectancy,freedom_life_choices,generosity,perceptions_corruption,population_density,unemployment_rate,median_age,gini_coefficient,avg_temperature,lt_alcohol_per_capita,ladder_score_predicted
0,Finland,Europe,7.804,10.792,0.969,71.15,0.961,-0.019,0.182,16.6,7.16,43.2,27.7,3.24,8.23,6.148873
1,Denmark,Europe,7.586,10.962,0.954,71.25,0.934,0.134,0.196,138.0,5.14,42.2,27.7,9.77,9.16,6.323765
2,Iceland,Europe,7.53,10.896,0.983,72.05,0.936,0.211,0.668,3.5,3.56,37.8,26.1,2.11,7.72,6.052964
3,Israel,Middle East,7.473,10.639,0.943,72.697,0.809,-0.023,0.708,412.24,3.39,30.1,38.6,20.23,3.07,5.178502
4,Netherlands,Europe,7.403,10.942,0.93,71.55,0.887,0.213,0.379,420.38,3.56,42.2,29.2,11.72,8.23,6.148873


In [15]:
# Create a line plot of alcohol versus the predicted ladder score
best_fit_line = df_alc_consumption.hvplot.line(
    x = "lt_alcohol_per_capita",
    y = "ladder_score_predicted",
    color = "red"
)
best_fit_line

In [16]:
# Superpose the original data and the best fit line
alc_plot * best_fit_line

In [17]:
from sklearn.metrics import mean_squared_error, r2_score

In [18]:
# Compute metrics for the linear regression model: score, r2, mse, rmse, std
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.36501717262851274.
The r2 is 0.36501717262851274.
The mean squared error is 0.8393127784632006.
The root mean squared error is 0.9161401521946304.
The standard deviation is 1.1496904595668735.
