In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression

In [2]:
#Read salary data
file_path = Path("../Resources/final_output.csv")
df_gini = pd.read_csv(file_path)

df_gini.head()

Unnamed: 0,country,region,ladder_score,logged_GPD_per_capita,social_support,healthy_life_expectancy,freedom_life_choices,generosity,perceptions_corruption,population_density,unemployment_rate,median_age,gini_coefficient,avg_temperature,lt_alcohol_per_capita
0,Finland,Europe,7.804,10.792,0.969,71.15,0.961,-0.019,0.182,16.6,7.16,43.2,27.7,3.24,8.23
1,Denmark,Europe,7.586,10.962,0.954,71.25,0.934,0.134,0.196,138.0,5.14,42.2,27.7,9.77,9.16
2,Iceland,Europe,7.53,10.896,0.983,72.05,0.936,0.211,0.668,3.5,3.56,37.8,26.1,2.11,7.72
3,Israel,Middle East,7.473,10.639,0.943,72.697,0.809,-0.023,0.708,412.24,3.39,30.1,38.6,20.23,3.07
4,Netherlands,Europe,7.403,10.942,0.93,71.55,0.887,0.213,0.379,420.38,3.56,42.2,29.2,11.72,8.23


In [3]:
# Create a scatter plot of years_experience versus the salary information
gini_plot = df_gini.hvplot.scatter(
    x="gini_coefficient",
    y="ladder_score",
    title="Ladder score by gini coefficient"
)
gini_plot

In [4]:
# Reformat data of the independent variable X as a single-column array
X = df_gini["gini_coefficient"].values.reshape(-1, 1)

# Display sample data
X[:5]

array([[27.7],
       [27.7],
       [26.1],
       [38.6],
       [29.2]])

In [5]:
# The shape of X is 30 samples, with a single feature (column)
X.shape

(123, 1)

In [6]:
# Create an array for the dependent variable y
y = df_gini["ladder_score"]

In [7]:
# Create a model with scikit-learn
model = LinearRegression()

In [8]:
# Fit the data into the model
model.fit(X, y)

In [9]:
# Display the slope
print(f"Model's slope: {model.coef_}")

Model's slope: [-0.03871186]


In [10]:
# Display the model's best fit line formula
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = 7.003773194003292 + -0.03871186317687641X


In [11]:
# Display the formula to predict the salary for a person with 7 years of experience
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}")

# Predict the salary for a person with 7 years of experience
y_7 = model.intercept_ + model.coef_[0] 

# Display the prediction
print(f"{y_7:.2f}")

Model's formula: y = 7.003773194003292 + -0.03871186317687641
6.97


In [12]:
# Make predictions using the X set
predicted_y_values = model.predict(X)

In [13]:
# Create a copy of the original data
df_ladder_predicted = df_gini.copy()

# Add a column with the predicted salary values
df_ladder_predicted["ladder_score_predicted"] = predicted_y_values

# Display sample data
df_ladder_predicted.head()

Unnamed: 0,country,region,ladder_score,logged_GPD_per_capita,social_support,healthy_life_expectancy,freedom_life_choices,generosity,perceptions_corruption,population_density,unemployment_rate,median_age,gini_coefficient,avg_temperature,lt_alcohol_per_capita,ladder_score_predicted
0,Finland,Europe,7.804,10.792,0.969,71.15,0.961,-0.019,0.182,16.6,7.16,43.2,27.7,3.24,8.23,5.931455
1,Denmark,Europe,7.586,10.962,0.954,71.25,0.934,0.134,0.196,138.0,5.14,42.2,27.7,9.77,9.16,5.931455
2,Iceland,Europe,7.53,10.896,0.983,72.05,0.936,0.211,0.668,3.5,3.56,37.8,26.1,2.11,7.72,5.993394
3,Israel,Middle East,7.473,10.639,0.943,72.697,0.809,-0.023,0.708,412.24,3.39,30.1,38.6,20.23,3.07,5.509495
4,Netherlands,Europe,7.403,10.942,0.93,71.55,0.887,0.213,0.379,420.38,3.56,42.2,29.2,11.72,8.23,5.873387


In [14]:
# Create a line plot of years_experience versus the predicted salary values
best_fit_line = df_ladder_predicted.hvplot.line(
    x = "gini_coefficient",
    y = "ladder_score_predicted",
    color = "red"
)
best_fit_line

In [15]:
gini_plot * best_fit_line

In [16]:
# Import relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

# Compute metrics for the linear regression model: score, r2, mse, rmse, std
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.06659139870513531.
The r2 is 0.06659139870513531.
The mean squared error is 1.2337684309309889.
The root mean squared error is 1.110751291212839.
The standard deviation is 1.1496904595668735.
