In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression

## Data Loading and Visualization

In [2]:
# Read the collated monthly dataset (layoffs, funds raised, rates and stock columns)
file_path = Path("Resources/Output/collated_data.csv")
df = pd.read_csv(file_path)

# Display sample data
# NOTE(review): the "Unnamed: 0" column in the output suggests the CSV was written
# with its index; consider reading with index_col=0 — confirm before changing.
df.head()

Unnamed: 0,period,laid_off_by_month,funds_raised_by_month,mortgage_rate,fed_interest_rate,unemployment_rate,stock_open,stock_high,stock_low,stock_close,stock_adj_close,stock_volume
0,2020/03,7850.0,15530.2,3.45,0.65,4.4,2974.28,3136.72,2191.86,2584.59,2584.59,162185400000.0
1,2020/04,19821.0,43862.0,3.306,0.05,14.8,2498.08,2954.86,2447.49,2912.43,2912.43,123608200000.0
2,2020/05,14674.0,74191.0,3.2325,0.05,13.2,2869.09,3068.67,2766.64,3044.31,3044.31,107135200000.0
3,2020/06,3926.0,11724.1,3.1625,0.08,11.0,3038.78,3233.13,2965.66,3100.29,3100.29,131458900000.0
4,2020/07,1612.0,4447.0,3.016,0.09,10.2,3105.92,3279.99,3101.17,3271.12,3271.12,96928130000.0


In [3]:
# Scatter the monthly stock low (y axis) against the number of layoffs in that
# month (x axis). The variable keeps the name `salary_plot` because a later cell
# overlays it with the best-fit line.
salary_plot = df.hvplot.scatter(
    x="laid_off_by_month",
    y="stock_low",
    xlabel="Layoffs per month",
    ylabel="Stock low",
    title="Stock Low vs. Monthly Layoffs",
)
salary_plot

## Data Preparation

In [4]:
# Select the feature as a one-column frame so the resulting NumPy array is
# already 2-D with shape (n_samples, 1), the layout passed to the model as X
X = df[["laid_off_by_month"]].to_numpy()

# Peek at the first five rows
X[:5]

array([[ 7850.],
       [19821.],
       [14674.],
       [ 3926.],
       [ 1612.]])

In [5]:
# Shape of X is (n_samples, 1): 52 samples for this dataset, with a single feature (column)
X.shape

(52, 1)

In [6]:
# Dependent variable y: the "stock_low" column, kept as a pandas Series
y = df["stock_low"]

## Building the Linear Regression Model

In [7]:
# Create a scikit-learn LinearRegression model with its default settings
model = LinearRegression()

In [8]:
# Fit the model: regress stock_low (y) on laid_off_by_month (X)
model.fit(X, y)

In [9]:
# Display the slope; coef_ is an array, so it prints in brackets with one entry
# for the single feature
print(f"Model's slope: {model.coef_}")

Model's slope: [-0.00180613]


In [10]:
# Display the y-intercept: the fitted stock_low value when laid_off_by_month is 0
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 4007.6439900599667


In [11]:
# Display the model's best fit line formula as y = intercept + slope * X
# (a negative slope is rendered as "+ -...")
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = 4007.6439900599667 + -0.0018061258125378705X


In [12]:
# Number of layoffs in a month to plug into the fitted line.
# NOTE(review): 7 is far below the layoff counts in the sample rows
# (roughly 1,600 to 19,800), so this prediction is an extrapolation —
# confirm the intended input value.
layoffs_example = 7

# Display the formula used for this prediction
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * {layoffs_example}")

# Predict stock_low for that layoff count
# (the name y_7 is kept so any later reference to it still works)
y_7 = model.intercept_ + model.coef_[0] * layoffs_example

# Display the prediction; the model's target is stock_low, not a salary
print(f"Predicted stock low for a month with {layoffs_example} layoffs: {y_7:.2f}")

Model's formula: y = 4007.6439900599667 + -0.0018061258125378705 * 7
Predicted salary for a person with 7 years of experience: $4007.63


In [13]:
# Predict stock_low for every row of X (in-sample predictions on the data used for fitting)
predicted_y_values = model.predict(X)

In [14]:
# Build a new frame with the model output appended as an extra column;
# `df` itself is left unmodified.
# NOTE(review): the column is named "layoffs_predicted" but it holds the model's
# predicted stock_low values. The name is kept because the line-plot cell reads it.
df_predicted = df.assign(layoffs_predicted=predicted_y_values)

# Preview the first rows
df_predicted.head()

Unnamed: 0,period,laid_off_by_month,funds_raised_by_month,mortgage_rate,fed_interest_rate,unemployment_rate,stock_open,stock_high,stock_low,stock_close,stock_adj_close,stock_volume,layoffs_predicted
0,2020/03,7850.0,15530.2,3.45,0.65,4.4,2974.28,3136.72,2191.86,2584.59,2584.59,162185400000.0,3993.465902
1,2020/04,19821.0,43862.0,3.306,0.05,14.8,2498.08,2954.86,2447.49,2912.43,2912.43,123608200000.0,3971.84477
2,2020/05,14674.0,74191.0,3.2325,0.05,13.2,2869.09,3068.67,2766.64,3044.31,3044.31,107135200000.0,3981.1409
3,2020/06,3926.0,11724.1,3.1625,0.08,11.0,3038.78,3233.13,2965.66,3100.29,3100.29,131458900000.0,4000.55314
4,2020/07,1612.0,4447.0,3.016,0.09,10.2,3105.92,3279.99,3101.17,3271.12,3271.12,96928130000.0,4004.732515


In [15]:
# Draw the model's predictions against laid_off_by_month as a red line
# (the "layoffs_predicted" column holds the values returned by model.predict)
best_fit_line = df_predicted.hvplot.line(
    x="laid_off_by_month",
    y="layoffs_predicted",
    color="red",
)
best_fit_line

In [16]:
# Superpose the scatter of the original data and the best fit line in one figure
salary_plot * best_fit_line

## Linear Regression Model Assessment

In [17]:
# Import relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [18]:
# Evaluate the fit on the same data it was trained on, reporting each metric
# right after it is computed.

# model.score and r2_score print the same value in the saved output
score = model.score(X, y)
print(f"The score is {score}.")

r2 = r2_score(y, predicted_y_values)
print(f"The r2 is {r2}.")

# Error of the predictions, squared and in the units of y
mse = mean_squared_error(y, predicted_y_values)
print(f"The mean squared error is {mse}.")

rmse = np.sqrt(mse)
print(f"The root mean squared error is {rmse}.")

# Spread of y itself, for comparison with the RMSE: in the saved output the two
# are nearly equal, consistent with the r2 of about 0.001
std = np.std(y)
print(f"The standard deviation is {std}.")

The score is 0.0010838489804152474.
The r2 is 0.0010838489804152474.
The mean squared error is 447101.95523865835.
The root mean squared error is 668.6568292021389.
The standard deviation is 669.0194855385944.
