In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw data set from the project's Data directory and print a
# summary of its columns, non-null counts and dtypes.
LASSOData = pd.read_csv(filepath_or_buffer='Data/LASSO-data-set.csv')
LASSOData.info()


In [None]:
# Separate the dependent variable ('score') from the predictor columns.
X = LASSOData.drop(columns='score')
# DataFrame.info() prints its summary itself and returns None, so it must
# not be wrapped in print() (that would emit a stray "None" line).
X.info()

# The target is the single 'score' column.
y = LASSOData['score']
print(y.head())


In [None]:
# Split predictors and target into training and testing sets.
from sklearn.model_selection import train_test_split

# A quarter of the rows is held out for testing; the fixed random_state
# makes the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.25,
)


Fit a LASSO cross-validation model

In [None]:
# Fit a cross-validated LASSO regression (default settings) on the
# training split.
from sklearn.linear_model import LassoCV

reg = LassoCV()
reg.fit(X_train, y_train)


In [None]:
# Predict the held-out scores with the fitted LASSO model.
predictions_test_lasso = reg.predict(X=X_test)

# Residuals are defined here as (predicted - actual), so a positive
# residual means the model over-predicted the score.
residuals_test_lasso = predictions_test_lasso - y_test


In [None]:
import matplotlib.pyplot as plt

# Predicted vs. actual scores for the LASSO model on the test set.
sns.set_style('whitegrid')
fig, ax = plt.subplots()
ax.scatter(x=predictions_test_lasso, y=y_test)

# Perfect-prediction reference line (y = x), drawn in DATA coordinates.
# A line drawn with transform=ax.transAxes only joins the corners of the
# axes box, which is not y = x whenever the x and y limits differ.
lower = min(predictions_test_lasso.min(), y_test.min())
upper = max(predictions_test_lasso.max(), y_test.max())
ax.plot([lower, upper], [lower, upper], color='red')

ax.set_title('LASSO: Predicted vs. Actual Score (Test Set)')
ax.set_xlabel('Predicted Values for Score')
ax.set_ylabel('Actual Values for Score')
plt.show()


In [None]:
# Display the LASSO residuals (predicted - actual) versus the actual values.
sns.set_style('whitegrid')
fig, ax = plt.subplots()
ax.scatter(x=y_test, y=residuals_test_lasso)

# Zero-error reference line. axhline spans the full width of the axes
# whatever the data range is, unlike hlines with a hard-coded 0..800
# extent, which also forces the x-axis to stretch over that interval.
ax.axhline(0, color='red')

ax.set_title('LASSO: Residuals vs. Actual Score (Test Set)')
ax.set_xlabel('Actual Values for Score')
ax.set_ylabel('Residuals')
plt.show()


In [None]:
# Evaluate the LASSO model on the held-out test set.
from sklearn.metrics import median_absolute_error

# Note: "MAE" here is the *median* absolute error, as the printed label says.
MAE_lasso = median_absolute_error(y_true=y_test, y_pred=predictions_test_lasso)
score_lasso = reg.score(X_test, y_test)

print(f"R2 = {round(score_lasso,4)}")
print(f"Median Abs Error = {round(MAE_lasso,4)}")


In [None]:
# Report the fitted LASSO equation and how many coefficients are exactly zero.
coeffs_lasso = reg.coef_
columns = X_train.columns
print(len(coeffs_lasso))

# One " + coefficient*column" term per predictor, in column order.
terms = [
    f" + {round(coeff,4)}*{name}"
    for name, coeff in zip(columns, coeffs_lasso)
]
string = f"y = {round(reg.intercept_,4)}" + "".join(terms)
print(string)

# The zero count uses the unrounded coefficients.
print(f"There are {(coeffs_lasso == 0).sum()} coefficients that equal zero")


Fit an ordinary linear regression model for comparison

In [None]:
# Fit an ordinary linear regression on the same training split.
from sklearn.linear_model import LinearRegression

reglr = LinearRegression()
reglr.fit(X_train, y_train)

# Test-set predictions and residuals, defined as (predicted - actual).
predictions_test_lr = reglr.predict(X=X_test)
residuals_test_lr = predictions_test_lr - y_test


In [None]:
import matplotlib.pyplot as plt

# Predicted vs. actual scores for the linear regression model on the test set.
sns.set_style('whitegrid')
fig, ax = plt.subplots()
ax.scatter(x=predictions_test_lr, y=y_test)

# Perfect-prediction reference line (y = x), drawn in DATA coordinates.
# A line drawn with transform=ax.transAxes only joins the corners of the
# axes box, which is not y = x whenever the x and y limits differ.
lower = min(predictions_test_lr.min(), y_test.min())
upper = max(predictions_test_lr.max(), y_test.max())
ax.plot([lower, upper], [lower, upper], color='red')

ax.set_title('Linear Regression: Predicted vs. Actual Score (Test Set)')
ax.set_xlabel('Predicted Values for Score')
ax.set_ylabel('Actual Values for Score')
plt.show()


In [None]:
# Display the linear regression residuals (predicted - actual) versus the
# actual values.
sns.set_style('whitegrid')
fig, ax = plt.subplots()
ax.scatter(x=y_test, y=residuals_test_lr)

# Zero-error reference line. axhline spans the full width of the axes
# whatever the data range is, unlike hlines with a hard-coded 0..800
# extent, which also forces the x-axis to stretch over that interval.
ax.axhline(0, color='red')

ax.set_title('Linear Regression: Residuals vs. Actual Score (Test Set)')
ax.set_xlabel('Actual Values for Score')
ax.set_ylabel('Residuals')
plt.show()


In [None]:
# Evaluate the linear regression model on the held-out test set.
from sklearn.metrics import median_absolute_error

# Note: "MAE" here is the *median* absolute error, as the printed label says.
MAE_lr = median_absolute_error(y_true=y_test, y_pred=predictions_test_lr)
score_lr = reglr.score(X_test, y_test)

print(f"R2 = {round(score_lr,4)}")
print(f"Median Abs Error = {round(MAE_lr,4)}")


In [None]:
# Report the fitted linear regression equation and how many coefficients
# are exactly zero.
coeffs_lr = reglr.coef_
columns = X_train.columns
print(len(coeffs_lr))

# One " + coefficient*column" term per predictor, in column order.
string = f"y = {round(reglr.intercept_,4)}" + "".join(
    f" + {round(coeff,4)}*{name}" for name, coeff in zip(columns, coeffs_lr)
)
print(string)

# The zero count uses the unrounded coefficients.
print(f"There are {(coeffs_lr == 0).sum()} coefficients that equal zero")


The two models were remarkably similar on most of the metrics. This is probably because the ordinary linear regression model already fits the data so closely. The main difference is that the LASSO CV model set 8 of the 16 coefficients to zero, while the linear regression model used all 16.