In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import kagglehub
from kagglehub import KaggleDatasetAdapter

import scipy as sp


from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# These under here are used to deploy the model. So one can take these now and add new numbers as with "campaign" and get the prediction
from joblib import dump, load

In [None]:
# Load the advertising CSV from the Kaggle dataset into `df`.
# NOTE(review): newer kagglehub releases appear to expose this call as
# `dataset_load` and warn on `load_dataset` — confirm against the installed version.
file_path = "advertising.csv"

df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,    # adapter selecting the pandas loader
    "ashydv/advertising-dataset",   # Kaggle dataset handle
    file_path,                      # file to read from that dataset
)

# Preview the first five rows.
df.head(5)

In [None]:
# Label: the single column we want to predict (lower-case y because it is one column).
y = df["Sales"]

# Features: every remaining column once the label has been removed.
X = df.drop(columns="Sales")




In [None]:
# Degree-2 feature expansion; include_bias=False leaves out the constant column.
polynomial_converter = PolynomialFeatures(degree=2, include_bias=False)

In [None]:
# Fit the converter on X and produce the expanded polynomial feature matrix.
poly_features = polynomial_converter.fit_transform(X)


In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    poly_features,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Ordinary least-squares regressor with default settings.
model = LinearRegression()

In [None]:
# Train on the training portion of the polynomial features.
model.fit(X_train, y_train)

In [None]:
# Predictions for the held-out rows, used by the metric and residual cells below.
test_predictions = model.predict(X_test)

In [None]:
# Mean absolute error on the test set.
MAE = mean_absolute_error(y_true=y_test, y_pred=test_predictions)

In [None]:

# Mean squared error on the test set.
MSE = mean_squared_error(y_true=y_test, y_pred=test_predictions)

In [None]:
# Root mean squared error: square root of the MSE computed above.
RMSE = np.sqrt(MSE)

In [None]:
# Show the three test-set metrics, in the order MAE, MSE, RMSE.
print(MAE, MSE, RMSE)

In [None]:
# Residual = actual minus predicted, for every held-out row.
test_residuals = y_test - test_predictions

# Residuals against the true values, with a dashed red zero line for reference.
residual_ax = sns.scatterplot(x=y_test, y=test_residuals)
residual_ax.axhline(y=0, color="red", ls="--")

In [None]:
# Histogram of the test residuals (25 bins) with a KDE curve overlaid.
sns.displot(test_residuals, bins=25, kde=True)

In [None]:
# Dedicated figure/axes for the probability plot of the residuals.
fig, ax = plt.subplots(figsize=(6, 8), dpi=100)

# probplot also returns the underlying values; only the drawing on `ax`
# is wanted here, so the return value is parked in `_`.
_ = sp.stats.probplot(test_residuals, plot=ax)



In [None]:
# Sweep polynomial degrees 1 through 9 and record the train and test RMSE of a
# linear regression fitted at each degree.
#
# Every intermediate object in the loop carries a `sweep_` prefix on purpose:
# the earlier degree-2 evaluation cells own the names `polynomial_converter`,
# `poly_features`, `X_train`, `X_test`, `y_train`, `y_test` and `model`.
# Rebinding them here would make any re-run of those metric/residual cells
# silently use the last (degree-9) model and this sweep's different split.

# Training RMSE per degree (index 0 corresponds to degree 1).
train_rmse_errors = []
# Test RMSE per degree (index 0 corresponds to degree 1).
test_rmse_errors = []

for d in range(1, 10):

    # Build the polynomial feature set for degree `d`.
    sweep_converter = PolynomialFeatures(degree=d, include_bias=False)
    sweep_features = sweep_converter.fit_transform(X)

    # Split the expanded data; the fixed seed gives every degree the same rows.
    sweep_X_train, sweep_X_test, sweep_y_train, sweep_y_test = train_test_split(
        sweep_features, y, test_size=0.3, random_state=101
    )

    # Fit a fresh regression on this degree's training data.
    sweep_model = LinearRegression(fit_intercept=True)
    sweep_model.fit(sweep_X_train, sweep_y_train)

    # Predict on both partitions so the two error curves can be compared.
    train_pred = sweep_model.predict(sweep_X_train)
    test_pred = sweep_model.predict(sweep_X_test)

    # RMSE on the training set.
    train_RMSE = np.sqrt(mean_squared_error(sweep_y_train, train_pred))

    # RMSE on the test set.
    test_RMSE = np.sqrt(mean_squared_error(sweep_y_test, test_pred))

    # Keep both errors for plotting later.
    train_rmse_errors.append(train_RMSE)
    test_rmse_errors.append(test_RMSE)

In [None]:
# Train vs. test RMSE for degrees 1-5 only; the sweep's higher degrees are
# left out of the chart (presumably to keep the scale readable — confirm).
plotted_degrees = range(1, 6)
for curve_label, rmse_values in (('TRAIN', train_rmse_errors), ('TEST', test_rmse_errors)):
    plt.plot(plotted_degrees, rmse_values[:5], label=curve_label)

plt.xlabel("Polynomial Complexity")
plt.ylabel("RMSE")
plt.legend()

In [None]:
# Degree chosen from the RMSE chart above: degree 4 would also have been
# defensible, but the lower degree is the safer choice on model complexity,
# so the final converter is fixed at degree 3.
final_poly_converter = PolynomialFeatures(degree=3, include_bias=False)

In [None]:
# Fresh regressor for the final model, kept separate from the evaluation model.
final_model = LinearRegression()

In [None]:
# The final model is trained on the complete data set (no hold-out):
# expand all of X with the degree-3 converter...
full_converted_X = final_poly_converter.fit_transform(X)

# ...then fit the regression on every row.
final_model.fit(full_converted_X, y)

In [None]:
# Persist the fitted regression so predictions can be made later without retraining.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; exist_ok=True keeps the cell safe to re-run.
Path('../models').mkdir(parents=True, exist_ok=True)
dump(final_model, '../models/sales_poly_model.joblib')

In [None]:
# Persist the fitted polynomial converter next to the model: new inputs must go
# through the same transformation before they reach the regression.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; exist_ok=True keeps the cell safe to re-run.
Path('../models').mkdir(parents=True, exist_ok=True)
dump(final_poly_converter, '../models/poly_converter.joblib')

In [None]:
# Read both persisted artifacts back from disk, as a deployment would.
# joblib files are pickle-based: only load files from a trusted source.
loaded_model = load('../models/sales_poly_model.joblib')
loaded_poly = load('../models/poly_converter.joblib')

In [None]:
# One new campaign to score: a single row of three raw feature values.
# NOTE(review): values are presumably in the same column order as X — confirm.
campaign = [
    [149, 22, 12],
]

In [None]:
# Expand the raw campaign row with the converter restored from disk.
campaign_poly = loaded_poly.transform(campaign)

# Predict with the model restored from disk rather than the in-memory
# `final_model`: this is what actually exercises the saved artifact, and it
# keeps the cell working on a fresh kernel that only loads the joblib files.
loaded_model.predict(campaign_poly)