In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the advertising dataset, using the first CSV column as the row index.
df = pd.read_csv(
    '../data/Advertising.csv',
    index_col=0,
)
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [2]:
# Each row is one sample; every column except the Sales target is a feature.
number_of_samples = len(df)  # 200 samples
number_of_features = len(df.columns) - 1  # 3 features (-1: Sales is the label/target)
print("Number of samples: ", number_of_samples)
print("Number of features: ", number_of_features)
number_of_samples, number_of_features


Number of samples:  200
Number of features:  3


(200, 3)

In [3]:
# Separate the target column (Sales) from the feature columns.
y = df["Sales"]
X = df.drop(columns="Sales")
X.head()

Unnamed: 0,TV,Radio,Newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4


In [4]:
y.head() # first rows of the target/label column (Sales)

1    22.1
2    10.4
3     9.3
4    18.5
5    12.9
Name: Sales, dtype: float64

## Sklearn - typical workflow/steps

1. train/test split, sometimes train/validation/test split
2. scaling? sometimes required
    - min-max scaling
    - standardization
    - ...
    - scale the training data
    - scale the test data using the parameters fitted on the training data --> to avoid data leakage
3. fit algorithm to training data - model training
4. predict test data - (model testing)
5. evaluate

## Train/test split

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((140, 3), (60, 3), (140,), (60,))

## Feature scaling

Normalization: min-max feature scaling
- $X' = \frac{X - X_{min}}{X_{max} - X_{min}}$

Feature standardization
- $X' = \frac{X - \mu}{\sigma}$

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Instantiate the scaler and learn each feature's min/max from the training data.
scaler = MinMaxScaler()
scaler.fit(X_train) # important - fit on the training data only, so no test-set information leaks in

scaled_X_train = scaler.transform(X_train) # normalise the training data
scaled_X_test = scaler.transform(X_test) # normalise the test data with the training min/max

print(f'{scaled_X_train.min()=}')
print(f'{scaled_X_train.max()=}')
print(f'{scaled_X_test.min()=}')
print(f'{scaled_X_test.max()=}')
# note: scaled_X_test.min() != 0 and scaled_X_test.max() != 1, because the test
# data is scaled with the min/max that were learned from the training data:
# 0 <= scaled_X_train <= 1
# 0.005964214711729622 <= scaled_X_test <= 1.1302186878727631

scaled_X_train.min()=0.0
scaled_X_train.max()=1.0
scaled_X_test.min()=0.005964214711729622
scaled_X_test.max()=1.1302186878727631


In [12]:
# Sanity check: the scaled arrays should keep the shapes of X_train and X_test.
scaled_X_train.shape, scaled_X_test.shape

((140, 3), (60, 3))

## Linear regression
### Ordinary least squares (OLS)

In [17]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares regression, fitted on the scaled training data.
# fit() returns the estimator itself, so construction and fitting can be chained.
model_OLS = LinearRegression().fit(scaled_X_train, y_train)
print(f'Parameters: {model_OLS.coef_}')  # beta_1, beta_2, beta_3
print(f'Intercept: {model_OLS.intercept_}')  # beta_0

Parameters: [13.02832938  9.88465985  0.69237469]
Intercept: 2.7418553248528124


### Stochastic gradient descent (SGD)

In [23]:
from sklearn.linear_model import SGDRegressor

# Linear regression fitted with stochastic gradient descent on the squared error.
# SGD shuffles the training data, so without a seed the coefficients (and every
# metric computed from this model) change from run to run. random_state pins the
# shuffling so that a fresh "Restart & Run All" reproduces the same numbers.
model_SGD = SGDRegressor(loss="squared_error", max_iter=10000, random_state=42)
model_SGD.fit(scaled_X_train, y_train) # fit the model to the training data
print(f'Parameters: {model_SGD.coef_}') # beta_1, beta_2, beta_3
print(f'Intercept: {model_SGD.intercept_}') # beta_0 (a one-element array for SGDRegressor)

Parameters: [11.97959578  9.01257134  1.35155857]
Intercept: [3.56240906]


## Manual prediction

In [27]:
# Pick the first test sample. Slicing with [:1] keeps the 2D (1, n_features)
# shape that predict() is given in the following cells.
test_sample_features = scaled_X_test[:1]
test_sample_label = y_test.iloc[0]
# -> (scaled TV, Radio, Newspaper), Sales
test_sample_features, test_sample_label

(array([[0.54988164, 0.63709677, 0.52286282]]), 16.9)

In [28]:
model_OLS.predict(test_sample_features)[0]  # OLS prediction for the single test sample; [0] unwraps the one-element result

16.56539629743484

In [29]:
model_SGD.predict(test_sample_features)[0]  # SGD prediction for the same test sample; [0] unwraps the one-element result

16.598328654249208

## Model evaluation

In [35]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predict the whole (scaled) test set with both fitted models, OLS first, then SGD.
y_pred_OLS, y_pred_SGD = (
    fitted_model.predict(scaled_X_test) for fitted_model in (model_OLS, model_SGD)
)
y_pred_OLS[:5]

(array([16.5653963 , 21.18822792, 21.55107058, 10.88923816, 22.20231988]),
 array([16.59832865, 20.82423687, 21.11882437, 11.31619581, 21.40150057]))

In [36]:
y_pred_SGD[:5]  # first five SGD predictions on the test set

array([16.59832865, 20.82423687, 21.11882437, 11.31619581, 21.40150057])

In [38]:
y_test[:5].values  # true Sales values of the first five test samples, to compare with the predictions

array([16.9, 22.4, 21.4,  7.3, 24.7])

In [41]:
# Evaluate both models on the test set.
# MAE: mean absolute error, in the same units as Sales.
mae_OLS = mean_absolute_error(y_test, y_pred_OLS)
mae_SGD = mean_absolute_error(y_test, y_pred_SGD)

# MSE: mean squared error, penalises large errors more heavily.
mse_OLS = mean_squared_error(y_test, y_pred_OLS)
mse_SGD = mean_squared_error(y_test, y_pred_SGD)

# RMSE: square root of the MSE, back in the same units as Sales.
rmse_OLS = np.sqrt(mse_OLS)
rmse_SGD = np.sqrt(mse_SGD)

# The "=" specifier prints the variable name together with its value.
print(f'{mae_OLS=:.4f} \t\t {mse_OLS=:.4f} \t {rmse_OLS=:.4f}')
print(f'{mae_SGD=:.4f} \t\t {mse_SGD=:.4f} \t {rmse_SGD=:.4f}')

# Lower is better for all three metrics. Whichever model has the smaller errors
# performs better on this particular test split; small differences between the two
# models should not be over-interpreted.

mae_OLS=1.5117 		 mse_OlS=3.7968 	 rmse_OLS=1.9485
mae_SGD=1.5222 		 mse_SGD=4.0867 	 rmse_SGD=2.0216
