# [Sci kit learn](https://github.com/kokchun/Maskininlarning-AI21/blob/main/Lectures/L2-scikit-learn.ipynb)


In [2]:
import pandas
import seaborn
import matplotlib.pyplot as pyplot
import numpy


In [3]:
advertising_raw = pandas.read_csv("../data/Advertising.csv", index_col=0)

print(f"{advertising_raw.shape[0]} samples")
print(
    f"{advertising_raw.shape[1]-1} features"
)  # subtract one as sales is the label and not a feature

advertising_raw.head()


200 samples
3 features


Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [4]:
X, y = advertising_raw.drop("sales", axis="columns"), advertising_raw["sales"]
X.head(2), y.head(2)


(      TV  radio  newspaper
 1  230.1   37.8       69.2
 2   44.5   39.3       45.1,
 1    22.1
 2    10.4
 Name: sales, dtype: float64)

## Scikit-learn steps

1. train|test split - some cases train|validation|test - split
2. Scale the dataset
   - many algorithms require scaling, some don't
   - which type of scaling method to use?
   - scale training data using training data, scale test data using training data, to avoid data leakage
3. Fit the algorithm to the training data
4. Manual test prediction
5. Evaluation metrics on test data


### 1. Train|test split


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((140, 3), (60, 3), (140,), (60,))

### 2. Feature scaling


In [9]:
print(f"{X_train.min()} ≤ X_train ≤ {X_train.max()}")
print(f"{X_test.min()} ≤ X_test ≤ {X_test.max()}")


TV           0.7
radio        0.0
newspaper    0.3
dtype: float64 ≤ X_train ≤ TV           296.4
radio         49.6
newspaper    100.9
dtype: float64
TV           5.4
radio        0.8
newspaper    0.9
dtype: float64 ≤ X_test ≤ TV           292.9
radio         49.4
newspaper    114.0
dtype: float64


In [14]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_train)  # use the training data to fit the scaler

scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

print(
    f"scaled_X_train min value: {scaled_X_train.min():.2f}\nscaled_X_train max value: {scaled_X_train.max():.2f}"
)
print(
    f"scaled_X_test min value: {scaled_X_test.min():.2f}\nscaled_X_test max value: {scaled_X_test.max():.2f}"
)  # natural that it isn't [0,1] since we fit to training data
# we do not scale our target variable y in this lecture


scaled_X_train min value: 0.00
scaled_X_train max value: 1.00
scaled_X_test min value: 0.01
scaled_X_test max value: 1.13


### 3. Linear regression algorithms


In [18]:
from sklearn.linear_model import LinearRegression

# SVD = Singular Value Decomposition that is used for calculating pseudo inverse in OLS normal equation
model_SVD = LinearRegression()
model_SVD.fit(scaled_X_train, y_train)

# Weights and intercept
print(f"Parameter weights: {model_SVD.coef_}")
print(f"Intercept: {model_SVD.intercept_}")


Parameter weights: [13.02832938  9.88465985  0.69237469]
Intercept: 2.7418553248528124


In [26]:
from sklearn.linear_model import SGDRegressor

model_SGD = SGDRegressor(loss="squared_error", learning_rate="invscaling", max_iter=100000)
model_SGD.fit(scaled_X_train, y_train) # Note that SGD requires features to me scaled

print(f"Parameter weights: {model_SGD.coef_}")
print(f"Intercept: {model_SGD.intercept_}")

Parameter weights: [11.97478553  9.01044656  1.3349425 ]
Intercept: [3.56795757]


### 4. Manual test prediction

In [27]:
scaled_X_test[0]

array([0.54988164, 0.63709677, 0.52286282])

In [28]:
X_test.iloc[0]

TV           163.3
radio         31.6
newspaper     52.9
Name: 96, dtype: float64

In [33]:
scaled_X_test[0].shape, scaled_X_test[0].reshape(1,-1).shape

((3,), (1, 3))

In [31]:
test_sample_features = scaled_X_test[0].reshape(1,-1)
test_sample_target = y_test.values[0]

# model_XXX.predict() uses weights and intercept from earlier model_XXX.fit()
model_SGD.predict(test_sample_features)[0], model_SVD.predict(test_sample_features)[0], test_sample_target 

(16.591190477964474, 16.56539629743484, 16.9)

5. Evaluation metrics on test data

In [37]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

y_pred_SVD = model_SVD.predict(scaled_X_test)
y_pred_SGD = model_SGD.predict(scaled_X_test)

mae_SVD = mean_absolute_error(y_test, y_pred_SVD)
mae_SGD = mean_absolute_error(y_test, y_pred_SGD)

mse_SVD = mean_squared_error(y_test, y_pred_SVD)
mse_SGD = mean_squared_error(y_test, y_pred_SGD)

rmse_SVD = numpy.sqrt(mse_SVD)
rmse_SGD = numpy.sqrt(mse_SGD)

print(f'SVD MAE: {mae_SVD:.2f}, MSE: {mse_SVD:.2f}, RMSE: {rmse_SVD:.2f}')
print(f'SGD MAE: {mae_SGD:.2f}, MSE: {mse_SGD:.2f}, RMSE: {rmse_SGD:.2f}')

SVD MAE: 1.51, MSE: 3.80, RMSE: 1.95
SGD MAE: 1.52, MSE: 4.08, RMSE: 2.02
