In [16]:
import pandas as pd

# Load the advertising data; index_col=0 makes the first CSV column the row
# index instead of a regular column.
df = pd.read_csv("advertising.csv", index_col=0)
df

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9
...,...,...,...,...
196,38.2,3.7,13.8,7.6
197,94.2,4.9,8.1,9.7
198,177.0,9.3,6.4,12.8
199,283.6,42.0,66.2,25.5


In [17]:
# 200 samples
# 3 features (TV, radio, newspaper)
# 1 label (sales)
df.shape

(200, 4)

In [18]:
# Separate the target column from the features: `y` is the sales series,
# `x` is every remaining column.
y = df["sales"]
x = df.drop(columns="sales")

x.head()

Unnamed: 0,TV,radio,newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4


In [19]:
# Preview the first target values alongside their row labels.
y.head()

1    22.1
2    10.4
3     9.3
4    18.5
5    12.9
Name: sales, dtype: float64

## Train|test split

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

X_train

Unnamed: 0,TV,radio,newspaper
43,293.6,27.7,1.8
190,18.7,12.1,23.4
91,134.3,4.9,9.3
137,25.6,39.0,9.3
52,100.4,9.6,3.6
...,...,...,...
107,25.0,11.0,29.7
15,204.1,32.9,46.0
93,217.7,33.5,59.0
180,165.6,10.0,17.6


In [21]:
# Dimensions of the held-out feature matrix: (rows, feature columns).
X_test.shape

(66, 3)

In [22]:
# Target lengths for both splits; each should match the row count of its feature matrix.
y_train.shape, y_test.shape

((134,), (66,))

## Feature scaling

In [23]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min and max from the training split only, so nothing
# about the test rows influences the scaling. `fit` returns the fitted scaler.
scaler = MinMaxScaler().fit(X_train)

scaler

0,1,2
,"feature_range  feature_range: tuple (min, max), default=(0, 1) Desired range of transformed data.","(0, ...)"
,"copy  copy: bool, default=True Set to False to perform inplace row normalization and avoid a copy (if the input is already a numpy array).",True
,"clip  clip: bool, default=False Set to True to clip transformed values of held-out data to provided `feature_range`. Since this parameter will clip values, `inverse_transform` may not be able to restore the original data. .. note::  Setting `clip=True` does not prevent feature drift (a distribution  shift between training and test data). The transformed values are clipped  to the `feature_range`, which helps avoid unintended behavior in models  sensitive to out-of-range inputs (e.g. linear models). Use with care,  as clipping can distort the distribution of test data. .. versionadded:: 0.24",False


In [24]:
# Apply the already-fitted scaler to both splits; the test split is only
# transformed, never used for fitting.
scaled_X_train, scaled_X_test = scaler.transform(X_train), scaler.transform(X_test)

scaled_X_train.shape, scaled_X_test.shape

((134, 3), (66, 3))

In [25]:
# The scaler was fitted on this split with the default (0, 1) feature range,
# so the overall min and max should be exactly 0 and 1.
scaled_X_train.min(), scaled_X_train.max()

(np.float64(0.0), np.float64(1.0))

In [26]:
# Test values may fall outside [0, 1]: the scaler's min/max come from the
# training split, and clip is left at its default of False.
scaled_X_test.min(), scaled_X_test.max()

(np.float64(0.005964214711729622), np.float64(1.1302186878727631))

## Linear regression

In [27]:
from sklearn.linear_model import LinearRegression

# Linear regression with all default settings (an intercept term is fitted).
model = LinearRegression()
model

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [28]:
# Fit on the scaled training features. coef_ has one weight per feature in
# column order (TV, radio, newspaper); because the inputs are min-max scaled,
# each weight is the change in predicted sales across that feature's training range.
model.fit(scaled_X_train, y_train)
model.coef_

array([13.20747617,  9.75285112,  0.61108329])

In [29]:
# Predicted sales when every scaled feature is 0, i.e. at each feature's training minimum.
model.intercept_

np.float64(2.79115951962436)

## Prediction

In [31]:
# Take the first test row and its true sales value. reshape(1, -1) turns the
# 1-D row into a (1, n_features) array so it can be passed to predict as a
# one-sample batch.
# NOTE(review): "feautures" is a misspelling of "features"; the name is kept
# because later cells reference it. Rename everywhere in one pass.
test_sample_feautures = scaled_X_test[0].reshape(1,-1)
test_sample_target = y_test.values[0]
test_sample_feautures, test_sample_target

(array([[0.54988164, 0.63709677, 0.52286282]]), np.float64(16.9))

In [32]:
# Confirm the sample is 2-D: one row, three feature columns.
test_sample_feautures.shape

(1, 3)

In [33]:
# Predicted sales for the single sample; compare with test_sample_target.
model.predict(test_sample_feautures)

array([16.58673085])

In [34]:
# True sales value for the same sample, shown for comparison with the prediction.
test_sample_target

np.float64(16.9)

## Prediction on test data


In [36]:
# Predict sales for every row of the scaled test split; the result should
# contain one value per test row.
y_pred = model.predict(scaled_X_test)
y_pred.shape

(66,)

In [37]:
# Must match y_pred.shape so the metrics compare values element by element.
y_test.shape

(66,)

In [38]:
# First five true sales values (selected by position) as a plain NumPy array.
y_test.iloc[:5].to_numpy()

array([16.9, 22.4, 21.4,  7.3, 24.7])

In [39]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np 

# Error metrics on the held-out split.
# MAE:  mean absolute difference between true and predicted sales.
# MSE:  mean squared difference, which penalises large misses more heavily.
# RMSE: square root of MSE, which puts the error back in the units of `sales`.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# The `=` specifier prints the variable name together with its formatted value.
print(f"{mae = :.2f}")
print(f"{mse = :.2f}")
print(f"{rmse = :.2f}")

mae = 1.49
mse = 3.73
rmse = 1.93
