# Multi Linear Regression - Sales Prediction Example

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score

* Set Option

In [2]:
pd.set_option('display.max_columns', None) # Show All Columns
pd.set_option('display.max_rows', None) # Show All Rows
pd.set_option('display.float_format', lambda x: '%.3f' % x) # After Comma 3 Digit
pd.set_option('display.width', 500) # Show 500 Width

* Load Data

In [3]:
df = pd.read_csv("data/advertising.csv")

* Discover Data

In [4]:
df.shape

(200, 4)

In [5]:
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [6]:
df.dtypes

TV           float64
radio        float64
newspaper    float64
sales        float64
dtype: object

In [7]:
df.nunique().sort_values(ascending=False)

TV           190
newspaper    172
radio        167
sales        121
dtype: int64

In [8]:
df["sales"].value_counts()

sales
9.700     5
11.700    4
12.900    4
15.900    4
20.700    3
25.400    3
15.500    3
18.000    3
11.900    3
8.700     3
9.500     3
11.600    3
10.600    3
12.200    3
13.200    3
11.800    3
10.100    3
13.400    3
11.400    2
22.600    2
20.200    2
23.800    2
10.300    2
14.800    2
10.900    2
14.900    2
10.800    2
10.400    2
17.100    2
11.000    2
19.600    2
5.300     2
15.200    2
6.600     2
7.300     2
11.500    2
12.600    2
8.800     2
19.200    2
16.600    2
10.500    2
9.300     2
7.200     2
17.400    2
17.300    2
19.000    2
12.500    2
11.300    2
12.000    2
15.000    2
18.900    2
14.600    2
7.600     2
14.700    2
12.800    2
9.600     2
15.600    1
6.700     1
13.300    1
7.000     1
14.500    1
9.900     1
9.400     1
24.700    1
8.000     1
5.900     1
21.800    1
14.100    1
19.700    1
17.600    1
1.600     1
16.100    1
12.700    1
5.700     1
26.200    1
14.400    1
8.400     1
27.000    1
20.800    1
15.300    1
3.200     1
20.100    1
22.100    

In [9]:
df.isnull().sum().sort_values(ascending=False)

TV           0
radio        0
newspaper    0
sales        0
dtype: int64

In [10]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
TV,200.0,147.042,85.854,0.7,74.375,149.75,218.825,296.4
radio,200.0,23.264,14.847,0.0,9.975,22.9,36.525,49.6
newspaper,200.0,30.554,21.779,0.3,12.75,25.75,45.1,114.0
sales,200.0,14.023,5.217,1.6,10.375,12.9,17.4,27.0


In [11]:
df.quantile([0, 0.05, 0.50, 0.90, 0.99, 1]).T

Unnamed: 0,0.000,0.050,0.500,0.900,0.990,1.000
TV,0.7,13.195,149.75,261.44,292.907,296.4
radio,0.0,1.995,22.9,43.52,49.4,49.6
newspaper,0.3,3.6,25.75,59.07,89.515,114.0
sales,1.6,6.6,12.9,21.71,25.507,27.0


In [12]:
corr = df.drop("sales",axis=1).corr().abs()
corr_values = corr.unstack()
corr_values_sort = corr_values.sort_values(kind='quicksort',ascending=False)
corr_values_sort[corr_values_sort>0.1]

TV         TV          1.000
radio      radio       1.000
newspaper  newspaper   1.000
radio      newspaper   0.354
newspaper  radio       0.354
dtype: float64

* Split Data For Basic Linear Regression

In [13]:
X = df.drop('sales', axis=1)
y = df[["sales"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

* Model Train For Basic Linear Regression

In [14]:
reg_model = LinearRegression().fit(X_train, y_train)

# (b - bias)
print("Bias Value: ",reg_model.intercept_)

# coefficients (w - weights)
print("Coefficients Value: ",reg_model.coef_)

Bias Value:  [2.90794702]
Coefficients Value:  [[0.0468431  0.17854434 0.00258619]]


* Prediction

In [15]:
# Example

# TV: 30
# radio: 10
# newspaper: 40

# 2.90 - Bias
# 0.0468431 , 0.17854434, 0.00258619 -> weights

# Sales = 2.90  + TV * 0.04 + radio * 0.17 + newspaper * 0.002

2.90794702 + 30 * 0.0468431 + 10 * 0.17854434 + 40 * 0.00258619

# Prediction with our example values
yeni_veri = [[30], [10], [40]]
yeni_veri = pd.DataFrame(yeni_veri).T

reg_model.predict(yeni_veri)



array([[6.202131]])

* Model Success

In [17]:
# Train RMSE
y_pred = reg_model.predict(X_train)
np.sqrt(mean_squared_error(y_train, y_pred))
# 1.73

1.736902590147092

In [18]:
# TRAIN RKARE
reg_model.score(X_train, y_train)

0.8959372632325174

In [19]:
# Test RMSE
y_pred = reg_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))
# 1.41

1.4113417558581587

In [20]:
# Test RKARE
# 0.89 : This means that our independent variables can explain 90% of our dependent variable.
reg_model.score(X_test, y_test)

0.8927605914615384

In [21]:
# 10 Fold CV RMSE
np.mean(np.sqrt(-cross_val_score(reg_model,
                                 X,
                                 y,
                                 cv=10,
                                 scoring="neg_mean_squared_error")))
# 1.69

1.6913531708051797

In [22]:
# 5 Fold CV RMSE
np.mean(np.sqrt(-cross_val_score(reg_model,
                                 X,
                                 y,
                                 cv=5,
                                 scoring="neg_mean_squared_error")))
# 1.71


1.7175247278732084