In [1]:
import pandas as pd

In [2]:
import statsmodels.api as sm
# Load the advertising data. index_col=0 makes the first CSV column the row
# index of the DataFrame instead of loading it as a data column.
dataset = pd.read_csv('Advertising.csv', index_col=0)

# Bare last expression: the notebook renders the frame as a table.
dataset

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9
...,...,...,...,...
196,38.2,3.7,13.8,7.6
197,94.2,4.9,8.1,9.7
198,177.0,9.3,6.4,12.8
199,283.6,42.0,66.2,25.5


Note: The first column of 'Advertising.csv' already holds a row index, so we pass "index_col=0" to use it as the DataFrame index instead of loading it as a feature.

In [3]:
# Feature matrix: the TV, radio and newspaper columns.
# (Positional equivalent: dataset.iloc[:, :-1])
X = dataset.loc[:, ['TV', 'radio', 'newspaper']]
print(X)

# Target vector: the sales column.
# (Positional equivalent: dataset.iloc[:, 3])
y = dataset.loc[:, 'sales']
y

        TV  radio  newspaper
1    230.1   37.8       69.2
2     44.5   39.3       45.1
3     17.2   45.9       69.3
4    151.5   41.3       58.5
5    180.8   10.8       58.4
..     ...    ...        ...
196   38.2    3.7       13.8
197   94.2    4.9        8.1
198  177.0    9.3        6.4
199  283.6   42.0       66.2
200  232.1    8.6        8.7

[200 rows x 3 columns]


1      22.1
2      10.4
3       9.3
4      18.5
5      12.9
       ... 
196     7.6
197     9.7
198    12.8
199    25.5
200    13.4
Name: sales, Length: 200, dtype: float64

Note:
In multiple linear regression, the model equation is
$y = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \beta_3 X_3 + \dots$

We are using the Ordinary Least Squares (OLS) method, and we need to estimate the intercept 'β0' as well.
So we treat 'β0' as 'β0 × 1', so that the constant "1" also acts as a feature.

We will explicitly add a column of 1s to the feature matrix for this purpose.

The "statsmodels.api" library provides `add_constant` to do exactly that.

In [4]:
# Add a column of ones (named 'const') so that OLS also estimates the intercept β0.
X = sm.add_constant(X)

In [5]:
# Inspect the design matrix: it should now contain the extra 'const' column.
X

Unnamed: 0,const,TV,radio,newspaper
1,1.0,230.1,37.8,69.2
2,1.0,44.5,39.3,45.1
3,1.0,17.2,45.9,69.3
4,1.0,151.5,41.3,58.5
5,1.0,180.8,10.8,58.4
...,...,...,...,...
196,1.0,38.2,3.7,13.8
197,1.0,94.2,4.9,8.1
198,1.0,177.0,9.3,6.4
199,1.0,283.6,42.0,66.2


In [6]:
# Fit an OLS (Ordinary Least Squares) model of sales on all columns of X:
# the intercept ('const') plus TV, radio and newspaper.
# Note the statsmodels argument order: response first, design matrix second.
model = sm.OLS(y,X).fit()

In [7]:
# Full regression report: fit statistics (R-squared, F-statistic, AIC/BIC),
# the coefficient table (estimate, std err, t, p-value, 95% interval) and
# residual diagnostics (Omnibus, Durbin-Watson, Jarque-Bera, Cond. No.).
model.summary()

# The coefficient table is the part to read when judging individual features.

0,1,2,3
Dep. Variable:,sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,570.3
Date:,"Tue, 03 Aug 2021",Prob (F-statistic):,1.58e-96
Time:,00:06:00,Log-Likelihood:,-386.18
No. Observations:,200,AIC:,780.4
Df Residuals:,196,BIC:,793.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.9389,0.312,9.422,0.000,2.324,3.554
TV,0.0458,0.001,32.809,0.000,0.043,0.049
radio,0.1885,0.009,21.893,0.000,0.172,0.206
newspaper,-0.0010,0.006,-0.177,0.860,-0.013,0.011

0,1,2,3
Omnibus:,60.414,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151.241
Skew:,-1.327,Prob(JB):,1.44e-33
Kurtosis:,6.332,Cond. No.,454.0


# Checking for Multicollinearity

In [8]:
# Pairwise correlation between the predictors only; iloc[:, 1:] skips the
# 'const' column that sits at position 0 of the design matrix.
X.iloc[:,1:].corr()

Unnamed: 0,TV,radio,newspaper
TV,1.0,0.054809,0.056648
radio,0.054809,1.0,0.354104
newspaper,0.056648,0.354104,1.0


Correlation Matrix

In [9]:
# Alternative: correlate every column of the original frame, which also
# includes the target 'sales' alongside the three predictors.
corr_matrix = dataset.corr()
corr_matrix

Unnamed: 0,TV,radio,newspaper,sales
TV,1.0,0.054809,0.056648,0.782224
radio,0.054809,1.0,0.354104,0.576223
newspaper,0.056648,0.354104,1.0,0.228299
sales,0.782224,0.576223,0.228299,1.0


In [10]:
# Correlation of each column with the target variable 'sales', strongest first.
# (This is feature-target correlation, not collinearity between features.)

corr_matrix['sales'].sort_values(ascending=False)

sales        1.000000
TV           0.782224
radio        0.576223
newspaper    0.228299
Name: sales, dtype: float64

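## So in this dataset there is clearly no strong correlation among the features (the largest is radio vs. newspaper at about 0.35), i.e. no multicollinearity problem.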

----------------------------------------------------------------------------------------------------------

----------------------------------------------------------------------------------------------------------
# Using another dataset

In [11]:
# Load the salary data. No index_col is passed here, so pandas assigns a
# default 0-based row index.
data_sal = pd.read_csv('Salary_Data.csv')

In [12]:
# Preview the first five rows to check the column names and value ranges.
data_sal.head()

Unnamed: 0,YearsExperience,Age,Salary
0,1.1,21.0,39343
1,1.3,21.5,46205
2,1.5,21.7,37731
3,2.0,22.0,43525
4,2.2,22.2,39891


In [13]:
# NOTE: X and y are rebound here; from this cell on they refer to the salary
# data, no longer to the advertising data used above.
# Feature matrix: years of experience and age.
X = data_sal.loc[:, ['YearsExperience', 'Age']]
# Target vector: salary.
y = data_sal.loc[:, 'Salary']


In [14]:
# Add the column of ones ('const') so that OLS also estimates an intercept term.
X = sm.add_constant(X)
X

Unnamed: 0,const,YearsExperience,Age
0,1.0,1.1,21.0
1,1.0,1.3,21.5
2,1.0,1.5,21.7
3,1.0,2.0,22.0
4,1.0,2.2,22.2
5,1.0,2.9,23.0
6,1.0,3.0,23.0
7,1.0,3.2,23.3
8,1.0,3.2,23.3
9,1.0,3.7,23.6


In [15]:
# Fit an OLS model of Salary on the intercept, YearsExperience and Age so that
# the coefficient table can be interpreted.
model2 = sm.OLS(y,X).fit()

In [16]:
# Regression report for the salary model; compare 'std err' and 'P>|t|' per feature.
model2.summary()

0,1,2,3
Dep. Variable:,Salary,R-squared:,0.96
Model:,OLS,Adj. R-squared:,0.957
Method:,Least Squares,F-statistic:,323.9
Date:,"Tue, 03 Aug 2021",Prob (F-statistic):,1.35e-19
Time:,00:06:04,Log-Likelihood:,-300.35
No. Observations:,30,AIC:,606.7
Df Residuals:,27,BIC:,610.9
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-6661.9872,2.28e+04,-0.292,0.773,-5.35e+04,4.02e+04
YearsExperience,6153.3533,2337.092,2.633,0.014,1358.037,1.09e+04
Age,1836.0136,1285.034,1.429,0.165,-800.659,4472.686

0,1,2,3
Omnibus:,2.695,Durbin-Watson:,1.711
Prob(Omnibus):,0.26,Jarque-Bera (JB):,1.975
Skew:,0.456,Prob(JB):,0.372
Kurtosis:,2.135,Cond. No.,626.0


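## Now note that,
the standard errors (std err) are large relative to the coefficients; the 95% interval for 'Age' even spans zero. This is a typical symptom of multicollinearity.

To confirm that the features are correlated, we'll check the correlation matrix.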

In [17]:
# Pairwise correlation between the two predictors; iloc[:, 1:] drops the
# 'const' column at position 0.
corr_mat = X.iloc[:,1:].corr()
corr_mat

Unnamed: 0,YearsExperience,Age
YearsExperience,1.0,0.987258
Age,0.987258,1.0


This means the two features are almost perfectly correlated (r ≈ 0.987, i.e. about 99%).

In [18]:
# Also check how strongly each column correlates with the target 'Salary',
# strongest first.
corr_mat2 = data_sal.corr()
corr_mat2['Salary'].sort_values(ascending=False)

Salary             1.000000
YearsExperience    0.978242
Age                0.974530
Name: Salary, dtype: float64

From this we can conclude that,
one of these features is sufficient to predict the Salary, because both carry almost the same information.

So we can drop one feature.

As the column 'Age' has the higher p-value (0.165 vs. 0.014 for 'YearsExperience'), we can drop this column and train our model again.
