## Step -1 Business Problem Understanding

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('Advertising.csv')
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230100,37800,69200,22100
1,44500,39300,45100,10400
2,17200,45900,69300,9300
3,151500,41300,58500,18500
4,180800,10800,58400,12900


## everything is same as we did in multiple linear regression

In [4]:
X = df.drop(columns='sales')
y = df['sales']

## Step - 4: Modeling

### findout the best random_state value 

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

Train = []
Test = []
CV = []

for i in range(0, 100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    model = LinearRegression()
    model.fit(X_train, y_train)

    ypred_train = model.predict(X_train)
    ypred_test = model.predict(X_test)

    Train.append(model.score(X_train, y_train))
    # print('Train R2:', r2_score(y_train, ypred_train))
    Test.append(model.score(X_test, y_test))
    # print('Test R2:', r2_score(y_test, ypred_test))

    CV.append(cross_val_score(model, X_train, y_train, cv=5).mean())


em = pd.DataFrame({'Train':Train, 'Test':Test, 'CV':CV})
gm = em[(abs(em['Train']-em['Test']) <= 0.05) & (abs(em['Test']-em['CV']) <=0.05)]
print('best random state number:', gm[gm['Test']==gm['Test'].max()].index.to_list())

best random state number: [99]


# <font color = aqua> Lasso Regresion </font> 

### Apply Hyperparameter tuning for Lasso Regression

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

estimator = Lasso()
param_grid = {'alpha':list(range(1,101))}
# Identifying the best value of the parameter within given values for the given data
model_hp = GridSearchCV(estimator, param_grid, cv=5, scoring='r2')
model_hp.fit(X_train, y_train)
model_hp.best_params_

{'alpha': 100}

### Build Lasso Model using best hyperparameters

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=90)

from sklearn.linear_model import Lasso
lasso_best = Lasso(alpha=100)
lasso_best.fit(X_train, y_train)
print('Intercept:', lasso_best.intercept_)
print('Coefficients:', lasso_best.coef_)

# Prediction 
ypred_train = lasso_best.predict(X_train)
ypred_test = lasso_best.predict(X_test)

# Evaluation
from sklearn.metrics import r2_score
print('Train R2:', r2_score(y_train, ypred_train))
print('Test R2:', r2_score(y_test, ypred_test))
from sklearn.model_selection import cross_val_score
print('CV Score:', cross_val_score(lasso_best, X_train, y_train, cv=5).mean())

Intercept: 2869.884788645759
Coefficients: [ 0.04671155  0.18390745 -0.00055154]
Train R2: 0.9031056838460997
Test R2: 0.8515396968986041
CV Score: 0.8773235609800224


In [8]:
import statsmodels.formula.api as smf
model1 = smf.ols('y_train ~ X_train', data=X_train).fit()
model1.summary()

0,1,2,3
Dep. Variable:,y_train,R-squared:,0.903
Model:,OLS,Adj. R-squared:,0.901
Method:,Least Squares,F-statistic:,484.7
Date:,"Fri, 02 Feb 2024",Prob (F-statistic):,8.13e-79
Time:,00:24:21,Log-Likelihood:,-1416.9
No. Observations:,160,AIC:,2842.0
Df Residuals:,156,BIC:,2854.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2869.8795,343.426,8.357,0.000,2191.515,3548.244
X_train[0],0.0467,0.002,30.035,0.000,0.044,0.050
X_train[1],0.1839,0.010,18.763,0.000,0.165,0.203
X_train[2],-0.0006,0.007,-0.081,0.936,-0.014,0.013

0,1,2,3
Omnibus:,50.118,Durbin-Watson:,1.957
Prob(Omnibus):,0.0,Jarque-Bera (JB):,117.819
Skew:,-1.339,Prob(JB):,2.61e-26
Kurtosis:,6.24,Cond. No.,445000.0


### Final model including TV and radio only 

In [9]:
X = X.drop(columns=['newspaper'])

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=90)

from sklearn.linear_model import Lasso
lasso_best = Lasso(alpha=100)
lasso_best.fit(X_train, y_train)
print('Intercept:', lasso_best.intercept_)
print('Coefficients:', lasso_best.coef_)

# Prediction 
ypred_train = lasso_best.predict(X_train)
ypred_test = lasso_best.predict(X_test)

# Evaluation
from sklearn.metrics import r2_score
print('Train R2:', r2_score(y_train, ypred_train))
print('Test R2:', r2_score(y_test, ypred_test))
from sklearn.model_selection import cross_val_score
print('CV Score:', cross_val_score(lasso_best, X_train, y_train, cv=5).mean())

Intercept: 2860.8931047028946
Coefficients: [0.04669654 0.18366798]
Train R2: 0.9031016203039576
Test R2: 0.8515289854863508
CV Score: 0.8798412742124236


### Predictions on a new data

In [11]:
test_df = pd.DataFrame({'TV':[149000], 'radio':[22000], 'newspaper':[12000]})
test_df

Unnamed: 0,TV,radio,newspaper
0,149000,22000,12000


In [12]:
X = test_df.drop(columns=['newspaper'])

In [13]:
lasso_best.predict(X)

array([13859.37336832])