## Step - 1 Business Problem Understanding
- **what is the relationship between each advertising channel (TV, Radio, Newspaper) and sales**

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('Advertising.csv')
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230100,37800,69200,22100
1,44500,39300,45100,10400
2,17200,45900,69300,9300
3,151500,41300,58500,18500
4,180800,10800,58400,12900


## everything is same as we did in multiple linear regression

In [4]:
X = df.drop(columns='sales')
y = df['sales']

# Step -4 Modeling 
we will perform polynomial on both train data or test data

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

Train = []
Test = []
CV = []

for i in range(0, 100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    model = LinearRegression()
    model.fit(X_train, y_train)

    ypred_train = model.predict(X_train)
    ypred_test = model.predict(X_test)

    Train.append(model.score(X_train, y_train))
    # print('Train R2:', r2_score(y_train, ypred_train))
    Test.append(model.score(X_test, y_test))
    # print('Test R2:', r2_score(y_test, ypred_test))

    CV.append(cross_val_score(model, X_train, y_train, cv=5).mean())


em = pd.DataFrame({'Train':Train, 'Test':Test, 'CV':CV})
gm = em[(abs(em['Train']-em['Test']) <= 0.05) & (abs(em['Test']-em['CV']) <=0.05)]
print('best random state number:', gm[gm['Test']==gm['Test'].max()].index.to_list())

best random state number: [99]


# <font color = aqua> Polynomial Regresion </font> 

## Hyperparameter Tuning
#### Choosing the best polynomial degree for given dataset

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures


train_r2 = []
test_r2 = []
for i in range(1, 10):
    # data preprocessing on train data
    polynomial_converter = PolynomialFeatures(degree=i)
    X_train_poly = pd.DataFrame(polynomial_converter.fit_transform(X_train))

    # modelling on train data
    from sklearn.linear_model import LinearRegression
    model = LinearRegression()
    model.fit(X_train_poly, y_train)


    # prediction and evaluation on train data
    ypred_train = model.predict(X_train_poly)
    #print('Train R2:', i, model.score(X_train_poly, y_train))
    train_r2.append(model.score(X_train_poly, y_train))

    # transformation on test data
    X_test_poly = pd.DataFrame(polynomial_converter.fit_transform(X_test))

    # prediction and evaluation on test data
    ypred_test = model.predict(X_test_poly)
    #print('Test R2:', i, model.score(X_test_poly, y_test))
    test_r2.append(model.score(X_test_poly, y_test))

In [7]:
train_r2

[0.8906288862925659,
 0.9866683486567523,
 0.9919477498440662,
 0.9848303639971143,
 0.9720950905621792,
 0.9820606596510867,
 0.9762257213001088,
 0.8512235852604148,
 0.9343409793417258]

In [8]:
test_r2

[0.9200846680148507,
 0.9844905668728798,
 0.9917498941105054,
 0.9701214532745682,
 0.9376875745936584,
 0.8883176938749953,
 0.5865878124892758,
 -0.6118354124362697,
 -2.904037216868102]

##### between both train_r2 and test_r2, we can see index number 2 is having highest degree

### Build a model

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)

In [10]:
from sklearn.preprocessing import PolynomialFeatures
polynomial_converter = PolynomialFeatures(degree=2, include_bias=True)

# data preprocessing on train data
X_train_poly = pd.DataFrame(polynomial_converter.fit_transform(X_train))
# data preprocessing on test data
X_test_poly = pd.DataFrame(polynomial_converter.fit_transform(X_test))

# modelling on train data
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train_poly, y_train)
print('Intercept:', model.intercept_)
print('Coefficient:', model.coef_)

# prediction
ypred_train = model.predict(X_train_poly)
ypred_test = model.predict(X_test_poly)

# Evaluation 
from sklearn.metrics import mean_squared_error, r2_score
print('Train RSME:', np.sqrt(mean_squared_error(y_train, ypred_train)))
print('Train R2:', r2_score(y_train, ypred_train))
# Cross Validation 
from sklearn.model_selection import cross_val_score
print('Cross Validation Score:', cross_val_score(model, X_train, y_train, cv=5).mean())

# on test data, we are not required to do fit 
# (because B0 and B1 we find for the train daat only)

# Evaluation on test data
from sklearn.metrics import mean_squared_error, r2_score
print('Test RSME:', np.sqrt(mean_squared_error(y_test, ypred_test)))
print('Test R2:', r2_score(y_test, ypred_test))

Intercept: 5089.126947762041
Coefficient: [ 0.00000000e+00  5.15901737e-02  1.54418707e-02  9.97897761e-03
 -1.08039513e-07  1.10610802e-06 -5.04606220e-08  1.19792603e-07
  1.98539515e-07 -4.06233596e-08]
Train RSME: 601.7940982044499
Train R2: 0.9866683486567523
Cross Validation Score: 0.8745389851558864
Test RSME: 630.5318070675827
Test R2: 0.9844905668728798


In [11]:
import statsmodels.formula.api as smf
model1 = smf.ols('y_train ~ X_train', data=X_train).fit()
model1.summary()

0,1,2,3
Dep. Variable:,y_train,R-squared:,0.891
Model:,OLS,Adj. R-squared:,0.889
Method:,Least Squares,F-statistic:,423.4
Date:,"Fri, 02 Feb 2024",Prob (F-statistic):,1.0199999999999999e-74
Time:,01:08:49,Log-Likelihood:,-1419.4
No. Observations:,160,AIC:,2847.0
Df Residuals:,156,BIC:,2859.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2904.4859,368.721,7.877,0.000,2176.156,3632.816
X_train[0],0.0455,0.002,28.499,0.000,0.042,0.049
X_train[1],0.1882,0.010,18.941,0.000,0.169,0.208
X_train[2],0.0008,0.007,0.114,0.909,-0.012,0.014

0,1,2,3
Omnibus:,53.148,Durbin-Watson:,1.945
Prob(Omnibus):,0.0,Jarque-Bera (JB):,134.731
Skew:,-1.386,Prob(JB):,5.5399999999999995e-30
Kurtosis:,6.539,Cond. No.,475000.0


### Final model including TV and radio only 

In [12]:
X = X.drop(columns=['newspaper'])

In [18]:
from sklearn.preprocessing import PolynomialFeatures
final_polynomial_converter = PolynomialFeatures(degree=2, include_bias=True)

# data preprocessing on train data
X_train_poly = pd.DataFrame(final_polynomial_converter.fit_transform(X_train))
# data preprocessing on test data
X_test_poly = pd.DataFrame(final_polynomial_converter.fit_transform(X_test))

# modelling on train data
from sklearn.linear_model import LinearRegression
model2 = LinearRegression()
model2.fit(X_train_poly, y_train)
print('Intercept:', model2.intercept_)
print('Coefficient:', model2.coef_)

# prediction
ypred_train = model2.predict(X_train_poly)
ypred_test = model2.predict(X_test_poly)

# Evaluation 
from sklearn.metrics import mean_squared_error, r2_score
print('Train RSME:', np.sqrt(mean_squared_error(y_train, ypred_train)))
print('Train R2:', r2_score(y_train, ypred_train))
# Cross Validation 
from sklearn.model_selection import cross_val_score
print('Cross Validation Score:', cross_val_score(model2, X_train, y_train, cv=5).mean())

# on test data, we are not required to do fit 
# (because B0 and B1 we find for the train daat only)

# Evaluation on test data
from sklearn.metrics import mean_squared_error, r2_score
print('Test RSME:', np.sqrt(mean_squared_error(y_test, ypred_test)))
print('Test R2:', r2_score(y_test, ypred_test))

Intercept: 5248.345923156818
Coefficient: [ 0.00000000e+00  5.07736679e-02  2.19990110e-02 -1.08145346e-07
  1.07163648e-06  2.73401133e-07]
Train RSME: 620.4133821095758
Train R2: 0.9858306342003403
Cross Validation Score: 0.8803158236620214
Test RSME: 597.3721467029545
Test R2: 0.9860789541409101


# Prediction on new data
- Our next ad campaign will have a total spend of 149k on TV, 22k on Radio, and 12k on Newspaper Ads, how many units could we expect to sell as a result of this?

In [19]:
test_df = pd.DataFrame({'TV':[149000], 'radio':[22000], 'newspaper':[12000]})
test_df

Unnamed: 0,TV,radio,newspaper
0,149000,22000,12000


In [20]:
X = test_df.drop(columns=['newspaper'])

In [21]:
# transformation
transform_data = final_polynomial_converter.fit_transform(X)
transform_data

array([[1.0000e+00, 1.4900e+05, 2.2000e+04, 2.2201e+10, 3.2780e+09,
        4.8400e+08]])

In [22]:
model2.predict(transform_data)

array([14541.81638092])