<a href="https://colab.research.google.com/github/dajebbar/FreeCodeCamp-python-data-analysis/blob/main/BackwardEliminationMethod_for_FeatureSelection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install opendatasets --upgrade --quiet

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import opendatasets as od
plt.style.use('fivethirtyeight')
%matplotlib inline

In [3]:
# Kaggle dataset page for the "50 Startups" data used throughout this notebook.
url = 'https://www.kaggle.com/farhanmd29/50-startups'
# Fetch the dataset via opendatasets; the files end up under ./50-startups
# (the call reports "Skipping" when they are already present).
od.download(url)

Skipping, found downloaded files in "./50-startups" (use force=True to force download)


In [4]:
# Load the downloaded CSV into a DataFrame and preview its first rows.
startup = pd.read_csv('./50-startups/50_Startups.csv')
startup.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [5]:
# Clean columns
def clean(col):
  """Return `col` with the spend labels rewritten in underscore form.

  Every occurrence of 'R&D Spend' becomes 'RD_Spend' and every occurrence
  of 'Marketing Spend' becomes 'Marketing_Spend'; other text is unchanged.
  """
  substitutions = (
      ('R&D Spend', 'RD_Spend'),
      ('Marketing Spend', 'Marketing_Spend'),
  )
  # Apply the substitutions one after another, in the order listed.
  for old_label, new_label in substitutions:
    col = col.replace(old_label, new_label)
  return col

# Run every column label through `clean` and show the resulting labels.
startup = startup.rename(clean, axis='columns')
startup.columns


Index(['RD_Spend', 'Administration', 'Marketing_Spend', 'State', 'Profit'], dtype='object')

In [6]:
# 'Profit' is the value to predict; every other column is a candidate feature.
target = startup['Profit']
data = startup.drop(columns=['Profit'])

In [7]:
# Preview the feature frame now that 'Profit' has been split off into `target`.
data.head()

Unnamed: 0,RD_Spend,Administration,Marketing_Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [8]:
# Encode categorical features
# Preprocessing helpers: LabelEncoder is instantiated below; StandardScaler
# and OneHotEncoder are used by later cells.
from sklearn.preprocessing import (
    StandardScaler,
    LabelEncoder,
    OneHotEncoder,
)

# Encoder used to turn the 'State' labels into integer codes.
labEncoder = LabelEncoder()

In [9]:
# Replace the 'State' labels with integer codes.
# The column is addressed by name rather than by position: once the one-hot
# columns are appended to `data`, the last column is no longer 'State', so a
# positional `iloc[:, -1]` would overwrite a dummy column if this cell were
# re-run.
data['State'] = labEncoder.fit_transform(data['State'])
data.head(2)

Unnamed: 0,RD_Spend,Administration,Marketing_Spend,State
0,165349.2,136897.8,471784.1,2
1,162597.7,151377.59,443898.53,0


In [10]:
# Dense one-hot encoder for the 'State' codes.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and later
# removed the old name, so try the new spelling first and fall back to the
# old one on releases that do not know it yet.
try:
    onehotencoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehotencoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Learn the set of categories present in 'State' and display them.
onehotencoder.fit(data.loc[:, ['State']])
onehotencoder.categories_

[array([0, 1, 2])]

In [11]:
# Names of the categorical input columns and of the one-hot columns the
# fitted encoder will produce for them.
cat_features = data[['State']].columns
encoded_cols = [*onehotencoder.get_feature_names_out(cat_features)]
print(encoded_cols)

['State_0', 'State_1', 'State_2']


In [12]:
# Append one indicator column per encoded State value (the names held in
# `encoded_cols`) to the feature frame.
data[encoded_cols] = onehotencoder.transform(data[cat_features])

In [13]:
# Check that the State_* indicator columns now sit next to the label-encoded
# 'State' column.
data.head()

Unnamed: 0,RD_Spend,Administration,Marketing_Spend,State,State_0,State_1,State_2
0,165349.2,136897.8,471784.1,2,0.0,0.0,1.0
1,162597.7,151377.59,443898.53,0,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,1,0.0,1.0,0.0
3,144372.41,118671.85,383199.62,2,0.0,0.0,1.0
4,142107.34,91391.77,366168.42,1,0.0,1.0,0.0


In [14]:
# Standardise the three numeric spend columns in place.
# NOTE(review): the scaler is fitted on every row before the train/test split
# performed later in the notebook, so test-set statistics influence the
# scaled features - confirm this is acceptable for the analysis.
scaler = StandardScaler()
num_features = [
    'RD_Spend',
    'Administration',
    'Marketing_Spend',
]
data[num_features] = scaler.fit_transform(data[num_features])
data.head()

Unnamed: 0,RD_Spend,Administration,Marketing_Spend,State,State_0,State_1,State_2
0,2.016411,0.560753,2.153943,2,0.0,0.0,1.0
1,1.95586,1.082807,1.9236,0,1.0,0.0,0.0
2,1.754364,-0.728257,1.626528,1,0.0,1.0,0.0
3,1.554784,-0.096365,1.42221,2,0.0,0.0,1.0
4,1.504937,-1.079919,1.281528,1,0.0,1.0,0.0


In [15]:
from sklearn.model_selection import (
    train_test_split,

)

# Hold out 20% of the rows for evaluation. The label-encoded 'State' column
# is excluded because the State_* indicator columns already carry it.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('State', axis=1),
    target,
    random_state=0,
    test_size=0.2,
)


In [16]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares on all features (fit returns the estimator).
regressor = LinearRegression().fit(X_train, y_train)
regressor

LinearRegression()

In [17]:
from sklearn.metrics import r2_score

# Score the baseline model on the held-out rows (R^2 shown as a percentage).
y_pred = regressor.predict(X_test)
score = r2_score(y_true=y_test, y_pred=y_pred)
print(f'r2_score: {score * 100:.2f}')

r2_score: 93.47


## Building the optimal model using backward elimination

In [18]:
import statsmodels.api as sm 

# Feature matrix with a leading column of ones (the intercept term).
# The ones are joined with float features, so the result is a float array
# and no integer cast of the ones column is needed.
X = np.append(
    arr=np.ones((len(data), 1)),
    values=data.drop(columns=['State']),
    axis=1,
)

# Preview only the first rows instead of echoing the whole array.
X[:5]

  import pandas.util.testing as tm


array([[ 1.00000000e+00,  2.01641149e+00,  5.60752915e-01,
         2.15394309e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00],
       [ 1.00000000e+00,  1.95586034e+00,  1.08280658e+00,
         1.92360040e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 1.00000000e+00,  1.75436374e+00, -7.28257028e-01,
         1.62652767e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00],
       [ 1.00000000e+00,  1.55478369e+00, -9.63646307e-02,
         1.42221024e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00],
       [ 1.00000000e+00,  1.50493720e+00, -1.07991935e+00,
         1.28152771e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00],
       [ 1.00000000e+00,  1.27980001e+00, -7.76239071e-01,
         1.25421046e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00],
       [ 1.00000000e+00,  1.34006641e+00,  9.32147208e-01,
        -6.88149930e-01,  1.00000000e+00,  0.00000000e+00,
         0.0000000

In [19]:
# Design matrix for statsmodels: every feature except the label-encoded
# 'State' column, plus an explicit constant column named 'intercept'.
# The scalar 1 is broadcast to every row, so the row count is not hard-coded,
# and the frame is built in one chained expression instead of being mutated
# in place.
# NOTE(review): all three State_* indicator columns are kept next to the
# intercept; they sum to the intercept column, which makes the design matrix
# collinear (dummy-variable trap) - consider dropping one indicator.
data_opt = data.drop(columns=['State']).assign(intercept=1)
data_opt.head()

Unnamed: 0,RD_Spend,Administration,Marketing_Spend,State_0,State_1,State_2,intercept
0,2.016411,0.560753,2.153943,0.0,0.0,1.0,1
1,1.95586,1.082807,1.9236,1.0,0.0,0.0,1
2,1.754364,-0.728257,1.626528,0.0,1.0,0.0,1
3,1.554784,-0.096365,1.42221,0.0,0.0,1.0,1
4,1.504937,-1.079919,1.281528,0.0,1.0,0.0,1


In [20]:
# Step 1 of backward elimination: fit OLS on every column of `data_opt` and
# inspect the per-feature p-values in the summary table.
regressor_OLS = sm.OLS(target, data_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Sat, 05 Mar 2022",Prob (F-statistic):,1.34e-27
Time:,13:59:37,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RD_Spend,3.663e+04,2108.775,17.369,0.000,3.24e+04,4.09e+04
Administration,-748.9975,1448.705,-0.517,0.608,-3668.671,2170.676
Marketing_Spend,3266.2152,2075.251,1.574,0.123,-916.178,7448.608
State_0,2.795e+04,1932.496,14.464,0.000,2.41e+04,3.18e+04
State_1,2.815e+04,1993.572,14.121,0.000,2.41e+04,3.22e+04
State_2,2.791e+04,1913.989,14.582,0.000,2.41e+04,3.18e+04
intercept,8.401e+04,1001.609,83.877,0.000,8.2e+04,8.6e+04

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,9690000000000000.0


In [21]:
# Refit without 'Administration' and compare the summary with the full model.
regressor_OLS = sm.OLS(target, data_opt.drop('Administration', axis=1)).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,215.8
Date:,"Sat, 05 Mar 2022",Prob (F-statistic):,9.720000000000001e-29
Time:,14:09:08,Log-Likelihood:,-525.53
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1071.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RD_Spend,3.62e+04,1928.741,18.771,0.000,3.23e+04,4.01e+04
Marketing_Spend,3601.6435,1955.116,1.842,0.072,-336.162,7539.449
State_0,2.796e+04,1916.569,14.590,0.000,2.41e+04,3.18e+04
State_1,2.81e+04,1975.275,14.228,0.000,2.41e+04,3.21e+04
State_2,2.794e+04,1897.221,14.729,0.000,2.41e+04,3.18e+04
intercept,8.401e+04,993.420,84.567,0.000,8.2e+04,8.6e+04

0,1,2,3
Omnibus:,14.64,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.037
Skew:,-0.938,Prob(JB):,2.7e-05
Kurtosis:,5.565,Cond. No.,1.05e+16


In [22]:
# Alternative step: refit without 'Marketing_Spend' while keeping
# 'Administration', to compare against the other single-feature removal.
regressor_OLS = sm.OLS(target, data_opt.drop('Marketing_Spend', axis=1)).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,205.0
Date:,"Sat, 05 Mar 2022",Prob (F-statistic):,2.9e-28
Time:,14:09:30,Log-Likelihood:,-526.75
No. Observations:,50,AIC:,1064.0
Df Residuals:,45,BIC:,1073.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RD_Spend,3.912e+04,1414.054,27.665,0.000,3.63e+04,4.2e+04
Administration,-1461.8228,1398.492,-1.045,0.301,-4278.531,1354.885
State_0,2.766e+04,1954.678,14.149,0.000,2.37e+04,3.16e+04
State_1,2.875e+04,1989.016,14.453,0.000,2.47e+04,3.28e+04
State_2,2.762e+04,1935.940,14.265,0.000,2.37e+04,3.15e+04
intercept,8.402e+04,1017.899,82.543,0.000,8.2e+04,8.61e+04

0,1,2,3
Omnibus:,14.275,Durbin-Watson:,1.197
Prob(Omnibus):,0.001,Jarque-Bera (JB):,19.26
Skew:,-0.953,Prob(JB):,6.57e-05
Kurtosis:,5.369,Cond. No.,1.54e+16


In [23]:
# Refit with both 'Marketing_Spend' and 'Administration' removed.
regressor_OLS = sm.OLS(
    target,
    data_opt.drop(['Marketing_Spend', 'Administration'], axis=1),
).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,272.4
Date:,"Sat, 05 Mar 2022",Prob (F-statistic):,2.76e-29
Time:,14:12:00,Log-Likelihood:,-527.35
No. Observations:,50,AIC:,1063.0
Df Residuals:,46,BIC:,1070.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RD_Spend,3.876e+04,1373.286,28.226,0.000,3.6e+04,4.15e+04
State_0,2.762e+04,1956.262,14.117,0.000,2.37e+04,3.16e+04
State_1,2.878e+04,1990.776,14.457,0.000,2.48e+04,3.28e+04
State_2,2.763e+04,1937.871,14.256,0.000,2.37e+04,3.15e+04
intercept,8.402e+04,1018.923,82.461,0.000,8.2e+04,8.61e+04

0,1,2,3
Omnibus:,13.418,Durbin-Watson:,1.122
Prob(Omnibus):,0.001,Jarque-Bera (JB):,17.605
Skew:,-0.907,Prob(JB):,0.00015
Kurtosis:,5.271,Cond. No.,9680000000000000.0


In [25]:
# Retrain the scikit-learn model on the reduced feature set (same 80/20 split
# and seed as the baseline). The explicit 'intercept' column is dropped as
# well, leaving RD_Spend and the State_* indicator columns.
X_train, X_test, y_train, y_test = train_test_split(
    data_opt.drop(['Marketing_Spend', 'Administration', 'intercept'], axis=1),
    target,
    random_state=0,
    test_size=0.2,
)

regressor = LinearRegression().fit(X_train, y_train)
y_pred = regressor.predict(X_test)
score = r2_score(y_true=y_test, y_pred=y_pred)
print(f'r2_score: {score * 100:.2f}')

r2_score: 94.72
