<a href="https://colab.research.google.com/github/dajebbar/FreeCodeCamp-python-data-analysis/blob/main/BackwardEliminationMethod_for_FeatureSelection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install opendatasets --upgrade --quiet

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import opendatasets as od
plt.style.use('fivethirtyeight')
%matplotlib inline

In [3]:
url = 'https://www.kaggle.com/farhanmd29/50-startups'
od.download(url)

Skipping, found downloaded files in "./50-startups" (use force=True to force download)


In [4]:
startup = pd.read_csv('./50-startups/50_Startups.csv')
startup.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [5]:
# Clean columns
def clean(col):
  return col.replace('R&D Spend', 'RD_Spend').replace(
      'Marketing Spend', 'Marketing_Spend')

startup = startup.rename(columns=clean)
startup.columns


Index(['RD_Spend', 'Administration', 'Marketing_Spend', 'State', 'Profit'], dtype='object')

In [6]:
data = startup.drop(columns=['Profit'])
target = startup.Profit

In [7]:
data.head()

Unnamed: 0,RD_Spend,Administration,Marketing_Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [8]:
# Encode categorical features
from sklearn.preprocessing import (
    StandardScaler,
    LabelEncoder,
    OneHotEncoder,
)

labEncoder = LabelEncoder()

In [9]:
data.iloc[:, -1] = labEncoder.fit_transform(data.iloc[:, -1])
data.head(2)

Unnamed: 0,RD_Spend,Administration,Marketing_Spend,State
0,165349.2,136897.8,471784.1,2
1,162597.7,151377.59,443898.53,0


In [10]:
onehotencoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
onehotencoder.fit(data.loc[:, ['State']])
onehotencoder.categories_

[array([0, 1, 2])]

In [11]:
cat_features = data.loc[:, ['State']].columns
encoded_cols = list(onehotencoder.get_feature_names_out(cat_features))
print(encoded_cols)

['State_0', 'State_1', 'State_2']


In [12]:
data[encoded_cols] = onehotencoder.transform(data[cat_features])

In [13]:
data.head()

Unnamed: 0,RD_Spend,Administration,Marketing_Spend,State,State_0,State_1,State_2
0,165349.2,136897.8,471784.1,2,0.0,0.0,1.0
1,162597.7,151377.59,443898.53,0,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,1,0.0,1.0,0.0
3,144372.41,118671.85,383199.62,2,0.0,0.0,1.0
4,142107.34,91391.77,366168.42,1,0.0,1.0,0.0


In [14]:
num_features = ['RD_Spend',	'Administration',	'Marketing_Spend']
scaler = StandardScaler()
data[num_features] = scaler.fit_transform(data[num_features] )
data.head()

Unnamed: 0,RD_Spend,Administration,Marketing_Spend,State,State_0,State_1,State_2
0,2.016411,0.560753,2.153943,2,0.0,0.0,1.0
1,1.95586,1.082807,1.9236,0,1.0,0.0,0.0
2,1.754364,-0.728257,1.626528,1,0.0,1.0,0.0
3,1.554784,-0.096365,1.42221,2,0.0,0.0,1.0
4,1.504937,-1.079919,1.281528,1,0.0,1.0,0.0


In [15]:
from sklearn.model_selection import (
    train_test_split,

)

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['State']),
    target,
    test_size=.2,
    random_state=0,
)


In [16]:
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [17]:
from sklearn.metrics import r2_score

y_pred = regressor.predict(X_test)
score = r2_score(y_test, y_pred)
print(f'r2_score: {score * 100:.2f}')

r2_score: 93.47


## Building the optimal model using the backward elimination

In [23]:
import statsmodels.api as sm 

X = np.append(
    arr=np.ones((len(data), 1)).astype(int), 
    values= data.drop(columns=['State']),
    axis=1,
)

X

array([[ 1.00000000e+00,  2.01641149e+00,  5.60752915e-01,
         2.15394309e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00],
       [ 1.00000000e+00,  1.95586034e+00,  1.08280658e+00,
         1.92360040e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 1.00000000e+00,  1.75436374e+00, -7.28257028e-01,
         1.62652767e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00],
       [ 1.00000000e+00,  1.55478369e+00, -9.63646307e-02,
         1.42221024e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00],
       [ 1.00000000e+00,  1.50493720e+00, -1.07991935e+00,
         1.28152771e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00],
       [ 1.00000000e+00,  1.27980001e+00, -7.76239071e-01,
         1.25421046e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00],
       [ 1.00000000e+00,  1.34006641e+00,  9.32147208e-01,
        -6.88149930e-01,  1.00000000e+00,  0.00000000e+00,
         0.0000000

In [27]:
data = data.assign(
    intercept = np.ones((50, 1), dtype=int)
)
data.drop(columns=['State'], inplace=True)
data.head()

Unnamed: 0,RD_Spend,Administration,Marketing_Spend,State_0,State_1,State_2,intercept
0,2.016411,0.560753,2.153943,0.0,0.0,1.0,1
1,1.95586,1.082807,1.9236,1.0,0.0,0.0,1
2,1.754364,-0.728257,1.626528,0.0,1.0,0.0,1
3,1.554784,-0.096365,1.42221,0.0,0.0,1.0,1
4,1.504937,-1.079919,1.281528,0.0,1.0,0.0,1
