# __Carseats Dataset Using RandomForestClassifier__

### __Importing Data & Libraries__

In [236]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import math
import seaborn as sns
import statsmodels.api as sm

from matplotlib.pyplot import subplots
from sklearn import preprocessing
from sklearn import utils
from sklearn.linear_model import LassoLarsCV, LassoCV, ElasticNet
from sklearn.datasets import make_regression
from sklearn.metrics import r2_score, auc
from sklearn.model_selection import train_test_split, KFold  
from sklearn.feature_selection import RFE


In [237]:
# Load the Carseats data; the relative path means the CSV is expected in
# the notebook's working directory.
df = pd.read_csv("Carseats.csv")

In [238]:
# Preview the first rows to confirm the file loaded with the expected columns.
df.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


### __Cleaning Data__

In [239]:
# Encode the three categorical columns as numeric (float) features.

# Yes/No columns become 1.0 / 0.0 indicators.
df['Urban_Yes'] = np.where(df['Urban'] == 'Yes', 1.0, 0.0)
df['US_Yes'] = np.where(df['US'] == 'Yes', 1.0, 0.0)

# Shelf location becomes an ordinal rank: Good -> 2, Medium -> 1,
# every other value (e.g. Bad) -> 0.
df['ShelveLoc_rank'] = np.where(
    df['ShelveLoc'] == 'Good', 2.0,
    np.where(df['ShelveLoc'] == 'Medium', 1.0, 0.0),
)

In [240]:
# Display the frame to check the newly added encoded columns
# (Urban_Yes, US_Yes, ShelveLoc_rank) against their source columns.
df

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,Urban_Yes,US_Yes,ShelveLoc_rank
0,9.50,138,73,11,276,120,Bad,42,17,Yes,Yes,1.0,1.0,0.0
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes,1.0,1.0,2.0
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes,1.0,1.0,1.0
3,7.40,117,100,4,466,97,Medium,55,14,Yes,Yes,1.0,1.0,1.0
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,Yes,Yes,1.0,1.0,2.0
396,6.14,139,23,3,37,120,Medium,55,11,No,Yes,0.0,1.0,1.0
397,7.41,162,26,12,368,159,Medium,40,18,Yes,Yes,1.0,1.0,1.0
398,5.94,100,79,7,284,95,Bad,50,12,Yes,Yes,1.0,1.0,0.0


In [241]:
# The original text columns are now redundant with their numeric encodings,
# so remove them from the working frame.
df = df.drop(columns=["ShelveLoc", "Urban", "US"])

In [242]:
# Count missing values per column.
df.isnull().sum()

Sales             0
CompPrice         0
Income            0
Advertising       0
Population        0
Price             0
Age               0
Education         0
Urban_Yes         0
US_Yes            0
ShelveLoc_rank    0
dtype: int64

In [243]:
# Summary statistics (count, mean, std, quartiles, min/max) for every column.
df.describe()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education,Urban_Yes,US_Yes,ShelveLoc_rank
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,7.496325,124.975,68.6575,6.635,264.84,115.795,53.3225,13.9,0.705,0.645,0.9725
std,2.824115,15.334512,27.986037,6.650364,147.376436,23.676664,16.200297,2.620528,0.456614,0.479113,0.672961
min,0.0,77.0,21.0,0.0,10.0,24.0,25.0,10.0,0.0,0.0,0.0
25%,5.39,115.0,42.75,0.0,139.0,100.0,39.75,12.0,0.0,0.0,1.0
50%,7.49,125.0,69.0,5.0,272.0,117.0,54.5,14.0,1.0,1.0,1.0
75%,9.32,135.0,91.0,12.0,398.5,131.0,66.0,16.0,1.0,1.0,1.0
max,16.27,175.0,120.0,29.0,509.0,191.0,80.0,18.0,1.0,1.0,2.0


In [244]:
# Show each column's dtype and non-null count before the integer casts below.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sales           400 non-null    float64
 1   CompPrice       400 non-null    int64  
 2   Income          400 non-null    int64  
 3   Advertising     400 non-null    int64  
 4   Population      400 non-null    int64  
 5   Price           400 non-null    int64  
 6   Age             400 non-null    int64  
 7   Education       400 non-null    int64  
 8   Urban_Yes       400 non-null    float64
 9   US_Yes          400 non-null    float64
 10  ShelveLoc_rank  400 non-null    float64
dtypes: float64(4), int64(7)
memory usage: 34.5 KB


In [245]:
# Cast the target and the encoded indicator/rank columns to int64.
# Note: Sales holds fractional values, and the integer cast drops the
# fractional part (e.g. 9.50 -> 9), turning it into discrete labels that
# are later used as the classification target.
df = df.astype({
    'Sales': 'int64',
    'Urban_Yes': 'int64',
    'US_Yes': 'int64',
    'ShelveLoc_rank': 'int64',
})

### __Train/Test Set Split__

In [246]:
# Separate the target from the features by column name rather than by
# hard-coded positions, so the split stays correct if columns are added,
# removed or reordered upstream.
# y: the integer Sales labels; X: every remaining column.
y = df['Sales']
X = df.drop(columns='Sales')

In [247]:
# Preview the feature matrix (all columns except Sales).
X.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,Education,Urban_Yes,US_Yes,ShelveLoc_rank
0,138,73,11,276,120,42,17,1,1,0
1,111,48,16,260,83,65,10,1,1,2
2,113,35,10,269,80,59,12,1,1,1
3,117,100,4,466,97,55,14,1,1,1
4,141,64,3,340,128,38,13,1,0,0


In [248]:
# Preview the target: Sales after the int64 cast.
y.head()

0     9
1    11
2    10
3     7
4     4
Name: Sales, dtype: int64

In [249]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs. train_test_split is already imported in
# the imports cell at the top of the notebook, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21
)

### __RandomForestClassifier Integration To Find Best Features__

In [250]:
from sklearn.ensemble import RandomForestClassifier

# Random forest using the Gini criterion. Tree depth is capped at 8 and a
# node needs at least 10 samples before it may be split; random_state is
# fixed so repeated runs build the same forest.
clf = RandomForestClassifier(
    criterion="gini",
    max_depth=8,
    min_samples_split=10,
    random_state=21,
)

In [251]:
# Fit the forest on the training split only; the test split stays unseen.
clf.fit(X_train, y_train)

In [252]:
# Importance scores from the fitted forest: one value per feature column
# the model was trained on (10 here).
clf.feature_importances_

array([0.11506984, 0.1294934 , 0.10339521, 0.12909298, 0.18345058,
       0.13668295, 0.06725481, 0.02358471, 0.02682217, 0.08515335])

In [253]:
# NOTE(review): df.columns also contains the target 'Sales' at position 0,
# which is not a model feature. The 10 importance values correspond to the
# 10 columns that follow it ('CompPrice' ... 'ShelveLoc_rank'), so this
# listing is offset by one relative to the importances array.
df.columns

Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',
       'Age', 'Education', 'Urban_Yes', 'US_Yes', 'ShelveLoc_rank'],
      dtype='object')

In [254]:
# Predict a Sales class for every row of the held-out test set.
y_pred = clf.predict(X_test)

In [255]:
# Inspect the raw predicted class labels.
y_pred

array([ 5,  6,  4,  8,  4,  4,  9,  5, 10,  5,  9,  5,  5,  6,  5,  5,  4,
        7, 11,  8,  8, 10,  6,  5,  7,  5,  8,  4,  5,  5,  5,  5,  9,  7,
        7,  5,  5, 10,  5,  7,  5, 10, 11,  6,  5,  5,  8,  6,  6,  6, 11,
        7, 10,  5,  5, 10,  7,  5,  5,  8,  8, 11,  7,  6,  5,  5,  6,  4,
        7,  5,  6,  8,  9,  5,  5,  5,  6,  8, 11,  5,  9,  8, 11,  7,  9,
        5,  7,  5,  5, 10,  9,  5, 11,  9, 11,  5,  5,  5,  5,  5,  9,  5,
        8,  8,  6,  6,  9,  5,  7,  5,  5,  5,  6,  5,  8,  6,  8,  8, 10,
        6])

In [256]:
# Cross-tabulate the true and predicted Sales classes on the test set.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 2, 8, 3, 2, 2, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 9, 5, 0, 2, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 7, 4, 2, 1, 2, 1, 0, 0, 0, 0],
       [0, 0, 0, 2, 6, 2, 4, 4, 3, 2, 0, 0, 0, 0],
       [0, 0, 0, 1, 5, 1, 2, 2, 1, 2, 3, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 2, 1, 1, 2, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 2, 2, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [257]:
# Share of test rows whose predicted Sales class matches the true class.
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.18333333333333332

In [258]:
# 10-fold cross-validation of the classifier on the training split;
# the result holds one score per fold.
from sklearn.model_selection import cross_val_score
cross_val_score(clf, X_train, y_train, cv = 10)



array([0.14285714, 0.25      , 0.17857143, 0.10714286, 0.21428571,
       0.21428571, 0.25      , 0.14285714, 0.25      , 0.14285714])