## Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

## Load the dataset

In [2]:
df = pd.read_csv('C:/Users/Mehedi Hassan Galib/Desktop/R/Cardiotocographic.csv')
df.head()

Unnamed: 0,LB,AC,FM,UC,DL,DS,DP,ASTV,MSTV,ALTV,...,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP
0,120,0.0,0.0,0.0,0.0,0.0,0.0,73,0.5,43,...,62,126,2,0,120,137,121,73,1,2
1,132,0.00638,0.0,0.00638,0.00319,0.0,0.0,17,2.1,0,...,68,198,6,1,141,136,140,12,0,1
2,133,0.003322,0.0,0.008306,0.003322,0.0,0.0,16,2.1,0,...,68,198,5,1,141,135,138,13,0,1
3,134,0.002561,0.0,0.007682,0.002561,0.0,0.0,16,2.4,0,...,53,170,11,0,137,134,137,13,1,1
4,132,0.006515,0.0,0.008143,0.0,0.0,0.0,16,2.4,0,...,53,170,9,0,137,136,138,11,1,1


## Shape of the dataset

In [3]:
df.shape

(2126, 22)

## Checking if there are any missing values

In [4]:
df.isnull().sum()

LB          0
AC          0
FM          0
UC          0
DL          0
DS          0
DP          0
ASTV        0
MSTV        0
ALTV        0
MLTV        0
Width       0
Min         0
Max         0
Nmax        0
Nzeros      0
Mode        0
Mean        0
Median      0
Variance    0
Tendency    0
NSP         0
dtype: int64

## Splitting into explanatory and response variables

In [5]:
X = pd.DataFrame(df.iloc[:, 0:21])
y = df['NSP']

## Splitting into train and test set

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2)

<br/>

## Random Forest Classifier with all variables

In [23]:
rf = RandomForestClassifier()
model = rf.fit(X_train, y_train)

In [24]:
y_pred = rf.predict(X_test)

In [25]:
rf.score(X_test, y_test)

0.9389671361502347

<br/>

## Without using RFE
we can also choose the best variables with 'feature_importances_'. Values close to 1 are the best suited variables for the model. But removing one variable can change the significance of other variables. That's why there's always a risk.

In [26]:
print(rf.feature_importances_)

[0.03609536 0.05297009 0.02221743 0.04323044 0.0089301  0.00150667
 0.04554608 0.11826022 0.13275033 0.11842481 0.04414219 0.03624892
 0.03532332 0.02797143 0.01852948 0.00475516 0.06018399 0.0984824
 0.05282891 0.03407833 0.00752434]


<br/>


## So we will go for the less risky solution : The Mighty RFE
step = 10 - To speed up the RFE process, we can set the 'step' parameter to RFE. And the 10 means on each iterations 10 least important features will be dropped out.

In [27]:
rfe = RFE(estimator = RandomForestClassifier(), n_features_to_select = 18, step = 10, verbose = 1)
rfe.fit(X_train, y_train)

Fitting estimator with 21 features.


RFE(estimator=RandomForestClassifier(), n_features_to_select=18, step=10,
    verbose=1)

In [28]:
y1_pred = rfe.predict(X_test)

In [29]:
rfe.score(X_test, y_test)

0.9553990610328639

##### Conclussion:  It's a quite improvement from the first model.

<br/>

## Columns RFE choose for the model

In [14]:
print(X.columns[rfe.support_])

Index(['LB', 'AC', 'UC', 'DP', 'ASTV', 'MSTV', 'ALTV', 'MLTV', 'Width', 'Min',
       'Mode', 'Mean', 'Median', 'Variance'],
      dtype='object')


<br/>
<br/>

## Note:
I discussed RFE more in another [notebook](https://github.com/galibce003/Machine-Learning-with-Python/blob/master/Machine%20Learing%20Basics/Recursive%20Feature%20Elimination.ipynb).

<br/>
<br/>

## Feel free to share your thoughts and if you find it helpful, please upvote. Thanks!