In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Titanic passenger data, read directly from the codebasics GitHub repository.
TITANIC_CSV_URL = 'https://raw.githubusercontent.com/codebasics/py/master/ML/9_decision_tree/Exercise/titanic.csv'
df = pd.read_csv(TITANIC_CSV_URL)

In [4]:
# Preview the first five rows of the raw data to see which columns are available.
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Keep only the target (Survived) and the four features used for modelling.
# `.copy()` makes this an independent frame: later cells assign to its 'Age'
# and 'Sex' columns, and doing that on a bare selection of another frame is
# what raises pandas' SettingWithCopyWarning.
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'Fare']].copy()

In [6]:
# Confirm that only the five selected columns remain.
df.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


## Cleaning the Dataset

In [7]:
# Count the missing values in every column.
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
dtype: int64

In [8]:
# Average age within each Sex group (missing ages are skipped by the mean).
df.groupby("Sex")["Age"].agg("mean")

Sex
female    27.915709
male      30.726645
Name: Age, dtype: float64

In [9]:
def _fill_missing_with_mean(ages):
    """Return `ages` with its NaN entries replaced by the mean of `ages`."""
    return ages.fillna(ages.mean())


# Impute each missing Age with the mean age of that passenger's own Sex group.
df['Age'] = df.groupby("Sex")['Age'].transform(_fill_missing_with_mean)

## Converting the Categorical Column to a Numerical One

In [10]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels in 'Sex' with integer codes. The encoder is kept
# in `le` so the original labels remain recoverable from it.
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])

In [11]:
# Check the encoded 'Sex' column alongside the other features.
df.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,1,22.0,7.25
1,1,1,0,38.0,71.2833
2,1,3,0,26.0,7.925
3,1,1,0,35.0,53.1
4,0,3,1,35.0,8.05


In [12]:
# Feature matrix (X) and target vector (Y) for the classifier.
feature_columns = ['Pclass', 'Sex', 'Age', 'Fare']
X = df[feature_columns]
Y = df['Survived']

## Splitting the Data

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=101,
)

In [18]:
# Report the (rows, columns) shape of the training and testing feature sets.
print(f"Shape of Training Data is {x_train.shape}\nand the Testing Data is {x_test.shape}")

Shape of Training Data is (623, 4)
and the Testing Data is (268, 4)


## Import Classifier and Fit the Data

In [22]:
from sklearn.ensemble import RandomForestClassifier
classifier_rf=RandomForestClassifier(random_state=42,
                                     max_depth=4,
                                     n_estimators=100,
                                    n_jobs=-1,
                                    oob_score=True)
%time
classifier_rf.fit(x_train,y_train)

CPU times: user 15 µs, sys: 1 µs, total: 16 µs
Wall time: 30.8 µs


RandomForestClassifier(max_depth=4, n_jobs=-1, oob_score=True, random_state=42)

In [23]:
# Out-of-bag score of the fitted forest; this attribute is available because
# the classifier was constructed with oob_score=True.
classifier_rf.oob_score_

0.8170144462279294

## Hyperparameter Tuning for RandomForest

In [33]:
# Base estimator handed to the grid search.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

# Search space: 5 depths x 6 leaf sizes x 8 forest sizes = 240 combinations.
params = dict(
    max_depth=[2, 3, 5, 10, 20],
    min_samples_leaf=[5, 10, 20, 50, 100, 200],
    n_estimators=[5, 10, 25, 30, 50, 100, 200, 300],
)

In [34]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `params`, ranking every combination by its
# 4-fold cross-validated accuracy and running the fits in parallel.
grid_search = GridSearchCV(
    rf,
    params,
    scoring="accuracy",
    cv=4,
    n_jobs=-1,
    verbose=1,
)

In [35]:
%time
grid_search.fit(x_train, y_train)

CPU times: user 15 µs, sys: 1e+03 ns, total: 16 µs
Wall time: 31.2 µs
Fitting 4 folds for each of 240 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   56.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:  5.3min finished


GridSearchCV(cv=4, estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [2, 3, 5, 10, 20],
                         'min_samples_leaf': [5, 10, 20, 50, 100, 200],
                         'n_estimators': [5, 10, 25, 30, 50, 100, 200, 300]},
             scoring='accuracy', verbose=1)

In [36]:
# Best mean cross-validated accuracy found by the search (scoring="accuracy", cv=4).
grid_search.best_score_

0.8170181968569066

In [37]:
# Keep the estimator built from the winning parameter combination, then
# display it so the chosen hyperparameters are visible.
rf_best=grid_search.best_estimator_
rf_best

RandomForestClassifier(max_depth=20, min_samples_leaf=5, n_jobs=-1,
                       random_state=42)

In [38]:
# Pair every training feature name with its importance in the tuned forest.
imp_def = pd.DataFrame(
    dict(
        Columns=x_train.columns,
        imp=rf_best.feature_importances_,
    )
)

In [41]:
# Rank the features from most to least important.
imp_def.sort_values("imp", ascending=False)

Unnamed: 0,Columns,imp
1,Sex,0.453081
3,Fare,0.232793
2,Age,0.176564
0,Pclass,0.137561
