In [9]:
# import the usual libraries
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from random import randint
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score

In [17]:
df = pd.read_csv("data/data.csv",names=["A"+str(x) for x in range(1,17,1)])
df2 = df.copy()
df2.rename(columns={'A16':'target'},inplace=True) # rename last column to target since it gound level truth
for col in df2.columns:
    if df2[col].dtype == 'object':
        df2[col] = df2[col].astype('category')
        df2[col] = df2[col].cat.codes
df2 = df2.sample(frac=1) # shuffle data
df2.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
501,2,227,5.0,2,1,3,8,3.5,1,1,10,1,0,0,0,0
640,2,188,2.75,2,1,7,1,2.5,0,0,0,1,0,82,200,1
210,2,231,5.875,2,1,3,4,10.0,1,1,14,1,0,128,0,0
268,2,330,1.54,2,1,11,8,0.125,1,0,0,1,0,90,0,0
56,2,77,0.0,2,1,3,8,0.085,1,0,0,0,0,0,0,0


Part 1: Using the RandomForest Classifier provided by the sklearn library

 1. Initialize the classifier with default arbitrary paramenters
 2. Train the classifier
 3. Determine the recall score of the classifier


In [18]:
#lets split our date into train and test
X = df2[df2.columns[:-1]]
y = df2['target']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [19]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
y_2pred = random_forest.predict(X_test)
print("Recall score is : {}".format(recall_score(y_test,y_2pred)))

Recall score is : 0.8571428571428571


Part 2: Using the RandomizedSearchCV module provided by the sklearn library

  1.  Do parameter tuning to obtain the optimal parameters to initialize the RandomForest Classifier. The parameters to tune are as follow:
        i. n_estimators
       ii. max_features
      iii. max_depth
       iv. min_samples_split
        v. min_sample_leaf
       vi. bootstrap

  2.  Determine the recall score of the classifier


In [20]:
clf = RandomForestClassifier()
param_dist = {"n_estimators":[9,10,11],
              "max_depth": [3,None],
              "max_features": ['auto','log2',None],
              "min_samples_split": [2,3,4],
              "min_samples_leaf": [1,2,3],
              "bootstrap": [True, False]}
rand_search = RandomizedSearchCV(clf,param_distributions=param_dist)
rand_search.fit(X_train,y_train)
rand_search.best_params_

{'bootstrap': True,
 'max_depth': 3,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 3,
 'n_estimators': 10}

In [22]:
random_forest_new = RandomForestClassifier(bootstrap=True,max_depth=3,max_features='auto',min_samples_leaf=2,\
                                       min_samples_split=3,n_estimators=10)
random_forest_new.fit(X_train, y_train)
y_2pred_new = random_forest_new.predict(X_test)
print("Recall score is : {}".format(recall_score(y_test,y_2pred_new)))

Recall score is : 0.922077922077922
