In [37]:
# import pandas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split 
from sklearn.metrics import roc_curve, auc
from sklearn import metrics 

from sklearn.tree import export_graphviz
import graphviz
from io import StringIO
from IPython.display import Image  
import pydotplus
import os

%matplotlib inline

In [7]:
# list for column headers
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# load data
df = pd.read_csv("https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv", names=names)

* explore the dataset
    - head
    - shape

In [8]:
df.columns

Index(['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'], dtype='object')

* create X and y (the goal is to predict column **class** based on other variables)

In [9]:
x = df.drop('class', axis = 1)
y = df['class']

In [83]:
x.shape

(768, 8)

* split data set to train set and test set

In [14]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.3)

* import RandomForestClassifier from sklearn

In [None]:
# done above

* create model

In [16]:
RF = RandomForestClassifier()

* fit training set with default parameters

In [17]:
RF.fit(X_train,y_train)

RandomForestClassifier()

* predict X_test

In [18]:
y_pred = RF.predict(X_test)

In [19]:
metrics.accuracy_score(y_test, y_pred)

0.7532467532467533

* import roc_auc_score and confusion_matrix from sklearn

In [None]:
#done above

* print confusion matrix

In [21]:
con_mat = metrics.confusion_matrix(y_test,y_pred)
con_mat

array([[123,  21],
       [ 36,  51]])

* print AUC

In [28]:
fpr, tpr, thresholds = metrics.roc_curve(list(y_test), y_pred)
metrics.auc(fpr, tpr)

0.7201867816091954

* import GridSearchCV from sklearn

In [30]:
# done above

* create grid (optimize for number of trees and max depth in one tree)

In [70]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 5)]
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
random_grid = {'n_estimators': n_estimators, 'max_depth': max_depth}
n_folds = 10

In [71]:
n_estimators

[100, 325, 550, 775, 1000]

In [72]:
max_depth

[10, 35, 60, 85, 110]

In [73]:
RF = RandomForestClassifier()

* fit training data with grid search

In [74]:
rg_gs = GridSearchCV(RF,random_grid,cv=n_folds)

In [75]:
rg_gs.fit(X_train,y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [10, 35, 60, 85, 110],
                         'n_estimators': [100, 325, 550, 775, 1000]})

In [76]:
print(rg_gs.best_params_)
print(rg_gs.best_score_)

{'max_depth': 110, 'n_estimators': 325}
0.7764150943396226


* print confusion matrix with the best model

In [77]:
RF = RandomForestClassifier(max_depth = 110, n_estimators = 325)

In [78]:
RF.fit(X_train,y_train)

RandomForestClassifier(max_depth=110, n_estimators=325)

In [79]:
y_pred = RF.predict(X_test)

In [80]:
metrics.accuracy_score(y_test, y_pred)

0.7489177489177489

In [81]:
con_mat = metrics.confusion_matrix(y_test,y_pred)
con_mat

array([[124,  20],
       [ 38,  49]])

* print AUC with the best model

In [82]:
fpr, tpr, thresholds = metrics.roc_curve(list(y_test), y_pred)
metrics.auc(fpr, tpr)

0.7121647509578544

- is the model better than default?

In [None]:
# not yet