# Random Forest Classifier

# Importing essential libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  
import warnings
import pickle
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle
warnings.filterwarnings('ignore')
%matplotlib inline 

  import pandas.util.testing as tm


# Reading the Dataset

In [4]:
diabetes = pd.read_csv("OriginalDataset.csv")

In [5]:
diabetes

Unnamed: 0,age,sex,residence,Systollic Blood Pressure,Distollic Blood Pressure,BMI,Hypertension,Family History of diabetes,alcohol intake,Currently a smoker,Obesity,Physically Inactive,occupation,diabetes status
0,13,0,1,143.0,108.0,19.477147,0.0,0.0,0.0,0.0,0,0,1,Low-Risk
1,14,1,1,,,17.928215,0.0,0.0,0.0,0.0,0,0,3,Low-Risk
2,14,1,1,125.0,83.0,20.897959,1.0,0.0,0.0,0.0,0,0,1,High-Risk
3,15,1,1,101.0,58.0,32.342449,0.0,0.0,0.0,0.0,1,1,1,Low-Risk
4,15,0,1,121.0,80.0,22.278827,0.0,0.0,0.0,0.0,0,0,3,High-Risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2494,60,1,0,140.0,72.0,17.065557,1.0,0.0,0.0,0.0,0,0,1,Low-Risk
2495,60,1,0,108.0,68.0,20.957171,0.0,0.0,0.0,0.0,0,0,1,High-Risk
2496,45,0,1,163.0,78.0,18.091200,1.0,0.0,1.0,0.0,0,0,1,High-Risk
2497,30,1,0,128.0,80.0,20.123400,1.0,0.0,1.0,0.0,0,0,1,High-Risk


# Shuffle the data to avoid biasness

In [6]:
diabetes = shuffle(diabetes,random_state = 22)

# Filling the null values with the forward fill (ffill)

In [7]:
diabetes = pd.DataFrame(diabetes).fillna(method='ffill')

# Importing a preprocessing module

In [8]:
from sklearn.preprocessing import LabelEncoder

# Convert target label to numerical Data
le = LabelEncoder()
diabetes['diabetes status'] = le.fit_transform(diabetes['diabetes status'])
#diabetes['occupation'] = le.fit_transform(diabetes['occupation'])

# Separate training features from target

In [9]:
from sklearn.model_selection import train_test_split
X = diabetes.drop(columns=['diabetes status','occupation'])
y = diabetes['diabetes status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

# Model Selection

In [10]:
# Creating Random Forest Model
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=20)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [11]:
classifier.fit(X_test,y_test)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [12]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Optimize model paramaters 

#Set your parameter grid here
param_grid = {}
grid_model = GridSearchCV(classifier, param_grid)
grid_model.fit(X_train, y_train)
grid_model.fit(X_test,y_test)
print(grid_model.best_params_)

{}


In [13]:
# import evaluation metrics 
from sklearn.metrics import confusion_matrix, accuracy_score

#evaluate the model
y_pred = classifier.predict(X_test)

# Get error rate
print("Error rate of Random Forest classifier: ", 1 - accuracy_score(y_test, y_pred))

# Get confusion matrix
confusion_matrix(y_pred, y_test)


Error rate of Random Forest classifier:  0.0040000000000000036


array([[148,   0],
       [  3, 599]])

In [14]:
accuracy = accuracy_score(y_pred, y_test) * 100
print('Accuracy: %f' % accuracy)

Accuracy: 99.600000


In [18]:
print(X.columns)

Index(['age', 'sex', 'residence', 'Systollic Blood Pressure',
       'Distollic Blood Pressure', 'BMI', 'Hypertension',
       'Family History of diabetes', 'alcohol intake', 'Currently a smoker ',
       'Obesity', 'Physically Inactive'],
      dtype='object')


# Creating a pickle file for the classifier

In [17]:
# save the classifier model using joblib
import joblib
joblib.dump(classifier, "classifier_model.pkl")

#persist the list of columns from training
model_columns = list(X.columns)
joblib.dump(model_columns, 'model_columns.pkl')

#using pickle
#pickle.dump(classifier, "diabettes.pkl")


['model_columns.pkl']

In [32]:
# load the model from disk
loaded_classifier = joblib.load("classifier_model.pkl")

#test loaded model
loaded_classifier.predict(X_test)

array([1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,