# Random Forest Classifier

# Importing essential libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  
import warnings
import pickle
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle
warnings.filterwarnings('ignore')
%matplotlib inline 

# Reading the Dataset

In [5]:
diabetes = pd.read_csv("OriginalDataset.csv")

In [6]:
diabetes

Unnamed: 0,age,sex,residence,Systollic Blood Pressure,Distollic Blood Pressure,BMI,Hypertension,Family History of diabetes,alcohol intake,Currently a smoker,Obesity,Physically Inactive,occupation,diabetes status
0,13,0,1,143.0,108.0,19.477147,0.0,0.0,0.0,0.0,0,0,1,Low-Risk
1,14,1,1,,,17.928215,0.0,0.0,0.0,0.0,0,0,3,Low-Risk
2,14,1,1,125.0,83.0,20.897959,1.0,0.0,0.0,0.0,0,0,1,High-Risk
3,15,1,1,101.0,58.0,32.342449,0.0,0.0,0.0,0.0,1,1,1,Low-Risk
4,15,0,1,121.0,80.0,22.278827,0.0,0.0,0.0,0.0,0,0,3,High-Risk
5,15,1,1,103.0,64.0,28.507522,0.0,0.0,0.0,0.0,1,1,1,High-Risk
6,15,0,1,134.0,91.0,20.861120,0.0,1.0,,0.0,0,0,3,High-Risk
7,17,0,1,107.0,61.0,25.315454,0.0,1.0,1.0,0.0,1,1,1,Low-Risk
8,17,0,0,90.0,61.0,15.822159,1.0,0.0,0.0,0.0,0,0,1,High-Risk
9,17,1,1,107.0,76.0,22.370343,0.0,0.0,0.0,1.0,0,0,1,High-Risk


# Shuffle the data to avoid biasness

In [7]:
diabetes = shuffle(diabetes,random_state = 22)

# Filling the null values with the forward fill (ffill)

In [8]:
diabetes = pd.DataFrame(diabetes).fillna(method='ffill')

# Importing a preprocessing module

In [9]:
from sklearn.preprocessing import LabelEncoder

# Convert target label to numerical Data
le = LabelEncoder()
diabetes['diabetes status'] = le.fit_transform(diabetes['diabetes status'])
#diabetes['occupation'] = le.fit_transform(diabetes['occupation'])

# Separate training features from target

In [10]:
from sklearn.model_selection import train_test_split
X = diabetes.drop(columns=['diabetes status','occupation'])
y = diabetes['diabetes status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

# Model Selection

In [11]:
# Creating Random Forest Model
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=20)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [12]:
classifier.fit(X_test,y_test)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [13]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Optimize model paramaters 

#Set your parameter grid here
param_grid = {}
grid_model = GridSearchCV(classifier, param_grid)
grid_model.fit(X_train, y_train)
grid_model.fit(X_test,y_test)
print(grid_model.best_params_)

{}


In [14]:
# import evaluation metrics 
from sklearn.metrics import confusion_matrix, accuracy_score

#evaluate the model
y_pred = classifier.predict(X_test)

# Get error rate
print("Error rate of Random Forest classifier: ", 1 - accuracy_score(y_test, y_pred))

# Get confusion matrix
confusion_matrix(y_pred, y_test)


Error rate of Random Forest classifier:  0.002666666666666706


array([[150,   1],
       [  1, 598]], dtype=int64)

In [15]:
accuracy = accuracy_score(y_pred, y_test) * 100
print('Accuracy: %f' % accuracy)

Accuracy: 99.733333


# Creating a pickle file for the classifier

In [16]:
# Creating a pickle file for the classifier
filename = 'diabetes-prediction-rfc-model.pkl'
pickle.dump(classifier, open(filename, 'wb'))