# Import necessary pacakages

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.io import arff

# For SGD classifier
from sklearn.linear_model import SGDClassifier

# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Read the data

In [2]:
data_arr,meta_data = arff.loadarff("datasets/dataset_diabetes.arff")

In [3]:
meta_data

Dataset: pima_diabetes
	preg's type is numeric
	plas's type is numeric
	pres's type is numeric
	skin's type is numeric
	insu's type is numeric
	mass's type is numeric
	pedi's type is numeric
	age's type is numeric
	class's type is nominal, range is ('tested_negative', 'tested_positive')

In [4]:
df_data = pd.DataFrame(data_arr)

# Data Inspection

<h3>Content</h3>
<p>Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.</p>
<ul>
<li>Pregnancies: Number of times pregnant </li>
<li>Glucose: Plasma glucose concentration a 2 hours in an oral glucose tolerance test </li>
<li>BloodPressure: Diastolic blood pressure (mm Hg) </li>
<li>SkinThickness: Triceps skin fold thickness (mm) </li>
<li>Insulin: 2-Hour serum insulin (mu U/ml) </li>
<li>BMI: Body mass index (weight in kg/(height in m)^2) </li>
<li>DiabetesPedigreeFunction: Diabetes pedigree function </li>
<li>Age: Age (years) </li>
<li>Outcome: Class variable (0 or 1)</li>
</ul>

# Data Preparation for Modelling

In [5]:
df_data["class_cat"] = pd.get_dummies(df_data["class"],drop_first = True)
df_data.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class,class_cat
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,b'tested_positive',True
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,b'tested_negative',False
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,b'tested_positive',True
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,b'tested_negative',False
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,b'tested_positive',True


In [10]:
train_col_sel = set(df_data.columns) - set(["class","class_cat"])

X = df_data.reindex(columns = train_col_sel)
y = df_data["class_cat"]

In [11]:
X.describe()

Unnamed: 0,plas,skin,pres,age,pedi,insu,mass,preg
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,120.894531,20.536458,69.105469,33.240885,0.471876,79.799479,31.992578,3.845052
std,31.972618,15.952218,19.355807,11.760232,0.331329,115.244002,7.88416,3.369578
min,0.0,0.0,0.0,21.0,0.078,0.0,0.0,0.0
25%,99.0,0.0,62.0,24.0,0.24375,0.0,27.3,1.0
50%,117.0,23.0,72.0,29.0,0.3725,30.5,32.0,3.0
75%,140.25,32.0,80.0,41.0,0.62625,127.25,36.6,6.0
max,199.0,99.0,122.0,81.0,2.42,846.0,67.1,17.0


In [12]:
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify = y)

# Model

## SGD Classifier

In [8]:
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train,y_train)

In [9]:
dict_target_class = {0:"Non_Diabetic",1: "Diabetic"}

In [10]:
df_data.loc[X_test.index].head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class,class_cat
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0,b'tested_negative',False
208,1.0,96.0,64.0,27.0,87.0,33.2,0.289,21.0,b'tested_negative',False
620,2.0,112.0,86.0,42.0,160.0,38.4,0.246,28.0,b'tested_negative',False
535,4.0,132.0,0.0,0.0,0.0,32.9,0.302,23.0,b'tested_positive',True
223,7.0,142.0,60.0,33.0,190.0,28.8,0.687,61.0,b'tested_negative',False


In [54]:
df_data.loc[df_data["class_cat"] == False].iloc[1]

preg                        1.0
plas                       89.0
pres                       66.0
skin                       23.0
insu                       94.0
mass                       28.1
pedi                      0.167
age                        21.0
class        b'tested_negative'
class_cat                 False
Name: 3, dtype: object

In [46]:
X_test.iloc[0]

pedi      0.245
insu    112.000
plas    121.000
age      30.000
pres     72.000
mass     26.200
skin     23.000
preg      5.000
Name: 765, dtype: float64

In [48]:
sgd_clf.predict(X_test.iloc[[0]])

array([ True])

In [12]:
y_pred = sgd_clf.predict(X_test)

In [13]:

confusion_matrix(y_test, y_pred)

array([[  6, 119],
       [  2,  65]])

In [14]:
accuracy_score(y_test,y_pred)

0.3697916666666667

## Random Forest Classifier

In [15]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(random_state=42)


In [16]:
forest_clf.fit(X_train,y_train)

In [17]:
y_pred_forest = forest_clf.predict(X_test)

In [18]:
confusion_matrix(y_test, y_pred_forest)

array([[103,  22],
       [ 32,  35]])

In [19]:
accuracy_score(y_test,y_pred_forest)

0.71875

## Decision Trees

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

In [21]:
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X_test,y_test)

In [22]:
y_pred_d_tree = tree_clf.predict(X_test)

In [23]:
confusion_matrix(y_test, y_pred_d_tree)

array([[98, 27],
       [18, 49]])

In [24]:
accuracy_score(y_test,y_pred_d_tree)

0.765625

# Saving Models

In [31]:
import shelve as sh

In [33]:
with sh.open("./model_UI/models_data/models") as md:
    md["SGD"] = sgd_clf
    md["Decision_Tree"] = tree_clf
    md["Random_Forest"] = forest_clf

In [None]:
sh.open()
    

In [34]:
X.head()

Unnamed: 0,pedi,insu,plas,age,pres,mass,skin,preg
0,0.627,0.0,148.0,50.0,72.0,33.6,35.0,6.0
1,0.351,0.0,85.0,31.0,66.0,26.6,29.0,1.0
2,0.672,0.0,183.0,32.0,64.0,23.3,0.0,8.0
3,0.167,94.0,89.0,21.0,66.0,28.1,23.0,1.0
4,2.288,168.0,137.0,33.0,40.0,43.1,35.0,0.0


In [37]:
X.pedi.min(),X.pedi.max()

(np.float64(0.078), np.float64(2.42))