# **Prediction of onset of diabetes using diagnostic features**
Pima Indian Diabetes dataset has been used in this project.



> *Data import and Preprocessing*



In [2]:
import pandas as pd
import numpy as np

data = pd.read_csv('diabetes.csv')
data.head()

data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

> *Replacing 0 values*



In [3]:
#Replacing 0 values

df1 = data.loc[data['Outcome'] == 1]
df2 = data.loc[data['Outcome'] == 0]
df1 = df1.replace({'BloodPressure':0}, np.median(df1['BloodPressure']))
df1 = df1.replace({'BMI':0}, np.median(df1['BMI']))
df1 = df1.replace({'Glucose':0}, np.median(df1['Glucose']))
df1 = df1.replace({'Insulin':0}, np.median(df1['Insulin']))
df1 = df1.replace({'SkinThickness':0}, np.median(df1['SkinThickness']))
df2 = df2.replace({'BloodPressure':0}, np.median(df2['BloodPressure']))
df2 = df2.replace({'BMI':0}, np.median(df2['BMI']))
df2 = df2.replace({'Glucose':0}, np.median(df2['Glucose']))
df2 = df2.replace({'Insulin':0}, np.median(df2['Insulin']))
df2 = df2.replace({'SkinThickness':0}, np.median(df2['SkinThickness']))


In [4]:
dataframe = [df1, df2]
data = pd.concat(dataframe)

data.head()
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.677083,72.378906,27.43099,91.783854,32.433919,0.471876,33.240885,0.348958
std,3.369578,30.464161,12.104431,9.32146,108.121136,6.880664,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,0.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,21.0,39.0,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,27.0,39.0,32.05,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


> *Balancing Minority Class Data*

In [5]:
data.Outcome.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [6]:
pip install imbalanced-learn 



In [7]:
from collections import Counter
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X = data.drop(['Outcome'], axis=1)
y = data['Outcome']
counter = Counter(y)
print(counter)
X, y = oversample.fit_resample(X, y)
counter = Counter(y)
print(counter)
a = pd.DataFrame(X)
b = pd.DataFrame(y)
data1 = pd.concat([a,b], axis=1)
data1.columns = data.columns
data1.Outcome.value_counts()



Counter({0: 500, 1: 268})
Counter({1: 500, 0: 500})




1    500
0    500
Name: Outcome, dtype: int64

> *Scaling Data*

In [8]:
from sklearn import preprocessing
stdscaler = preprocessing.StandardScaler()
data2=pd.DataFrame(stdscaler.fit_transform(data1),columns = data.columns)
scaler = preprocessing.MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(data2),columns = data.columns)

> *Running 10 fold CV without any filter*

In [11]:
from numpy import mean
from numpy import std
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
X = df_scaled.drop(['Outcome'], axis=1)
y = df_scaled.Outcome
cv = KFold(n_splits=10, random_state=1, shuffle=True)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
RFC = RandomForestClassifier()
KNN=KNeighborsClassifier()
NB=GaussianNB()
scores = cross_val_score(RFC, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('RFC Accuracy 10 fold: %.4f (%.3f)' % (mean(scores)*100, std(scores)))
scores = cross_val_score(KNN, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('KNN Accuracy 10 fold: %.4f (%.3f)' % (mean(scores)*100, std(scores)))
scores = cross_val_score(NB, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('Naive Bayes Accuracy 10 fold: %.4f (%.3f)' % (mean(scores)*100, std(scores)))

RFC Accuracy 10 fold: 91.4000 (0.016)
KNN Accuracy 10 fold: 79.6000 (0.029)
Naive Bayes Accuracy 10 fold: 74.5000 (0.030)


#Filter Ensemble


> *Chi-Square Test*

In [12]:
import scipy.stats as stats
def chi_sqr(test_column,output_column):
    observed_value = pd.crosstab(test_column,output_column)
    val = stats.chi2_contingency(observed_value)
    expected_value=val[3]
    for o,e in zip(observed_value.values,expected_value):
        chi_2 = sum([(o-e)**2./e])       
    chi2_stat = np.sum(chi_2)
    return chi2_stat
    
df1 = df_scaled.drop(['Outcome'], axis=1)
top_n1 =[]
for i in df1.columns.values:
    top_n1.append(chi_sqr(df1[i],df_scaled['Outcome']))    
col_name1 = np.array(X.columns)
a1 = pd.DataFrame(top_n1)
b1 = pd.DataFrame(col_name1)
info1 = pd.concat([a1,b1], axis=1)
info1.columns = ['Score','Features']
top1 = info1.nsmallest(50,'Score')
ft1 = np.array(top1['Features'])
top1
ft1

array(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
       'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'], dtype=object)

> *Mutual Information*

In [15]:
from sklearn.feature_selection import mutual_info_classif
X = df_scaled.drop(['Outcome'], axis=1)
y = df_scaled.Outcome
res = mutual_info_classif(X,y,discrete_features=True)
col_name2 = np.array(X.columns)
a2 = pd.DataFrame(res)
b2 = pd.DataFrame(col_name2)
info2 = pd.concat([a2,b2], axis=1)
info2.columns = ['Info_Gain','Features']
top2 = info2.nlargest(50,'Info_Gain')
ft2 = np.array(top2['Features'])
ft2


array(['Insulin', 'DiabetesPedigreeFunction', 'BMI', 'Glucose',
       'SkinThickness', 'Age', 'BloodPressure', 'Pregnancies'],
      dtype=object)

In [18]:
pip install skrebate

Collecting skrebate
  Downloading https://files.pythonhosted.org/packages/ee/04/1fa0073d145cc38e1396ef21e8c6be998d152555c64d1d54db9ce8e2c62c/skrebate-0.62.tar.gz
Building wheels for collected packages: skrebate
  Building wheel for skrebate (setup.py) ... [?25l[?25hdone
  Created wheel for skrebate: filename=skrebate-0.62-cp37-none-any.whl size=29256 sha256=b6242e3e2977585d038fff5b6c87ae7ecbeb7162faf45b5a3f305eaa23b0df9f
  Stored in directory: /root/.cache/pip/wheels/6b/0e/2f/540392932d3a01060f04c242d3b9bee80dfd87f63d80b3a9a0
Successfully built skrebate
Installing collected packages: skrebate
Successfully installed skrebate-0.62


> *ReliefF*

In [19]:
from sklearn.pipeline import make_pipeline
from skrebate import ReliefF
from sklearn.pipeline import make_pipeline
from skrebate import ReliefF
features, classes = df_scaled.drop('Outcome', axis=1).values, df_scaled['Outcome'].values
arr = features.astype('float64')
fs = ReliefF()
fs.fit(arr, classes)
top_n3=[]
names=[]
for feature_name, feature_score in zip(df_scaled.drop('Outcome', axis=1).columns, fs.feature_importances_):
    top_n3.append(feature_score)
    names.append(feature_name)    
a3 = pd.DataFrame(top_n3)
b3 =pd.DataFrame(names)
info3 = pd.concat([a3,b3], axis=1)
info3.columns = ['Score','Features'] 
top3 = info3.nlargest(50,'Score')
ft3 = np.array(top3['Features'])
ft3


array(['Glucose', 'Insulin', 'Age', 'SkinThickness', 'BMI', 'Pregnancies',
       'DiabetesPedigreeFunction', 'BloodPressure'], dtype=object)

# Selecting top n features

> *Union Function* 

In [21]:
from functools import reduce
def top_fs_union(n):
    f1 = ft1[:n]
    f2 = ft2[:n]
    f3 = ft3[:n]
    return(reduce(np.union1d, (f1,f2,f3)))

> *Union function call with selected top n*

In [26]:
def assemble_run(n,classifier):
  top_fs=top_fs_union(n)
  top_fs=np.append(top_fs,['Outcome'], axis=0)
  dfs = df_scaled.loc[:,top_fs]
  X = dfs.drop(['Outcome'], axis=1)
  y = dfs['Outcome']
  model = classifier()
  scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
  print('Accuracy 10 fold: %.4f (%.3f)' % (mean(scores)*100, std(scores)))

# Checking Accuracy

> *RANDOM FOREST CLASSIFIER*

In [27]:
#Feature Subset by Union of Best Features from each

RFC1 = assemble_run(1,RandomForestClassifier)
RFC1 = assemble_run(2,RandomForestClassifier)
RFC1 = assemble_run(3,RandomForestClassifier)
RFC1 = assemble_run(4,RandomForestClassifier)
RFC1 = assemble_run(5,RandomForestClassifier)
RFC1 = assemble_run(6,RandomForestClassifier)
RFC2 = assemble_run(7,RandomForestClassifier)
RFC3 = assemble_run(8,RandomForestClassifier)

Accuracy 10 fold: 88.4000 (0.037)
Accuracy 10 fold: 90.1000 (0.034)
Accuracy 10 fold: 91.7000 (0.022)
Accuracy 10 fold: 92.1000 (0.023)
Accuracy 10 fold: 91.9000 (0.024)
Accuracy 10 fold: 91.4000 (0.024)
Accuracy 10 fold: 91.8000 (0.023)
Accuracy 10 fold: 92.5000 (0.021)


> *KNN CLASSIFIER*

In [28]:
#Feature Subset by Union of Best Features from each

KNN1 = assemble_run(5,KNeighborsClassifier)
KNN1 = assemble_run(10,KNeighborsClassifier)
KNN1 = assemble_run(15,KNeighborsClassifier)
KNN1 = assemble_run(20,KNeighborsClassifier)
KNN1 = assemble_run(25,KNeighborsClassifier)
KNN1 = assemble_run(30,KNeighborsClassifier)
KNN1 = assemble_run(35,KNeighborsClassifier)
KNN1 = assemble_run(40,KNeighborsClassifier)
KNN1 = assemble_run(45,KNeighborsClassifier)
KNN1 = assemble_run(50,KNeighborsClassifier)


Accuracy 10 fold: 79.6000 (0.029)
Accuracy 10 fold: 79.6000 (0.029)
Accuracy 10 fold: 79.6000 (0.029)
Accuracy 10 fold: 79.6000 (0.029)
Accuracy 10 fold: 79.6000 (0.029)
Accuracy 10 fold: 79.6000 (0.029)
Accuracy 10 fold: 79.6000 (0.029)
Accuracy 10 fold: 79.6000 (0.029)
Accuracy 10 fold: 79.6000 (0.029)
Accuracy 10 fold: 79.6000 (0.029)


> *NAIVE BAYES CLASSIFIER*

In [29]:
#Feature Subset by Union of Best Features from each
NB1 = assemble_run(5,GaussianNB)
NB1 = assemble_run(10,GaussianNB)
NB1 = assemble_run(15,GaussianNB)
NB1 = assemble_run(20,GaussianNB)
NB1 = assemble_run(25,GaussianNB)
NB1 = assemble_run(30,GaussianNB)
NB1 = assemble_run(35,GaussianNB)
NB1 = assemble_run(40,GaussianNB)
NB1 = assemble_run(45,GaussianNB)
NB1 = assemble_run(50,GaussianNB)

Accuracy 10 fold: 74.5000 (0.030)
Accuracy 10 fold: 74.5000 (0.030)
Accuracy 10 fold: 74.5000 (0.030)
Accuracy 10 fold: 74.5000 (0.030)
Accuracy 10 fold: 74.5000 (0.030)
Accuracy 10 fold: 74.5000 (0.030)
Accuracy 10 fold: 74.5000 (0.030)
Accuracy 10 fold: 74.5000 (0.030)
Accuracy 10 fold: 74.5000 (0.030)
Accuracy 10 fold: 74.5000 (0.030)


> *Accuracy check for individal feature set*

In [30]:
def fs_run(classifier,feature_set,no_of_attr):
  ft=feature_set[:no_of_attr]
  top_fs=np.append(ft,['Outcome'], axis=0)
  dfs = df_scaled.loc[:,top_fs]
  X = dfs.drop(['Outcome'], axis=1)
  y = dfs['Outcome']
  model = classifier()
  scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
  print('Accuracy 10 fold: %.4f (%.3f)' % (mean(scores)*100, std(scores)))

In [31]:
print('Accuracy for Feature set 1 with best attributes:')
fs_run(RandomForestClassifier,ft1,4)
fs_run(KNeighborsClassifier,ft1,4)
fs_run(GaussianNB,ft1,4)
print('Accuracy for Feature set 2 with best attributes:')
fs_run(RandomForestClassifier,ft2,4)
fs_run(KNeighborsClassifier,ft2,4)
fs_run(GaussianNB,ft2,4)
print('Accuracy for Feature set 3 with best attributes :')
fs_run(RandomForestClassifier,ft3,4)
fs_run(KNeighborsClassifier,ft3,4)
fs_run(GaussianNB,ft3,4)


Accuracy for Feature set 1 with best attributes:
Accuracy 10 fold: 83.5000 (0.036)
Accuracy 10 fold: 79.2000 (0.049)
Accuracy 10 fold: 75.0000 (0.028)
Accuracy for Feature set 2 with best attributes:
Accuracy 10 fold: 89.4000 (0.034)
Accuracy 10 fold: 81.6000 (0.041)
Accuracy 10 fold: 73.3000 (0.018)
Accuracy for Feature set 3 with best attributes :
Accuracy 10 fold: 91.1000 (0.023)
Accuracy 10 fold: 83.9000 (0.022)
Accuracy 10 fold: 74.0000 (0.029)
