In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data Preprocessing

### Importing the dataset

In [2]:
# importing datset

df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Checking null entries

In [3]:
# null entries count

df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


### Checking duplicate entries 

In [5]:
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
763    False
764    False
765    False
766    False
767    False
Length: 768, dtype: bool

In [6]:
df.duplicated().sum()

0

### Extracting feature and target values

In [7]:
# list of features

features = [x for x in df.columns if x not in df.columns[-1]]
features

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [8]:
# target label

target = df.columns[-1]
target

'Outcome'

In [9]:
# Extracting features

df_features = df.drop('Outcome', axis = 1)
df_features.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [10]:
# Converting dataframe to numpy array

x = np.array(df_features)   # feature array
x[:5,:]

array([[6.000e+00, 1.480e+02, 7.200e+01, 3.500e+01, 0.000e+00, 3.360e+01,
        6.270e-01, 5.000e+01],
       [1.000e+00, 8.500e+01, 6.600e+01, 2.900e+01, 0.000e+00, 2.660e+01,
        3.510e-01, 3.100e+01],
       [8.000e+00, 1.830e+02, 6.400e+01, 0.000e+00, 0.000e+00, 2.330e+01,
        6.720e-01, 3.200e+01],
       [1.000e+00, 8.900e+01, 6.600e+01, 2.300e+01, 9.400e+01, 2.810e+01,
        1.670e-01, 2.100e+01],
       [0.000e+00, 1.370e+02, 4.000e+01, 3.500e+01, 1.680e+02, 4.310e+01,
        2.288e+00, 3.300e+01]])

In [11]:
# Extracting target values

y = np.array(df['Outcome'])
y = np.expand_dims(y,axis=1) # reshaping to 2d array
y[:5]

array([[1],
       [0],
       [1],
       [0],
       [1]], dtype=int64)

### Splitting dataset into train, validation and test set

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
x_train, x_temp, y_train, y_temp = train_test_split(x,y, test_size=0.4, random_state = 1)
x_cv, x_test, y_cv, y_test = train_test_split(x_temp, y_temp, test_size=0.4, random_state = 1)

# Model Evaluation and Selection
Training error and validation error is computed for models with different layers and units. Finally, the errors are compared to get the best model
<br>
<br>
Note: Error is the fraction of misclassification on the given set

In [14]:
import tensorflow as tf  # importing TensorFlow module
from tensorflow import keras  # importing Keras module from TensorFlow
from tensorflow.keras.layers import Dense  # importing Dense layer class from Keras
from tensorflow.keras.models import Sequential  # importing Sequential model class from Keras
from tensorflow.keras.regularizers import l2  # importing L2 regularization from Keras regularizers

In [15]:
# function to build models with different number of layers and units

def build_models():
    model_1 = Sequential([
        keras.layers.Normalization(input_shape=(x_train.shape[1],)),   # Normalizing input features
        Dense(units=20, activation='relu', kernel_regularizer = l2(0.01), name='L1'), # using Ridge regularization
        Dense(units=10, activation='relu', kernel_regularizer = l2(0.01), name='L2'),
        Dense(units=1, activation='linear', kernel_regularizer = l2(0.01), name='L3')
    ], name = 'model_1')
        
    model_2 = Sequential([
        keras.layers.Normalization(input_shape=(x_train.shape[1],)),
        Dense(units=25, activation='relu', kernel_regularizer = l2(0.01), name='L1'),
        Dense(units=15, activation='relu', kernel_regularizer = l2(0.01), name='L2'),
        Dense(units=1, activation='linear', kernel_regularizer = l2(0.01), name='L3')
    ], name = 'model_2')

        
    model_3 = Sequential([
        keras.layers.Normalization(input_shape=(x_train.shape[1],)),
        Dense(units=20, activation='relu', kernel_regularizer = l2(0.01), name='L1'),
        Dense(units=12, activation='relu', kernel_regularizer = l2(0.01), name='L2'),
        Dense(units=12, activation='relu', kernel_regularizer = l2(0.01), name='L3'),
        Dense(units=1, activation='linear', kernel_regularizer = l2(0.01), name='L4')
    ], name = 'model_3')
        
    model_4 = Sequential([
        keras.layers.Normalization(input_shape=(x_train.shape[1],)),
        Dense(units=32, activation='relu', kernel_regularizer = l2(0.01), name='L1'),
        Dense(units=16, activation='relu', kernel_regularizer = l2(0.01), name='L2'),
        Dense(units=8, activation='relu', kernel_regularizer = l2(0.01), name='L3'),
        Dense(units=4, activation='relu', kernel_regularizer = l2(0.01), name='L4'),
        Dense(units=12, activation='relu', kernel_regularizer = l2(0.01), name='L5'),
        Dense(units=1, activation='linear', kernel_regularizer = l2(0.01), name='L6')
    ], name = 'model_4')
    
    model_list=[model_1, model_2, model_3, model_4]
        
    return model_list

In [16]:
# building different models

models = build_models()

In [17]:
# Training error of each model
train_error_list= []

# Validation error of each model
val_error_list= []


# training each model
for model in models:
    model.compile(
        loss = tf.keras.losses.BinaryCrossentropy(from_logits=True),
        optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01 )
    )
    
    print(f'training... {model.name}')
    model.fit(
        x_train, y_train,
        epochs=200,
        verbose=0  # not displaying training progress
    )
    
    # Computing training error
    yhat = model.predict(x_train)
    yhat_sigmoid = tf.nn.sigmoid(yhat)
    
    threshold = 0.5
    
    yhat = np.where(yhat_sigmoid >= threshold, 1, 0)
    train_error = yhat != y_train
    train_error = np.mean(train_error)
    train_error_list.append(train_error)  # computing training error
    
    # Computing validation error
    yhat = model.predict(x_cv)
    yhat_sigmoid = tf.nn.sigmoid(yhat)
    
    yhat = np.where(yhat_sigmoid >= threshold, 1, 0)
    val_error = yhat != y_cv
    val_error = np.mean(val_error)
    val_error_list.append(val_error)
        

training... model_1
training... model_2
training... model_3
training... model_4


In [18]:
print('Displaying Training  and Validation error for each models\n')
for i,j in enumerate(models):
    print(f'Training error for {j.name} = {train_error_list[i]}')
    print(f'Validation error for {j.name} = {val_error_list[i]}\n')


Displaying Training  and Validation error for each models

Training error for model_1 = 0.2217391304347826
Validation error for model_1 = 0.25

Training error for model_2 = 0.23478260869565218
Validation error for model_2 = 0.29891304347826086

Training error for model_3 = 0.25869565217391305
Validation error for model_3 = 0.25

Training error for model_4 = 0.24130434782608695
Validation error for model_4 = 0.23369565217391305



In [19]:
model_index = np.argmin(val_error_list)
models[model_index]

<keras.engine.sequential.Sequential at 0x2569e2858d0>

In [20]:
best_model = models[model_index]
best_model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization_3 (Normalizat  (None, 8)                17        
 ion)                                                            
                                                                 
 L1 (Dense)                  (None, 32)                288       
                                                                 
 L2 (Dense)                  (None, 16)                528       
                                                                 
 L3 (Dense)                  (None, 8)                 136       
                                                                 
 L4 (Dense)                  (None, 4)                 36        
                                                                 
 L5 (Dense)                  (None, 12)                60        
                                                           

# Test Error OR Generalization  

In [21]:
yhat = best_model.predict(x_test)
yhat_sigmoid = tf.nn.sigmoid(yhat)
yhat = np.where(yhat_sigmoid >= threshold, 1, 0)

test_error = yhat != y_test
test_error = np.mean(test_error)
print(f'Test error = {test_error}')

Test error = 0.27419354838709675


## Saving Model 

In [22]:
tf.keras.models.save_model(best_model,'model.h5')

## Loading trained model


In [23]:
nn_model = tf.keras.models.load_model('model.h5')

In [None]:
# taking input features to make predictin

print('Enter the following data:')
x_input = []

for i in range(len(features)):
    print(features[i])
    x_i = float(input())
    x_input.append(x_i)
    
x_input = np.array(x_input)
pred = nn_model.predict(x_input)
pred = tf.nn.sigmoid(pred)

Enter the following data:
Pregnancies
0
Glucose


In [25]:
# displaying the predicted result
if pred>threshold:
    print('Sorry! You have a higher risk of developing diabetes based on your medical data.')
else:
    print('Congratulations! You have a low risk of developing diabetes based on your medical data.')

Congratulations! You have a low risk of developing diabetes based on your medical data.
