# Diabetes Prediction with Deep Learning

### Machine Learning step by step:
   1. Business & Data Understanding
   2. Data Cleaning & Analysis
   3. Data Modeling
   4. Model Evaluation
   5. Model Deployment & Maintenance

## Import necessary libraries

#### Base Libraries

In [3]:
import pandas as pd
import numpy as np 
import os
from scipy import stats
from scipy.stats import skew

#### Visualization

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

#### Preprocessing

In [5]:
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#### Algorithms

In [6]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import Callback

ImportError: cannot import name 'defun_with_attributes' from 'tensorflow.python.eager.function' (C:\Users\ejhas\anaconda3\lib\site-packages\tensorflow\python\eager\function.py)

### Plot Style using Seaborn

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the plain 'seaborn' alias, so use whichever name
# this installation actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

## 1. Business & Data Understanding

### This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. 
The objective of this notebook is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. 
Several constraints were placed on the selection of these instances from a larger database. 
In particular, all patients here are females at least 21 years old of Pima Indian heritage.

### Load dataset

In [None]:
# The CSV has no header row, so the column labels are supplied here.
# 'y' is the target column (1 = diabetes, 0 = no diabetes).
column_names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'PedigreeFunction',
    'Age',
    'y',
]
df_diabetes = pd.read_csv(
    "C:/Users/ejhas/OneDrive/Documents/miracles/JOBS/AI Bootcamp/WEEK6/day1/Dataset/pima-indians-diabetes.csv",
    names=column_names,
    header=None,
)

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df_diabetes.info()

In [None]:
# Report the dataset dimensions taken from the DataFrame's shape tuple.
total_rows, total_columns = df_diabetes.shape
print('Total Rows : ', total_rows)
print('Total Columns/Features : ', total_columns)

In [None]:
# Last expression of the cell: the notebook renders the DataFrame as a table.
df_diabetes

In [None]:
# Draw a 9x9-inch grid of histograms, one per column of the frame.
df_diabetes.hist(figsize=(9, 9))

In [None]:
# One distribution plot (histogram with a KDE curve) per column.
for column_name in df_diabetes.columns:
    column_values = df_diabetes[column_name]
    sns.displot(column_values, kde=True)
plt.show()

## 2. Data Cleaning & Analysis

#### Missing or Null Data points

In [None]:
# Share of null entries per column, expressed as a percentage of all rows.
null_counts = df_diabetes.isnull().sum()
missing_percentage = null_counts / len(df_diabetes) * 100

# Display the per-column percentages.
print(missing_percentage)

#### Unexpected Outliers Identified from Histogram
Six features appear to contain outliers or implausible values: Glucose, Blood Pressure, BMI, Insulin, Skin Thickness, and Pedigree Function.

#### Identify the columns with the value "0" by creating a boolean mask

In [None]:
# Every column except the target "y" is checked for zeros.
columns_to_check = df_diabetes.columns.difference(["y"])

# Boolean frame: True wherever a checked cell equals 0.
mask = df_diabetes[columns_to_check].eq(0)

#### Summarize the results to see which columns have the value "0" in them

In [None]:
# Per-column flag: True when the column contains at least one 0.
has_zero = mask.any()

# Keep only the names of the flagged columns.
columns_with_zeros = has_zero[has_zero].index

print("Columns with '0' values:")
print(columns_with_zeros)

#### Count the number of zero values per column

In [None]:
# All numeric columns apart from the target 'y'.
numeric_columns = df_diabetes.select_dtypes(include=['number']).drop(columns=['y'])

# Number of cells equal to 0 in each of those columns.
zero_counts = (numeric_columns == 0).sum()

# Show only the columns that contain at least one zero.
print("Columns with zero values and their counts (excluding 'y'):")
has_any_zero = zero_counts > 0
print(zero_counts[has_any_zero])

#### Target Value Counts

In [None]:
# Number of rows per target class; used to judge how balanced the classes are.
df_diabetes['y'].value_counts()

In [None]:
# Bar chart of how many rows fall into each target class.
sns.countplot(data=df_diabetes, x='y')
plt.show()

In [None]:
# value_counts() orders classes by frequency, not by class value, so the wedge
# labels are taken from its index. Hard-coded ['0', '1'] labels would silently
# be swapped whenever class 1 outnumbers class 0.
outcome_counts = df_diabetes['y'].value_counts()
outcome_labels = [str(outcome) for outcome in outcome_counts.index]
plt.pie(outcome_counts.tolist(), labels=outcome_labels, autopct='%1.1f%%')

plt.title("Outcomes")
plt.tight_layout()

plt.show()

#### Correlation

In [None]:
# Pairwise correlations between all columns.
corr = df_diabetes.corr()

# Only cells with r >= 0.5 or r <= -0.4 keep a value; the rest become NaN
# and are left blank in the heatmap.
strong_corr = corr[(corr >= 0.5) | (corr <= -0.4)]

fig, ax = plt.subplots(figsize=(15, 15))
heatmap_options = {
    'ax': ax,
    # Non-zero entries of the upper triangle (diagonal included) are hidden,
    # since they mirror the lower triangle.
    'mask': np.triu(corr, k=0),
    'cmap': 'viridis',
    'vmax': 1.0,
    'vmin': -1.0,
    'linewidths': 1.0,
    'annot': True,
    'annot_kws': {'fontsize': 20},
    'square': True,
    'fmt': '.3f',
}
sns.heatmap(strong_corr, **heatmap_options)

plt.show()

In [None]:
# Correlation of every column with the target 'y' (the 'y' row is always 1.0).
df_diabetes.corr()['y']

##### Diabetes Percentage by Age Range

In [None]:
# Age ranges as half-open intervals [20, 30), [30, 40), ... so each label
# matches its bin: with pandas' default right-closed bins a 30-year-old would
# be filed under '20-29'. The last edge is infinite so that every age of 70 or
# more lands in '70+' instead of turning into NaN above a fixed upper bound.
age_bins = [20, 30, 40, 50, 60, 70, np.inf]
age_labels = ['20-29', '30-39', '40-49', '50-59', '60-69', '70+']

# Categorize each patient's age into one of the ranges above.
df_diabetes['AgeRange'] = pd.cut(
    df_diabetes['Age'],
    bins=age_bins,
    labels=age_labels,
    right=False,
)

# Percentage of diabetes cases (y == 1) per age range. observed=False keeps
# ranges without any patients in the result so the x-axis stays complete.
age_diabetes = df_diabetes.groupby('AgeRange', observed=False)['y'].mean() * 100

# Plot the percentage for each age range.
plt.bar(age_diabetes.index, age_diabetes)
plt.xlabel("Age Range")
plt.ylabel("Percentage with Diabetes")
plt.title("Diabetes Percentage by Age Range")
plt.show()

#### Check BMI distribution

In [None]:
# Inclusive BMI window shown in the histogram.
lower_limit = 21
upper_limit = 80

# Keep only the rows whose BMI lies inside the window (both ends included).
bmi_in_window = df_diabetes['BMI'].between(lower_limit, upper_limit)
filtered_df = df_diabetes[bmi_in_window]

# 50 bins with black bar edges; change `bins` for a coarser or finer view.
plt.hist(filtered_df['BMI'], bins=50, edgecolor='k')
plt.xlabel('BMI')
plt.ylabel('Frequency')
plt.title(f'Histogram of BMI Values in Range {lower_limit}-{upper_limit}')
plt.grid(True)
plt.show()

##### BMI vs. Diabetes

In [None]:
# BMI classes as half-open intervals [lower, upper): below 18.5 underweight,
# 18.5 to <25 normal, 25 to <30 overweight, 30 to <35 obese class I, and 35 or
# more obese class II+. Half-open bins put a BMI of exactly 18.5 in 'Normal',
# and the infinite last edge keeps very high values categorized.
bmi_ranges = [0, 18.5, 25, 30, 35, np.inf]
bmi_labels = ['Underweight', 'Normal', 'Overweight', 'Obese Class I', 'Obese Class II+']

# A BMI of 0 is a missing measurement in this dataset (zeros are imputed
# before modelling), so those rows stay uncategorized instead of being
# counted as 'Underweight'.
valid_bmi = df_diabetes['BMI'].where(df_diabetes['BMI'] > 0)

# Categorize each patient's BMI into one of the classes above.
df_diabetes['BMIRange'] = pd.cut(
    valid_bmi,
    bins=bmi_ranges,
    labels=bmi_labels,
    right=False,
)

# Percentage of diabetes cases (y == 1) per BMI class. observed=False keeps
# classes without any patients in the result so the x-axis stays complete.
bmi_diabetes = df_diabetes.groupby('BMIRange', observed=False)['y'].mean() * 100

# Plot the percentage for each BMI class.
plt.bar(bmi_diabetes.index, bmi_diabetes)
plt.xlabel("BMI Range")
plt.ylabel("Percentage with Diabetes")
plt.title("Diabetes Percentage by BMI Range")
plt.show()

##### Glucose vs Diabetes

In [None]:
# Glucose readings split by outcome: y == 1 (diabetic), y == 0 (non-diabetic).
glucose_diabetic = df_diabetes.loc[df_diabetes['y'] == 1, 'Glucose']
glucose_non_diabetic = df_diabetes.loc[df_diabetes['y'] == 0, 'Glucose']

# Shared bin edges so both histograms are directly comparable.
bins = [50, 70, 100, 125, 150, 200, 250]

# Overlay the two semi-transparent histograms, diabetic group first.
histogram_specs = (
    (glucose_diabetic, 'Diabetic', 'red'),
    (glucose_non_diabetic, 'Non-Diabetic', 'blue'),
)
for glucose_values, group_label, group_color in histogram_specs:
    plt.hist(glucose_values, bins=bins, alpha=0.5, label=group_label, color=group_color)

# Axis labels, title and legend.
plt.xlabel('Glucose Level')
plt.ylabel('Frequency')
plt.title('Glucose Distribution by Diabetes Status')
plt.legend()

# Render the figure.
plt.show()

#### Drop zero values
Two columns contain too many zero values to be imputed reliably, so we drop them:
1. SkinThickness    227
2. Insulin          374

In [None]:
# Remove the two zero-heavy measurements and the two range columns that were
# added only for the plots above; the result is a new frame, df_new.
columns_to_drop = [
    'SkinThickness',
    'Insulin',
    'AgeRange',
    'BMIRange',
]
df_new = df_diabetes.drop(columns=columns_to_drop)

In [None]:
# Preview the first rows to confirm the dropped columns are gone.
df_new.head()

#### Treating the rest of columns with zero values
Glucose is approximately normally distributed, so its 0 values are replaced with the mean. BMI and Blood Pressure have negatively skewed distributions, so the median is used to replace their 0 values. Zero values in Pregnancies are valid and are left unchanged.
1. Pregnancies      111
2. Glucose            5
3. BloodPressure     35
4. BMI               11

In [None]:
# Calculate the skewness of every column in df_new; the sign and size of each
# value inform whether mean or median is used to replace zeros below.
skewness = df_new.skew()

# Print the skewness values
print(skewness)

In [None]:
# Replacement values are computed from the non-zero entries only, so the zero
# placeholders do not pull the statistics down.
nonzero_glucose = df_new.loc[df_new['Glucose'] != 0, 'Glucose']
nonzero_bmi = df_new.loc[df_new['BMI'] != 0, 'BMI']
nonzero_bp = df_new.loc[df_new['BloodPressure'] != 0, 'BloodPressure']

mean = nonzero_glucose.mean()
median_bmi = nonzero_bmi.median()
median_bp = nonzero_bp.median()

In [None]:
# The replaced column is assigned back to df_new rather than calling
# df_new[col].replace(..., inplace=True): the in-place form mutates an
# intermediate Series, which pandas warns about and which leaves df_new
# unchanged once Copy-on-Write is enabled.

# Replace zeros on Glucose with mean
df_new['Glucose'] = df_new['Glucose'].replace(0, mean)

# Replace zeros on BMI with median
df_new['BMI'] = df_new['BMI'].replace(0, median_bmi)

# Replace zeros on BloodPressure with median
df_new['BloodPressure'] = df_new['BloodPressure'].replace(0, median_bp)

In [None]:
# Count the remaining zeros per column to verify the replacements above.
df_new.eq(0).sum()

## 3. Data Modeling


### Split Datasets

In [None]:
# Preview the cleaned frame; the column order matters for the positional
# feature/label slicing in the next cells.
df_new.head()

In [None]:
# Feature matrix: the first six columns of the cleaned frame, as a NumPy array.
data_matrix = df_new.to_numpy()
features = data_matrix[:, :6]
print(features.shape)

In [None]:
# Label array: every column from index 6 onward, kept two-dimensional.
label_matrix = df_new.to_numpy()
labels = label_matrix[:, 6:]
print(labels.shape)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
split_arrays = train_test_split(features, labels, test_size=0.2, random_state=77)
X_train, X_test, y_train, y_test = split_arrays

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Shapes of the four split arrays, in the order train/test features, then
# train/test labels.
for split_array in (X_train, X_test, y_train, y_test):
    print(split_array.shape)

### Define Model

In [None]:
# Feed-forward binary classifier: three ReLU hidden layers (20, 16, 12 units),
# 10% dropout after the first one, and a single sigmoid output unit.
model = Sequential()
# The input width is read from the training matrix instead of being hard-coded,
# and is declared with an explicit Input layer because newer Keras versions
# warn when input_dim is passed to a Dense layer.
model.add(tf.keras.Input(shape=(X_train.shape[1],)))
model.add(Dense(20, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(16, activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

### Compile Model

In [None]:
# Configure training: Adam with a learning rate of 1e-4, binary cross-entropy
# as the loss, and accuracy tracked as the reported metric.
optimizer = Adam(learning_rate=0.0001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
class CustomCallback(Callback):
    """Keras callback that stops training once validation accuracy exceeds 80%."""

    def on_epoch_end(self, epochs, logs=None):
        """Request a stop when this epoch's 'val_accuracy' is above 0.80.

        Args:
            epochs: Index of the epoch that just finished (unused).
            logs: Metrics dict passed in by Keras; may be None or may lack
                'val_accuracy' when training runs without validation data.
        """
        # logs defaults to None rather than a shared mutable dict, and a
        # missing metric is skipped instead of raising on `None > 0.80`.
        val_accuracy = (logs or {}).get('val_accuracy')
        if val_accuracy is not None and val_accuracy > 0.80:
            print('\nReeached 80%, cancel training')
            self.model.stop_training = True

In [None]:
# Instantiate the custom callback; it is passed to model.fit() below.
callback = CustomCallback()

### Train Model

In [None]:
# Fit the Keras model on the training data.
# Validation uses a 20% hold-out taken from the training data, not the test
# set: the callback stops training based on val_accuracy, so validating on
# X_test would pick the stopping epoch with the test data and make the test
# accuracy reported afterwards optimistically biased.
model.fit(X_train, y_train,
          epochs=140, batch_size=7,
          validation_split=0.2,
          callbacks=[callback]
          )

### Evaluate Model

In [None]:
# Put the per-epoch metrics recorded during fit() into a DataFrame
# (one row per epoch, one column per metric) and preview the first rows.
model_hist = pd.DataFrame(model.history.history)
model_hist.head()

In [None]:
# Training accuracy (red) against validation accuracy (blue), per epoch.
plt.figure(figsize=[8,5])
accuracy_curves = (
    ('accuracy', 'r', 'Training Acc'),
    ('val_accuracy', 'b', 'Validation Acc'),
)
for history_key, line_format, legend_label in accuracy_curves:
    plt.plot(model_hist[history_key], line_format, label=legend_label)
plt.legend()
plt.xlabel('Epochs', fontsize=16)
plt.ylabel('Accuracy', fontsize=16)
plt.title('Accuracy Curves', fontsize=16)
plt.show()

In [None]:
# Score the model on the training split; evaluate() returns the loss followed
# by the compiled accuracy metric, and only the accuracy is reported.
train_scores = model.evaluate(X_train, y_train, verbose=False)
accuracy = train_scores[1]
print(f'Accuracy: {accuracy*100:.2f}')

In [None]:
# Score the model on the test split; evaluate() returns the loss followed by
# the compiled accuracy metric, and only the accuracy is reported.
test_scores = model.evaluate(X_test, y_test, verbose=False)
accuracy = test_scores[1]
print(f'Accuracy: {accuracy*100:.2f}')

In [None]:
# Training loss (red) against validation loss (blue), per epoch.
plt.figure(figsize=[8,5])
loss_curves = (
    ('loss', 'r', 'Training Loss'),
    ('val_loss', 'b', 'Validation Loss'),
)
for history_key, line_format, legend_label in loss_curves:
    plt.plot(model_hist[history_key], line_format, label=legend_label)
plt.legend()
plt.xlabel('Epochs', fontsize=16)
plt.ylabel('Loss', fontsize=16)
plt.title('Loss Curves', fontsize=16)
plt.show()