# Diabetes Prediction

Link to the Dataset:
[Pima Indians Diabetes Database](https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database)

# About Dataset:
## `Context`
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

## `Content`
The dataset consists of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

In [1]:
import pandas as pd
import numpy as np

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Set seed for reproducibility.
# This fixes the state of NumPy's global random number generator only.
SEED = 20
np.random.seed(SEED)

In [None]:
# Loading Data
# The relative path presumably matches the Kaggle notebook layout, where
# datasets are mounted under ../input -- adjust when running elsewhere.
df = pd.read_csv('../input/pima-indians-diabetes-database/diabetes.csv')
# Preview the first rows (shown as the cell output).
df.head()

# Exploratory Data Analysis

In [None]:
# Count the missing (NaN/None) entries in each column.
df.isnull().sum()

In [None]:
# Print column names, non-null counts and dtypes.
df.info()

In [None]:
# Per column, count how many entries are exactly zero.
zero_values = df.eq(0).sum()

# Report only the columns that contain at least one zero.
print("Columns with zero values:")
print(zero_values.loc[zero_values.gt(0)])


We have many zero values, which will definitely affect the model accuracy. We need to replace them with `nan` values.

Function to replace zero values with `nan`

In [None]:
def replace_zero(df):
    """Return a deep copy of ``df`` with zeros turned into NaN in selected columns.

    Only Glucose, BloodPressure, SkinThickness, Insulin and BMI are touched;
    every other column (and the input frame itself) is left unchanged.
    """
    zero_as_missing = ["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]
    cleaned = df.copy(deep=True)
    cleaned[zero_as_missing] = cleaned[zero_as_missing].replace(0, np.nan)
    return cleaned
# Working copy with zeros marked as NaN; replace_zero copies, so `df` itself is untouched.
df_nan=replace_zero(df)

In [None]:
def find_median(frame, var):
    """Return the median of column ``var`` for each ``Outcome`` group.

    Parameters:
        frame: DataFrame containing ``var`` and an ``Outcome`` column.
        var: name of the column to summarise.

    Returns:
        DataFrame with columns ``Outcome`` and ``var``, one row per Outcome
        value, sorted by Outcome.

    Missing values need no explicit filtering: the group median skips NaN
    by default.
    """
    return frame.groupby('Outcome')[[var]].median().reset_index()

In [None]:
# function to replace null values
def replace_null(frame, var):
    """Fill missing values of ``var`` in place with the median of the row's Outcome group.

    Parameters:
        frame: DataFrame containing ``var`` and an ``Outcome`` column;
            its ``var`` column is overwritten with the imputed values.
        var: name of the column to impute.

    Returns:
        Number of nulls left in ``frame[var]`` after imputation.

    Medians are matched to rows by Outcome label rather than by position,
    so this also works when only one Outcome class is present.
    """
    # Broadcast each group's median (NaN-skipping) back onto that group's rows.
    group_medians = frame.groupby('Outcome')[var].transform('median')
    frame[var] = frame[var].fillna(group_medians)
    return frame[var].isnull().sum()

In [None]:
# Impute each measurement column in turn and report how many nulls remain.
for column in ('Glucose', 'SkinThickness', 'Insulin', 'BMI', 'BloodPressure'):
    remaining = replace_null(df_nan, column)
    print(f'{remaining} Nulls for {column}')
# We have successfully handled Nulls

All null values have been successfully imputed with their median.


### Data Scaling

In [None]:
# We need to scale our data for uniformity.
from sklearn.preprocessing import StandardScaler


def std_scalar(df):
    """Standardise every feature column and split off the target.

    Parameters:
        df: DataFrame holding the feature columns plus an ``Outcome`` column.

    Returns:
        ``(x, y)`` where ``x`` is a DataFrame of the features transformed by
        a freshly fitted ``StandardScaler`` and ``y`` is the ``Outcome``
        column.
    """
    features = df.drop(columns=["Outcome"])
    scaled = StandardScaler().fit_transform(features)
    # Reuse the input's own column names and index, so the function works for
    # any feature set and x stays row-aligned with y.
    x = pd.DataFrame(scaled, columns=features.columns, index=features.index)
    y = df["Outcome"]
    return x, y

### Data After Scaling

In [None]:
# X: scaled feature frame, Y: Outcome column, both derived from the imputed data.
X,Y=std_scalar(df_nan)
# Summary statistics of the scaled features (cell output).
X.describe()

In [None]:
# NOTE(review): this unfitted scaler is not referenced anywhere else in the
# visible notebook -- confirm it is still needed.
std_x = StandardScaler()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the label.
# The split is seeded from the notebook-wide SEED instead of a repeated literal.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=SEED, stratify=Y)


Let's implement KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit one k-NN classifier per k in 5..14 and record its accuracy on the
# training and test splits; list position p corresponds to k = p + 5.
train_score, test_score = [], []
for i in range(5, 15):
    neigh = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    train_score += [neigh.score(X_train, y_train)]
    test_score += [neigh.score(X_test, y_test)]

In [None]:
# Best training accuracy of the k sweep; +5 maps the list position back to k
# because the sweep starts at k = 5.
best_train = max(train_score)
best_train_k = train_score.index(best_train) + 5
print('Max train_scores is ' + str(best_train*100) + ' for k = ' + str(best_train_k))

In [None]:
# Best test accuracy of the k sweep; +5 maps the list position back to k
# because the sweep starts at k = 5.
best_test = max(test_score)
best_test_k = test_score.index(best_test) + 5
# Message uses the same wording as the training-score report, so the score
# and the k value no longer run together.
print('Max test_scores is ' + str(best_test*100) + ' for k = ' + str(best_test_k))

### Logistic regression

In [None]:
# Let's try logistic regression now.
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression, seeded from the notebook-wide SEED
# instead of a repeated literal.
log_model = LogisticRegression(random_state=SEED, penalty='l2').fit(X_train, y_train)
log_pred = log_model.predict(X_test)
# Accuracy on the held-out split (cell output).
log_model.score(X_test, y_test)

### SVC

In [None]:
from sklearn import svm
# Support-vector classifier with the library's default hyper-parameters.
svm_model = svm.SVC().fit(X_train, y_train)
# Predictions are kept for the manual accuracy cross-check further down.
svm_pred=svm_model.predict(X_test)
# Accuracy on the held-out split (cell output).
svm_model.score(X_test, y_test)

Model Accuracy Confirmation

In [None]:
def model_pref(pred, y_test):
    """Compare predictions with true labels element by element.

    Returns a list holding 1 where the prediction equals the label and 0
    where it does not. Pairing stops at the shorter of the two inputs.
    """
    return [1 if guess == truth else 0 for guess, truth in zip(pred, y_test)]

In [None]:
# Per-sample hit (1) / miss (0) flags for the SVM predictions.
cmp =model_pref(svm_pred, y_test)

In [None]:
# Fraction of hits in `cmp`, i.e. accuracy recomputed by hand from the flags.
print("Model Accuracy Confirmation: " +str(cmp.count(1)/len(y_test)))

### RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest (max_depth=2), seeded from the notebook-wide SEED
# instead of a repeated literal.
rf_model = RandomForestClassifier(max_depth=2, random_state=SEED).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
# Accuracy on the held-out split (cell output).
rf_model.score(X_test, y_test)

### Training Deep neural network

In [None]:
import tensorflow as tf


def build_model():
    """Build and compile the binary classifier network.

    Architecture: Dense layers of 8, 4 and 2 ReLU units followed by one
    sigmoid output unit. The input width is taken from the number of columns
    in the global ``X_train``.

    Returns:
        A compiled ``tf.keras.Sequential`` model using Adam
        (learning rate 0.01), binary cross-entropy loss and an accuracy metric.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(8, activation='relu', input_shape=[len(X_train.keys())]),
        tf.keras.layers.Dense(4, activation='relu'),
        tf.keras.layers.Dense(2, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model


# np.random.seed alone does not seed TensorFlow/Keras, so weight
# initialisation and batch shuffling would differ between runs. This call
# seeds Python's `random`, NumPy and TensorFlow together.
# NOTE(review): needs a TensorFlow release that provides
# tf.keras.utils.set_random_seed -- confirm against the runtime version.
tf.keras.utils.set_random_seed(SEED)
neural_model = build_model()

In [None]:
# Print the layer-by-layer overview of the network.
neural_model.summary()

Fit Neural model on dataset.

In [None]:
# Train for 1000 epochs, holding out 10% of the training rows for validation.
# Despite its name, `neural_pred` holds the object returned by fit(); its
# `.history` and `.epoch` attributes are read in the next cell.
neural_pred = neural_model.fit(X_train, y_train, validation_split=0.1, verbose=2, epochs=1000)

In [None]:
# Let's measure final performance.
# Tabulate the recorded training history, one row per epoch.
hist = pd.DataFrame(neural_pred.history)
hist['epoch'] = neural_pred.epoch
# Show the last few epochs (cell output).
hist.tail()

In [None]:
# Raw network outputs for the test features; thresholded into labels in the next cell.
neural_test = neural_model.predict(X_test)

In [None]:
# Turn each network output into a hard label: 1 above 0.5, otherwise 0.
neural_test_converted = [1 if prob > 0.5 else 0 for prob in neural_test]

In [None]:
# Per-sample hit (1) / miss (0) flags for the thresholded network predictions.
cmp = model_pref(neural_test_converted, y_test)

In [None]:
# Share of hits in `cmp`, as a percentage rounded to two decimals.
print("Test Accuracy: ",str(round(cmp.count(1)/ len(y_test)*100,2))+"%")

The SVM model is good to go.

Save the model

In [None]:
import pickle

# Persist the fitted SVM. The context manager guarantees the file is flushed
# and closed even if pickling fails.
# NOTE: only unpickle this file from a trusted location; loading a pickle can
# execute arbitrary code.
with open("svm_model.pkl", "wb") as model_file:
    pickle.dump(svm_model, model_file)