# DIABETES PREDICTION

## Imports:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import SVC
%matplotlib inline
sns.set()

## Load the Pima Indians Diabetes dataset:

In [None]:
# Load the Pima Indians Diabetes dataset into a DataFrame.
# The path is a raw string so the Windows backslash stays literal: in a normal
# string "\d" is an invalid escape sequence that newer Python versions warn about.
diabetes = pd.read_csv(r"C:\diabetes.csv")

## Inspect the dataset:

In [None]:
# Report the dataset's dimensions, then show per-column summary statistics
dataset_shape = diabetes.shape
print("diabetes shape is :", dataset_shape)
print("Dataset Description:\n")
diabetes.describe()

In [None]:
# Preview the first five rows to get a feel for the columns and value formats
print("Dataset head :\n")
diabetes.head(5)

## Visualize the dataset:

In [None]:
# Draw one histogram per column to inspect each value distribution
diabetes.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Count the records under each Outcome label (0 / 1), print the counts and chart them
OutCount = diabetes.groupby("Outcome").size()
print(OutCount)
OutCount.plot.bar(title="Outcome Count")

## Data Correlation Matrix:

In [None]:
# Pairwise Pearson correlation between every pair of columns, Outcome included
corr_mat = diabetes.corr(method="pearson")

In [None]:
# Display the correlation matrix
corr_mat

### Visualize Predictors Correlation with Outcome:

In [None]:
# Plot how strongly each predictor correlates with Outcome.
# The Outcome column is selected by label rather than by position, so the plot
# stays correct if columns are reordered or added; Outcome's correlation with
# itself (always 1) is dropped from the chart.
res = corr_mat["Outcome"].drop("Outcome")
res.plot(kind='barh', title="Correlation Graph")


#### From the above correlation graph, it can be inferred that factors like Age, BMI and BloodPressure, which can be measured without taking a blood sample, influence the Outcome (0/1).

# Data Cleaning and Transformation:

#### There are some zero value records in the dataset.

In [None]:
# Count the records holding a zero in each feature used by the model
zeros_Age = diabetes["Age"].eq(0).sum()
zeros_BMI = diabetes["BMI"].eq(0).sum()
zeros_BP = diabetes["BloodPressure"].eq(0).sum()

print("Count of Zero values in Age : ", zeros_Age)
print("Count of Zero values in BMI : ", zeros_BMI)
print("Count of Zero values in BP : ", zeros_BP)

#### Remove these records (zero value) from the dataset and create the required dataset for the model prediction.

## Creating Dataset for model:

In [None]:
# Rows are kept only when Age, BMI and BloodPressure are all greater than zero
nonzero_rows = diabetes[["Age", "BMI", "BloodPressure"]].gt(0).all(axis=1)

# temp_ds: every column of the surviving rows
temp_ds = pd.DataFrame(diabetes[nonzero_rows])

# main_dataset: only the three predictors plus the Outcome label
main_dataset = pd.DataFrame(
    data=temp_ds,
    columns=["Age", "BMI", "BloodPressure", "Outcome"],
)

print("Original dataset dimesnions(diabetes): ", diabetes.shape)
print("Original dataset without zero value records dimensions(temp_ds): ", temp_ds.shape)
print("Dataset for Model without zero value records dimensions(main_dataset): ", main_dataset.shape)

In [None]:
main_dataset.describe()

#### main_dataset contains 729 non zero records.

In [None]:
# Class balance of the cleaned dataset: records per Outcome label, printed and charted
out_count = main_dataset.groupby("Outcome").size()
print(out_count)
out_count.plot.bar(title="Outcome label Count in main Dataset")

# Splitting the Dataset:

In [None]:
# Feature matrix: every column of main_dataset except the last one
feature_columns = main_dataset.columns[:-1]
X = main_dataset[feature_columns]
X.head()

In [None]:
# Target vector: the Outcome label of each record
y = main_dataset.loc[:, "Outcome"]
y.head()

In [None]:
# Hold out 20% of the records for testing (80% train); stratify=y is passed so
# the split is stratified on the Outcome labels, and random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=60,
    stratify=y,
)

print("X_train shape : ", X_train.shape)
print("y_train shape : ", y_train.shape)
print("X_test shape : ", X_test.shape)
print("y_test shape : ", y_test.shape)

## Feature Scaling:

In [None]:
# Min-max scale the features.
# The scaler learns each feature's min/max from the training split only, and the
# test split is transformed with those same parameters. Refitting on the test
# split would leak test information, put the two splits on different scales, and
# leave `scaler` holding test-set statistics for any later transform() call.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("type(X_train_scaled) : ", type(X_train_scaled))
print("type(X_test_scaled) : ", type(X_test_scaled))

### Scaled Values:

In [None]:
#create a pandas dataframe to display the scaled values
sv=pd.DataFrame(data=X_train_scaled)
sv.head()

# Training the SVM model:

In [None]:
#create instance for SVC
svc=SVC()
svc.fit(X_train_scaled,y_train)

### Check Accuracy:

In [None]:
# Score the baseline SVC on both splits via its score() method
train_accuracy = svc.score(X_train_scaled, y_train)
test_accuracy = svc.score(X_test_scaled, y_test)

print("Accuracy on training set: ", train_accuracy)
print("Accuracy on testing set: ", test_accuracy)

## Model Tuning:

### Find the best Parameters for SVC.

In [None]:
# Candidate hyper-parameter values; GridSearchCV tries every combination
param_grid = dict(
    C=[1.0, 10.0, 50.0],
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],
    shrinking=[True, False],
    gamma=['auto', 1, 0.1],
    coef0=[0.0, 0.1, 0.5],
)

# Search over the grid with 10-fold cross-validation, scoring by accuracy
model_svc = SVC()
grid_search = GridSearchCV(
    estimator=model_svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
)
grid_search.fit(X_train_scaled, y_train)

#### Print the best score found by GridSearchCV:

In [None]:
# Best score recorded by the grid search
best_score = grid_search.best_score_
print("Best score = ", best_score)

#### Apply the Parameters to the Model and train it:

In [None]:
# Take the tuned model straight from the grid search.
# With GridSearchCV's default refit=True, best_estimator_ has already been
# retrained on the whole training set using the best parameter combination,
# so calling fit() on it again would only repeat that work.
best_svc = grid_search.best_estimator_
# Display the selected estimator and its parameters
best_svc

### Check Accuracy:

In [None]:
# use score of SVC() to find Accuracy
best_train_accuracy=best_svc.score(X_train_scaled,y_train)
best_test_accuracy=best_svc.score(X_test_scaled,y_test)
print("Best Accuracy on training set: ",best_train_accuracy)
print("Best Accuracy on testing set: ",best_test_accuracy)

## Make a Prediction:

In [None]:
# # create a new (fake) person by taking the values of Age,BMI and BloodPressure
# new_person = pd.DataFrame([[50,33.6,72]])
# # Scale those values like the others using MinMaxScaler
# new_person_scaled = scaler.transform(new_person)

In [None]:
# #predict the outcome
# #here "1" means "person is likely to have type-2 diabetes"
# # 0 means "person doesn't have type-2 diabetes"
# prediction = best_svc.predict(new_person_scaled)
# type(prediction)

In [None]:
# print("Prediction value : ",prediction[0])

In [None]:
# if(prediction==1):
#     print("You are likely to have type-2 diabetes.")
# else:
#     print("Congratulations, You don't have type-2 diabetes.")

### Taking the input from user:

In [None]:
# Read one person's measurements from the user and predict their Outcome
# with the tuned model.
name = input("Enter Name : ")
age = int(input("Age : "))
BMI = float(input("BMI : "))
BP = int(input("Blood Pressure : "))

# Label the columns like the training features so the scaler receives the
# values under the same names, and in the same order, as it was fitted with.
person = pd.DataFrame([[age, BMI, BP]], columns=["Age", "BMI", "BloodPressure"])
person_scaled = scaler.transform(person)

# predict() returns one label per input row; read the single label explicitly
# instead of relying on the truthiness of an array comparison.
P_prediction = best_svc.predict(person_scaled)
if P_prediction[0] == 1:
    print(name," are likely to have type-2 diabetes.")
else:
    print("Congratulations ",name,",You don't have type-2 diabetes.")