In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('debete.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
df.shape

(768, 9)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

##### From our above observation,there is no missing values in this dataset but features like glucose,blood pressure,skinthickness,insulin cant have zero values.So we will have to fill it with mean or median values of specific features. So we will replace zero values with NAN so can count number of zero values.We will now create a copy of our datasets

In [6]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [7]:
df_copy = df.copy(deep=True)

In [8]:
df_copy[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI']]

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI
0,148,72,35,0,33.6
1,85,66,29,0,26.6
2,183,64,0,0,23.3
3,89,66,23,94,28.1
4,137,40,35,168,43.1
...,...,...,...,...,...
763,101,76,48,180,32.9
764,122,70,27,0,36.8
765,121,72,23,112,26.2
766,126,60,0,0,30.1


#### Now lets replace zero values in these columns with NAN

In [9]:
df_copy[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI']] = df_copy[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI']].replace(0,np.nan)

In [10]:
df_copy.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

##### As you can see,we are having 5 zeros for glucose,35,227,374,11 for BloodPressure, SkinThickness,Insulin,and BMI respectively.So let's replace them with mean values in our normal datasets not the copy datasets.
       

In [11]:
df['Glucose'] = df['Glucose'].replace(0,df['Glucose'].mean())
df['BloodPressure'] = df['BloodPressure'].replace(0,df['BloodPressure'].mean())
df['SkinThickness'] = df['SkinThickness'].replace(0,df['SkinThickness'].mean())
df['Insulin'] = df['Insulin'].replace(0,df['Insulin'].mean())
df['BMI'] = df['BMI'].replace(0,df['BMI'].mean())

##### Now we have successfully replaced zeros with mean values for our copied values

#### Let's Store Our Variables(Independent and Dependent) into Matrix X and vector y respectively

In [12]:
X = df.drop('Outcome',axis = 1)
y = df['Outcome']

#### Split our Datasets into train_test_split

In [13]:
from sklearn.model_selection import train_test_split

##### Remember we will train our datasets on X_train and y_train then perform prediction using X_test(Our unseen Samples) and we will compare predicted reults by our models with y_test

In [14]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

#### We will create a Machine learning pipeline Using Scikit-Learn
##### What does pipeline do? It trains together output of each steps used in the next steps by putting together a series of steps.It can be used to automate a machine learning workflow.It can involve Preprocessing,Feature Selection,Feature Scaling,Classification or Regression, and Post Processing

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline


##### Our Modules are successfully imported,let's create a pipeline using sklearn.Remember our output of this step is essntial for input of the next step.The machine learning algorithms that require feature scaling include:KNN,NN,SVM,linear and logistic Regression

In [16]:
pipeline_lr = Pipeline([('scalar1',StandardScaler()),
                        ('lr_classifier',LogisticRegression())])
pipeline_knn = Pipeline([('scalar2',StandardScaler()),
                        ('knn_classifier',KNeighborsClassifier())])
pipeline_svc = Pipeline([('scalar3',StandardScaler()),
                        ('svc_classifier',SVC())])
pipeline_dt = Pipeline([('dt_classifier',DecisionTreeClassifier())])
pipeline_rf = Pipeline([('rf_classifier',RandomForestClassifier(max_depth=3))])
pipeline_gbc = Pipeline([('gbc_classifier',GradientBoostingClassifier())])
                                             

In [17]:
pipelines = [pipeline_lr,
             pipeline_knn,
             pipeline_svc,
             pipeline_dt,
             pipeline_rf,
             pipeline_gbc]
                         

In [18]:
pipelines

[Pipeline(steps=[('scalar1', StandardScaler()),
                 ('lr_classifier', LogisticRegression())]),
 Pipeline(steps=[('scalar2', StandardScaler()),
                 ('knn_classifier', KNeighborsClassifier())]),
 Pipeline(steps=[('scalar3', StandardScaler()), ('svc_classifier', SVC())]),
 Pipeline(steps=[('dt_classifier', DecisionTreeClassifier())]),
 Pipeline(steps=[('rf_classifier', RandomForestClassifier(max_depth=3))]),
 Pipeline(steps=[('gbc_classifier', GradientBoostingClassifier())])]

In [19]:
for pipe in  pipelines:
    pipe.fit(X_train,y_train)

##### Let's Create a Dictionary for our pipeline

In [20]:
pipe_dict = {0:'LR',
             1:'KNN',
             2:'SVC',
             3:'DT',
             4:'RF',
             5:'GBC'}

In [21]:
pipe_dict

{0: 'LR', 1: 'KNN', 2: 'SVC', 3: 'DT', 4: 'RF', 5: 'GBC'}

In [22]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy:{}".format(pipe_dict[i],model.score(X_test,y_test)*100))

LR Test Accuracy:76.62337662337663
KNN Test Accuracy:76.62337662337663
SVC Test Accuracy:73.37662337662337
DT Test Accuracy:72.72727272727273
RF Test Accuracy:79.22077922077922
GBC Test Accuracy:75.97402597402598


##### Let's train our datasets on Random Forest Classifier because it's our best model

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
X = df.drop('Outcome',axis = 1)
y = df['Outcome']

In [25]:
rf=RandomForestClassifier(max_depth=3)

In [26]:
rf.fit(X,y)

RandomForestClassifier(max_depth=3)

In [27]:
new_data = pd.DataFrame({
    'Pregnancies':6,
    'Glucose':148.0,
    'BloodPressure':72.0,
    'SkinThickness':35.0,
    'Insulin':79.799479,
    'BMI':33.6,
    'DiabetesPedigreeFunction':0.627,
    'Age':50,
},index = [0])


In [28]:
df2=new_data.astype('float')

##### The prediction below will give us a return outcome of 1, meaning the patient is diabetic.But we will need put it in a proper format

In [29]:
Pr_=rf.predict(new_data)

In [30]:
if Pr_==0:
    print('non-diabetic')
else:
    print('Diabetic')

Diabetic


#### Save The Model Using Joblib

In [31]:
import joblib

In [32]:
joblib.dump(rf,'model_joblib_diabetics')

['model_joblib_diabetics']

In [33]:
model=joblib.load('model_joblib_diabetics')

##### We will try our saved Joblib model on our new_data

In [34]:
model.predict(df2)

array([1], dtype=int64)

##### GUI

In [35]:
from tkinter import*
import joblib
def show_entry_fields():
    p1=int(e1.get())
    p2=int(e2.get())
    p3=int(e3.get())
    p4=int(e4.get())
    p5=int(e5.get())
    p6=float(e6.get())
    p7=float(e7.get())
    p8=int(e8.get())
    
    model = joblib.load('model_joblib_diabetics')
    result=model.predict([[p1,p2,p3,p4,p5,p6,p7,p8]])
    if result==0:
        Label(master,text="No Chance of Diabetics").grid(row=31)
    else:
        Label(master,text="Possibility of Diabetics").grid(row=31)
master=Tk()
master.title("Heart Disease Prediction System")

label = Label(master,text="Heart Disease Prediction System"
                         ,bg = "black",fg="red").\
                             grid(row=0,columnspan=2)


Label(master,text="Pregnancies").grid(row=1)
Label(master,text="Glucose").grid(row=2)
Label(master,text="BloodPressure").grid(row=3)
Label(master,text="SkinThickness").grid(row=4)
Label(master,text="Insulin").grid(row=5)
Label(master,text="BMI").grid(row=6)
Label(master,text="DiabetesPedigreeFunction").grid(row=7)
Label(master,text="Age").grid(row=8)



e1=Entry(master)
e2=Entry(master)
e3=Entry(master)
e4=Entry(master)
e5=Entry(master)
e6=Entry(master)
e7=Entry(master)
e8=Entry(master)



e1.grid(row=1,column=1)
e2.grid(row=2,column=1)
e3.grid(row=3,column=1)
e4.grid(row=4,column=1)
e5.grid(row=5,column=1)
e6.grid(row=6,column=1)
e7.grid(row=7,column=1)
e8.grid(row=8,column=1)

Button(master,text="Predict",command=show_entry_fields).grid()


mainloop()
