<a href="https://colab.research.google.com/github/gunelatakishyeva/MachineLearningProjects/blob/main/diabetes_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('/content/diabetes_prediction_dataset.csv')
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [4]:
df=df.drop('age',axis=1)

In [5]:
X=df.drop('diabetes',axis=1)
y=df['diabetes'].copy()

In [6]:
X

Unnamed: 0,gender,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
0,Female,0,1,never,25.19,6.6,140
1,Female,0,0,No Info,27.32,6.6,80
2,Male,0,0,never,27.32,5.7,158
3,Female,0,0,current,23.45,5.0,155
4,Male,1,1,current,20.14,4.8,155
...,...,...,...,...,...,...,...
99995,Female,0,0,No Info,27.32,6.2,90
99996,Female,0,0,No Info,17.37,6.5,100
99997,Male,0,0,former,27.83,5.7,155
99998,Female,0,0,never,35.42,4.0,100


In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [8]:
cat_features=X_train.select_dtypes(exclude=[np.number]).columns
num_features=X_train.select_dtypes(include=[np.number]).columns

In [9]:
cat_features

Index(['gender', 'smoking_history'], dtype='object')

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

In [11]:
cat_pipeline=make_pipeline(
    (SimpleImputer(strategy='most_frequent')),
    (OneHotEncoder(handle_unknown='ignore'))
)

num_pipeline=make_pipeline(
    StandardScaler()
)

transformer=ColumnTransformer([
    ('cat_pipeline',cat_pipeline,cat_features),
    ('num_pipeline',num_pipeline,num_features)
])

In [12]:
X_transformed_train=transformer.fit_transform(X_train)
transformed_columns=transformer.get_feature_names_out()

In [13]:
transformed_columns

array(['cat_pipeline__gender_Female', 'cat_pipeline__gender_Male',
       'cat_pipeline__gender_Other',
       'cat_pipeline__smoking_history_No Info',
       'cat_pipeline__smoking_history_current',
       'cat_pipeline__smoking_history_ever',
       'cat_pipeline__smoking_history_former',
       'cat_pipeline__smoking_history_never',
       'cat_pipeline__smoking_history_not current',
       'num_pipeline__hypertension', 'num_pipeline__heart_disease',
       'num_pipeline__bmi', 'num_pipeline__HbA1c_level',
       'num_pipeline__blood_glucose_level'], dtype=object)

In [14]:
X_transformed_test=transformer.transform(X_test)

In [15]:
from sklearn.tree import DecisionTreeClassifier

In [16]:
dt_clf=DecisionTreeClassifier(max_depth=3)

In [17]:
dt_clf.fit(X_transformed_train,y_train)

In [18]:
dt_clf.score(X_transformed_test,y_test)

0.97215

In [19]:
dt_clf.score(X_transformed_train,y_train)

0.9718

In [20]:
from sklearn.ensemble import RandomForestClassifier
rf_clf=RandomForestClassifier(max_depth=15)

In [21]:
rf_clf.fit(X_transformed_train,y_train)

In [22]:
rf_clf.score(X_transformed_test,y_test)

0.9724

In [23]:
rf_clf.score(X_transformed_train,y_train)

0.9762375

In [24]:
from sklearn.svm import SVC
svc=SVC()
svc.fit(X_transformed_train,y_train)

In [25]:
svc.score(X_transformed_test,y_test)

0.96165

In [26]:
svc.score(X_transformed_train,y_train)

0.96385

In [27]:
from sklearn.model_selection import cross_val_predict
y_pred_train=cross_val_predict(dt_clf,X_transformed_train,y_train,cv=5)

In [28]:
from sklearn.metrics import f1_score

In [29]:
f1_score(y_train,y_pred_train)

0.8008474576271187

In [36]:
from sklearn.ensemble import VotingClassifier
voting_clf=VotingClassifier(estimators=[
    ('dt_clf',dt_clf),
    ('rf_clf',rf_clf),
    ('svc',SVC(probability=True))],
    voting='soft')

In [37]:
voting_clf.fit(X_transformed_train,y_train)

In [38]:
voting_clf.score(X_transformed_test,y_test)

0.9722

In [39]:

voting_clf.score(X_transformed_train,y_train)

0.972925