## Importing modules and Loading dataset.
This section contains importing the important python modules. Also, the dataset to be used, in this case the   



In [1]:
# importing python module.
import pandas as pd
from lightgbm.sklearn import LGBMClassifier
from sklearn.preprocessing import RobustScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
# loading dataset with pandas
dia = pd.read_csv("./dataset/diabetes.csv")

dia.head()

Unnamed: 0,patient_number,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,height,weight,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio,diabetes
0,1,193,77,49,39,19,female,61,119,225,118,70,32,38,84,No diabetes
1,2,146,79,41,36,19,female,60,135,264,108,58,33,40,83,No diabetes
2,3,217,75,54,4,20,female,67,187,293,110,72,40,45,89,No diabetes
3,4,226,97,70,32,20,female,64,114,196,122,64,31,39,79,No diabetes
4,5,164,91,67,24,20,female,70,141,202,122,86,32,39,82,No diabetes


In [3]:
# wrangling datasets with pandas
dia.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390 entries, 0 to 389
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   patient_number   390 non-null    int64 
 1   cholesterol      390 non-null    int64 
 2   glucose          390 non-null    int64 
 3   hdl_chol         390 non-null    int64 
 4   chol_hdl_ratio   390 non-null    object
 5   age              390 non-null    int64 
 6   gender           390 non-null    object
 7   height           390 non-null    int64 
 8   weight           390 non-null    int64 
 9   bmi              390 non-null    object
 10  systolic_bp      390 non-null    int64 
 11  diastolic_bp     390 non-null    int64 
 12  waist            390 non-null    int64 
 13  hip              390 non-null    int64 
 14  waist_hip_ratio  390 non-null    object
 15  diabetes         390 non-null    object
dtypes: int64(11), object(5)
memory usage: 48.9+ KB


In [4]:
dia.describe()

Unnamed: 0,patient_number,cholesterol,glucose,hdl_chol,age,height,weight,systolic_bp,diastolic_bp,waist,hip
count,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0
mean,195.5,207.230769,107.338462,50.266667,46.774359,65.951282,177.407692,137.133333,83.289744,37.869231,42.992308
std,112.727548,44.666005,53.798188,17.279069,16.435911,3.918867,40.407824,22.859528,13.498192,5.760947,5.664342
min,1.0,78.0,48.0,12.0,19.0,52.0,99.0,90.0,48.0,26.0,30.0
25%,98.25,179.0,81.0,38.0,34.0,63.0,150.25,122.0,75.0,33.0,39.0
50%,195.5,203.0,90.0,46.0,44.5,66.0,173.0,136.0,82.0,37.0,42.0
75%,292.75,229.0,107.75,59.0,60.0,69.0,200.0,148.0,90.0,41.0,46.0
max,390.0,443.0,385.0,120.0,92.0,76.0,325.0,250.0,124.0,56.0,64.0


## Wrangling dataset.

In [5]:
dia.chol_hdl_ratio = round(dia.cholesterol / dia.hdl_chol,2)

In [6]:
dia.head()

Unnamed: 0,patient_number,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,height,weight,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio,diabetes
0,1,193,77,49,3.94,19,female,61,119,225,118,70,32,38,84,No diabetes
1,2,146,79,41,3.56,19,female,60,135,264,108,58,33,40,83,No diabetes
2,3,217,75,54,4.02,20,female,67,187,293,110,72,40,45,89,No diabetes
3,4,226,97,70,3.23,20,female,64,114,196,122,64,31,39,79,No diabetes
4,5,164,91,67,2.45,20,female,70,141,202,122,86,32,39,82,No diabetes


In [7]:
dia.waist_hip_ratio= round(dia.waist/dia.hip,2)

In [8]:
dia.head()

Unnamed: 0,patient_number,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,height,weight,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio,diabetes
0,1,193,77,49,3.94,19,female,61,119,225,118,70,32,38,0.84,No diabetes
1,2,146,79,41,3.56,19,female,60,135,264,108,58,33,40,0.82,No diabetes
2,3,217,75,54,4.02,20,female,67,187,293,110,72,40,45,0.89,No diabetes
3,4,226,97,70,3.23,20,female,64,114,196,122,64,31,39,0.79,No diabetes
4,5,164,91,67,2.45,20,female,70,141,202,122,86,32,39,0.82,No diabetes


In [9]:
# creating feature related to weight and height

dia["weight_height"] = round(dia.weight/dia.height, 2)

dia.head()

Unnamed: 0,patient_number,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,height,weight,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio,diabetes,weight_height
0,1,193,77,49,3.94,19,female,61,119,225,118,70,32,38,0.84,No diabetes,1.95
1,2,146,79,41,3.56,19,female,60,135,264,108,58,33,40,0.82,No diabetes,2.25
2,3,217,75,54,4.02,20,female,67,187,293,110,72,40,45,0.89,No diabetes,2.79
3,4,226,97,70,3.23,20,female,64,114,196,122,64,31,39,0.79,No diabetes,1.78
4,5,164,91,67,2.45,20,female,70,141,202,122,86,32,39,0.82,No diabetes,2.01


In [10]:
dia.bmi = pd.to_numeric(dia.bmi.str.replace(",","."))

In [11]:
dia.head()

Unnamed: 0,patient_number,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,height,weight,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio,diabetes,weight_height
0,1,193,77,49,3.94,19,female,61,119,22.5,118,70,32,38,0.84,No diabetes,1.95
1,2,146,79,41,3.56,19,female,60,135,26.4,108,58,33,40,0.82,No diabetes,2.25
2,3,217,75,54,4.02,20,female,67,187,29.3,110,72,40,45,0.89,No diabetes,2.79
3,4,226,97,70,3.23,20,female,64,114,19.6,122,64,31,39,0.79,No diabetes,1.78
4,5,164,91,67,2.45,20,female,70,141,20.2,122,86,32,39,0.82,No diabetes,2.01


In [12]:
dia.weight.describe()

count    390.000000
mean     177.407692
std       40.407824
min       99.000000
25%      150.250000
50%      173.000000
75%      200.000000
max      325.000000
Name: weight, dtype: float64

In [13]:
dia.isnull().sum()

patient_number     0
cholesterol        0
glucose            0
hdl_chol           0
chol_hdl_ratio     0
age                0
gender             0
height             0
weight             0
bmi                0
systolic_bp        0
diastolic_bp       0
waist              0
hip                0
waist_hip_ratio    0
diabetes           0
weight_height      0
dtype: int64

In [14]:
dia.rename(columns={"weight":"height", "height":"weight", "weight_height":"height_weight"}, inplace=True)

In [15]:
dia.head()

Unnamed: 0,patient_number,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,weight,height,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio,diabetes,height_weight
0,1,193,77,49,3.94,19,female,61,119,22.5,118,70,32,38,0.84,No diabetes,1.95
1,2,146,79,41,3.56,19,female,60,135,26.4,108,58,33,40,0.82,No diabetes,2.25
2,3,217,75,54,4.02,20,female,67,187,29.3,110,72,40,45,0.89,No diabetes,2.79
3,4,226,97,70,3.23,20,female,64,114,19.6,122,64,31,39,0.79,No diabetes,1.78
4,5,164,91,67,2.45,20,female,70,141,20.2,122,86,32,39,0.82,No diabetes,2.01


In [16]:
s= (dia.dtypes == "object")
obj_col= list(s[s].index)

In [17]:
obj_col

['gender', 'diabetes']

In [18]:
orde = OrdinalEncoder()
dia[obj_col] = orde.fit_transform(dia[obj_col])

In [19]:
dia.head()

Unnamed: 0,patient_number,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,weight,height,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio,diabetes,height_weight
0,1,193,77,49,3.94,19,0.0,61,119,22.5,118,70,32,38,0.84,1.0,1.95
1,2,146,79,41,3.56,19,0.0,60,135,26.4,108,58,33,40,0.82,1.0,2.25
2,3,217,75,54,4.02,20,0.0,67,187,29.3,110,72,40,45,0.89,1.0,2.79
3,4,226,97,70,3.23,20,0.0,64,114,19.6,122,64,31,39,0.79,1.0,1.78
4,5,164,91,67,2.45,20,0.0,70,141,20.2,122,86,32,39,0.82,1.0,2.01


In [20]:
X = dia.drop(["patient_number", "diabetes"], axis= 1)
y= dia.diabetes

In [21]:
scale = RobustScaler()
scaled_x = scale.fit_transform(X, y=y)

In [22]:
scaled_x

array([[-0.2       , -0.48598131,  0.14285714, ..., -0.57142857,
        -0.41025641, -0.91166078],
       [-1.14      , -0.41121495, -0.23809524, ..., -0.28571429,
        -0.61538462, -0.48763251],
       [ 0.28      , -0.56074766,  0.38095238, ...,  0.42857143,
         0.1025641 ,  0.27561837],
       ...,
       [ 1.96      ,  0.        ,  3.42857143, ..., -0.14285714,
        -1.23076923, -0.99646643],
       [ 0.58      ,  3.51401869,  3.23809524, ..., -0.57142857,
         0.41025641, -0.72791519],
       [-0.76      ,  0.14953271,  1.0952381 , ...,  1.28571429,
         1.23076923,  1.27915194]])

In [23]:
X_train, X_test, y_train, y_test = train_test_split(scaled_x, y, test_size= 0.2, random_state=42)

In [24]:
split = StratifiedShuffleSplit(n_splits=4, random_state=42 )

for train_index, test_index in split.split(scaled_x, y):
    strat_X, strat_test = scaled_x[train_index], scaled_x[test_index]
    strat_y, strat_ytest = y[train_index], y[test_index]

In [25]:
X_train

array([[-0.22      , -1.27102804, -0.19047619, ..., -0.28571429,
        -1.02564103, -0.61484099],
       [-0.44      ,  0.41121495, -0.0952381 , ...,  0.        ,
         0.20512821,  0.07773852],
       [ 0.18      , -0.41121495,  0.14285714, ...,  0.        ,
         0.20512821, -0.51590106],
       ...,
       [-1.48      ,  0.74766355, -0.19047619, ..., -0.57142857,
         0.1025641 , -0.7844523 ],
       [ 0.66      ,  0.78504673,  1.71428571, ..., -0.71428571,
        -1.02564103, -0.91166078],
       [ 2.68      , -0.18691589,  0.76190476, ...,  0.28571429,
        -0.61538462,  0.03533569]])

In [26]:
lgbm_model = LGBMClassifier(n_estimators=200, max_depth=-2, random_state=42)

In [27]:
lgbm_model.fit(X_train, y_train)

LGBMClassifier(max_depth=-2, n_estimators=200, random_state=42)

In [28]:
y_pred=lgbm_model.predict(X_test)

In [29]:
f1_score(y_pred, y_test)

0.9508196721311476

In [30]:
xg_model= XGBClassifier(n_estimators=200, max_depth=4, scale_pos_weight=5.5)

In [31]:
xg_model.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=5.5, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [32]:
xg_pred = xg_model.predict(X_test)

In [33]:
f1_score(y_test, xg_pred)

0.9508196721311476

## Stratified Shuffle Test

In [34]:
lgbm_strat = LGBMClassifier(n_estimators=200, learning_rate=0.0099, max_depth=-2, )

In [35]:
lgbm_strat.fit(strat_X, strat_y)

LGBMClassifier(learning_rate=0.0099, max_depth=-2, n_estimators=200)

In [36]:
strat_pred = lgbm_strat.predict(strat_test)

In [37]:
f1_score(strat_pred, strat_ytest)

0.955223880597015

In [38]:
dia.diabetes.value_counts()

1.0    330
0.0     60
Name: diabetes, dtype: int64

In [39]:
330/60

5.5

In [40]:
xgb= XGBClassifier(max_depth=7, n_estimators=1000, scale_pos_weight=5.5)

In [41]:
xgb.fit(strat_X, strat_y)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=5.5, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [42]:
y=xgb.predict(strat_test)

In [43]:
f1_score(y, strat_ytest)

0.955223880597015

In [44]:
import sqlite3

In [45]:
conn = sqlite3.connect("diabetes.db")
c = conn.cursor()

In [46]:
col= tuple(dia.columns)
col

('patient_number',
 'cholesterol',
 'glucose',
 'hdl_chol',
 'chol_hdl_ratio',
 'age',
 'gender',
 'weight',
 'height',
 'bmi',
 'systolic_bp',
 'diastolic_bp',
 'waist',
 'hip',
 'waist_hip_ratio',
 'diabetes',
 'height_weight')

In [47]:
conn.commit()

In [48]:
dia.to_sql(name="diabetes.db", con=conn, if_exists= "replace", index=False)

In [49]:
for index in split.split(scaled_x, y):
    X_total, y_total = scaled_x[index], y[index]

ValueError: Found input variables with inconsistent numbers of samples: [390, 39]

In [None]:
y