In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subfolders, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gestational-diabetes/Gestational Diabetes.csv


In [2]:
# Read the gestational-diabetes CSV from the Kaggle input directory into a DataFrame,
# then show its first five rows as the cell output.
data = pd.read_csv(
    "/kaggle/input/gestational-diabetes/Gestational Diabetes.csv",
)
data.head(n=5)

Unnamed: 0,Age,Pregnancy No,Weight,Height,BMI,Heredity,Prediction
0,17.0,1.0,48.0,165.0,17.6,0,0
1,17.0,1.0,49.0,145.0,23.3,0,0
2,17.0,1.0,50.0,140.0,25.5,0,0
3,17.0,1.0,50.0,145.0,23.8,0,0
4,17.0,1.0,49.0,146.0,23.0,0,0


In [3]:
# Print a structural summary of the frame: index range, per-column non-null counts,
# column dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1012 entries, 0 to 1011
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Age           1012 non-null   float64
 1   Pregnancy No  1012 non-null   float64
 2   Weight        1012 non-null   float64
 3   Height        1012 non-null   float64
 4   BMI           1012 non-null   float64
 5   Heredity      1012 non-null   int64  
 6   Prediction    1012 non-null   int64  
dtypes: float64(5), int64(2)
memory usage: 55.5 KB


In [4]:
# Data-quality checks: count missing values per column and count rows that are
# exact duplicates of an earlier row.
missing_per_column = data.isna().sum()
duplicate_row_count = data.duplicated().sum()

print(missing_per_column)
print(duplicate_row_count)

Age             0
Pregnancy No    0
Weight          0
Height          0
BMI             0
Heredity        0
Prediction      0
dtype: int64
94


In [5]:
# Drop rows that are exact duplicates of an earlier row (the first occurrence is kept).
# The result is rebound to `data` instead of using inplace=True: in-place mutation
# offers no performance benefit and hides the state change from readers of the notebook.
data = data.drop_duplicates()

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# Target is the 'Prediction' column; every other column is used as a feature.
y = data['Prediction']
X = data.drop(columns='Prediction')

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features. The scaler is fitted on the training split only and the
# same transformation is then applied to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Baseline ensemble classifiers, all with library-default hyper-parameters and a
# fixed seed. Insertion order determines the order of the printed reports.
models = dict([
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('AdaBoost', AdaBoostClassifier(random_state=42)),
    ('Extra Trees', ExtraTreesClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42)),
    ('CatBoost', CatBoostClassifier(random_state=42, verbose=0)),
    ('LightGBM', LGBMClassifier(random_state=42, verbose=0)),
])

# Fit each model on the training split and report its performance on the test split.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    # One print call with a newline separator emits the same seven lines/blocks
    # as seven consecutive print statements would.
    print(
        f'{name}',
        f'Accuracy: {accuracy}',
        'Confusion Matrix:',
        conf_matrix,
        'Classification Report:',
        class_report,
        '-------------------------------',
        sep='\n',
    )

Random Forest
Accuracy: 0.8641304347826086
Confusion Matrix:
[[142   2]
 [ 23  17]]
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92       144
           1       0.89      0.42      0.58        40

    accuracy                           0.86       184
   macro avg       0.88      0.71      0.75       184
weighted avg       0.87      0.86      0.84       184

-------------------------------
Gradient Boosting
Accuracy: 0.8858695652173914
Confusion Matrix:
[[142   2]
 [ 19  21]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.99      0.93       144
           1       0.91      0.53      0.67        40

    accuracy                           0.89       184
   macro avg       0.90      0.76      0.80       184
weighted avg       0.89      0.89      0.87       184

-------------------------------
AdaBoost
Accuracy: 0.8369565217391305
Confusion Matrix:
[[136   

In [7]:
from sklearn.model_selection import GridSearchCV

# Random forest: exhaustive grid search over forest size and tree-shape limits.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [1,2,3,5,7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# 3-fold cross-validated search on the training split, using all available cores.
# No `scoring` is passed, so candidates are ranked by the estimator's default score.
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=3, n_jobs=-1, verbose=0)
grid_search.fit(X_train, y_train)

# GridSearchCV uses refit=True by default, so `best_estimator_` has already been
# retrained on the whole training split with the winning parameters. Calling fit on
# it again would only repeat that work (the estimator is seeded), so it is used as-is.
best_rfc = grid_search.best_estimator_
y_pred_optimized = best_rfc.predict(X_test)

# Evaluate the tuned model on the held-out test split.
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
conf_matrix_optimized = confusion_matrix(y_test, y_pred_optimized)
class_report_optimized = classification_report(y_test, y_pred_optimized)

print(f'Optimized Accuracy: {accuracy_optimized}')
print('Optimized Confusion Matrix:')
print(conf_matrix_optimized)
print('Optimized Classification Report:')
print(class_report_optimized)

Optimized Accuracy: 0.8804347826086957
Optimized Confusion Matrix:
[[143   1]
 [ 21  19]]
Optimized Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.99      0.93       144
           1       0.95      0.47      0.63        40

    accuracy                           0.88       184
   macro avg       0.91      0.73      0.78       184
weighted avg       0.89      0.88      0.86       184



In [8]:
# Gradient boosting: exhaustive grid search over ensemble size, learning rate,
# tree-shape limits and the row-subsampling fraction.
gb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [1,2,3,5,7],
    'min_samples_split': [2, 5, 7],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.5, 0.8, 1.0]
}

# 3-fold cross-validated search on the training split, using all available cores.
# No `scoring` is passed, so candidates are ranked by the estimator's default score.
grid_search = GridSearchCV(estimator=GradientBoostingClassifier(random_state=42), param_grid=gb_params, cv=3, n_jobs=-1, verbose=0)
grid_search.fit(X_train, y_train)

# GridSearchCV uses refit=True by default, so `best_estimator_` has already been
# retrained on the whole training split with the winning parameters. Calling fit on
# it again would only repeat that work (the estimator is seeded), so it is used as-is.
best_gb = grid_search.best_estimator_
y_pred_optimized = best_gb.predict(X_test)

# Evaluate the tuned model on the held-out test split.
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
conf_matrix_optimized = confusion_matrix(y_test, y_pred_optimized)
class_report_optimized = classification_report(y_test, y_pred_optimized)

print(f'Optimized Accuracy: {accuracy_optimized}')
print('Optimized Confusion Matrix:')
print(conf_matrix_optimized)
print('Optimized Classification Report:')
print(class_report_optimized)

Optimized Accuracy: 0.8641304347826086
Optimized Confusion Matrix:
[[141   3]
 [ 22  18]]
Optimized Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.98      0.92       144
           1       0.86      0.45      0.59        40

    accuracy                           0.86       184
   macro avg       0.86      0.71      0.75       184
weighted avg       0.86      0.86      0.85       184



In [13]:
# CatBoost: exhaustive grid search over iteration count, learning rate, tree depth,
# L2 leaf regularisation and the row-subsampling fraction.
cb_params = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'depth': [1,2,3, 5, 7],
    'l2_leaf_reg': [1, 3, 5, 7],
    'subsample': [0.5, 0.8, 1.0],
}

# 3-fold cross-validated search on the training split, using all available cores.
# No `scoring` is passed, so candidates are ranked by the estimator's default score.
grid_search = GridSearchCV(estimator=CatBoostClassifier(random_state=42, verbose=0), param_grid=cb_params, cv=3, n_jobs=-1, verbose=0)
grid_search.fit(X_train, y_train)

# GridSearchCV uses refit=True by default, so `best_estimator_` has already been
# retrained on the whole training split with the winning parameters. Fitting it a
# second time would only repeat that training, so the refitted model is used as-is.
best_cb = grid_search.best_estimator_
y_pred_optimized = best_cb.predict(X_test)

# Evaluate the tuned model on the held-out test split.
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
conf_matrix_optimized = confusion_matrix(y_test, y_pred_optimized)
class_report_optimized = classification_report(y_test, y_pred_optimized)

print(f'Optimized Accuracy: {accuracy_optimized}')
print('Optimized Confusion Matrix:')
print(conf_matrix_optimized)
print('Optimized Classification Report:')
print(class_report_optimized)

Optimized Accuracy: 0.8641304347826086
Optimized Confusion Matrix:
[[143   1]
 [ 24  16]]
Optimized Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92       144
           1       0.94      0.40      0.56        40

    accuracy                           0.86       184
   macro avg       0.90      0.70      0.74       184
weighted avg       0.87      0.86      0.84       184



In [11]:
# XGBoost: exhaustive grid search over ensemble size, tree depth, learning rate and
# the row / column subsampling fractions.
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [1,2,3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# 3-fold cross-validated search on the training split, using all available cores.
# No `scoring` is passed, so candidates are ranked by the estimator's default score.
grid_search = GridSearchCV(estimator=XGBClassifier(random_state=42), param_grid=xgb_params, cv=3, n_jobs=-1, verbose=0)
grid_search.fit(X_train, y_train)

# GridSearchCV uses refit=True by default, so `best_estimator_` has already been
# retrained on the whole training split with the winning parameters. Fitting it a
# second time would only repeat that training, so the refitted model is used as-is.
best_xgb = grid_search.best_estimator_
y_pred_optimized = best_xgb.predict(X_test)

# Evaluate the tuned model on the held-out test split.
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
conf_matrix_optimized = confusion_matrix(y_test, y_pred_optimized)
class_report_optimized = classification_report(y_test, y_pred_optimized)

print(f'Optimized Accuracy: {accuracy_optimized}')
print('Optimized Confusion Matrix:')
print(conf_matrix_optimized)
print('Optimized Classification Report:')
print(class_report_optimized)

Optimized Accuracy: 0.8695652173913043
Optimized Confusion Matrix:
[[143   1]
 [ 23  17]]
Optimized Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92       144
           1       0.94      0.42      0.59        40

    accuracy                           0.87       184
   macro avg       0.90      0.71      0.75       184
weighted avg       0.88      0.87      0.85       184

