In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score,
                             precision_score,
                             recall_score,
                             f1_score,
                             roc_curve, roc_auc_score)
import joblib
import dill as pickle

In [4]:
df = pd.read_csv('diabetes.csv')
df.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
df.shape

(768, 9)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [7]:
class FeatureEngineering(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.epsilon = 1e-5

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        data = X.copy()
        data['PregnancyRatio'] = data['Pregnancies'] / (data['Age'] + self.epsilon)
        data['RiskScore'] = (0.5 * data['Glucose'] + 0.3 * data['BMI'] + 0.2 * data['Age'])
        data['InsulinEfficiency'] = (data['Insulin'] + self.epsilon) / (data['Glucose'] + self.epsilon)
        data['Glucose_BMI'] = (data['Glucose'] + self.epsilon) / (data['BMI'] + self.epsilon)
        data['BMI_Age'] = data['BMI'] * data['Age']
        return data

In [8]:
class WoEEncoding(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.feature_bins = {
            'Pregnancies': [-np.inf, 1.7, 5.1, 8.5, np.inf],
            'Glucose': [-np.inf, 90.6, 119.4, 159.2, np.inf],
            'BMI': [-np.inf, 26.84, 38.26, np.inf],
            'RiskScore': [-np.inf, 55.61, 77.51, 99.41, np.inf],
        }
        self.woe_mappings = {}

    def fit(self, X, y):
        y = pd.Series(y, name='target')
        for feature, bins in self.feature_bins.items():
            X[f'{feature}_cat'] = pd.cut(X[feature], bins=bins)
            woe_df = self._calculate_woe(X, f'{feature}_cat', y)
            self.woe_mappings[feature] = woe_df.set_index(f'{feature}_cat')['WOE'].to_dict()
        return self

    def transform(self, X):
        data = X.copy()
        for feature in self.feature_bins.keys():
            data[f'{feature}_cat'] = pd.cut(data[feature], bins=self.feature_bins[feature])
            data[f'{feature}_woe'] = data[f'{feature}_cat'].map(self.woe_mappings[feature])
            data.drop(columns=[f'{feature}_cat'], inplace=True)
        return data

    def _calculate_woe(self, data, feature_name, y):
        data['target'] = y
        grouped = data.groupby(feature_name, observed=False)['target'].value_counts().unstack(fill_value=0)
        grouped.columns = ['non_events', 'events']
        grouped['event_rate'] = grouped['events'] / grouped['events'].sum()
        grouped['non_event_rate'] = grouped['non_events'] / grouped['non_events'].sum()
        grouped['WOE'] = np.log(grouped['event_rate'] / grouped['non_event_rate'])
        return grouped.reset_index()

In [9]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.columns]

In [10]:
X = df.drop(columns=['Outcome'])
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
cv = StratifiedKFold(n_splits=5,
                     shuffle=True,
                     random_state=42)

In [11]:
# Required columns after feature engineering and WoE encoding
selected_columns = [
    'Pregnancies', 'Glucose', 'BMI', 'PregnancyRatio',
    'RiskScore', 'InsulinEfficiency', 'Glucose_BMI', 'BMI_Age',
    'Glucose_woe', 'RiskScore_woe'
]

# Pipeline setup
pipeline = Pipeline([
    ('feature_engineering', FeatureEngineering()),
    ('woe_encoding', WoEEncoding()),
    ('column_selector', ColumnSelector(selected_columns)),
    ('random_forest', RandomForestClassifier(max_depth=6,
                                             n_estimators=300,
                                             criterion='entropy'))
])

# Train and evaluate
X = df.drop(columns=['Outcome'])
y = df['Outcome']
cv_score = cross_val_score(pipeline, X, y, cv=cv, scoring='roc_auc').mean()
pipeline.fit(X, y)
print(f"Cross-validated AUC: {cv_score:.4f}")

Cross-validated AUC: 0.8378


In [12]:
thresholds = 0.32
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba >= thresholds).astype(int)

f1_result = f1_score(y_test, y_pred)
accuracy_result = accuracy_score(y_test, y_pred)
recall_result = recall_score(y_test, y_pred)
precision_result = precision_score(y_test, y_pred)

print(f"F1 Score: {f1_result:.4f}")
print(f"Accuracy Score: {accuracy_result:.4f}")
print(f"Recall Score: {recall_result:.4f}")
print(f"Precision Score: {precision_result:.4f}")
print(f"ROC AUC Score: {cv_score:.4f}")

F1 Score: 0.7612
Accuracy Score: 0.7922
Recall Score: 0.9444
Precision Score: 0.6375
ROC AUC Score: 0.8378
