### Modelling 

In [1]:
#Importing neccesary packages
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
from xgboost import XGBClassifier

In [2]:
# Loading the data 
data = pd.read_csv('Data/Preprocessed_data.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   gender                   100000 non-null  object 
 1   age                      100000 non-null  float64
 2   hypertension             100000 non-null  int64  
 3   heart_disease            100000 non-null  int64  
 4   smoking_history          100000 non-null  object 
 5   encoded_smoking_history  100000 non-null  int64  
 6   bmi                      100000 non-null  float64
 7   HbA1c_level              100000 non-null  float64
 8   blood_glucose_level      100000 non-null  int64  
 9   diabetes                 100000 non-null  int64  
dtypes: float64(3), int64(5), object(2)
memory usage: 7.6+ MB


In [4]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,encoded_smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,4,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,0,27.32,6.6,80,0
2,Male,28.0,0,0,never,4,27.32,5.7,158,0
3,Female,36.0,0,0,current,1,23.45,5.0,155,0
4,Male,76.0,1,1,current,1,20.14,4.8,155,0


In [5]:
# Split the dataset into features (X) and target (y)
X = data.drop(['diabetes', 'gender', 'smoking_history'], axis=1)
y = data['diabetes']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Convert the data into DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

# Define the parameters for the XGBoost model
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,              # Increase max_depth to increase model complexity
    'learning_rate': 0.1,        # You can experiment with different learning rates
    'n_estimators': 200,        # Increase n_estimators for a more complex model
    'gamma': 0.5,                # Adjust gamma as needed for complexity control
    'subsample': 0.8,            # You can experiment with different subsample values
    'colsample_bytree': 0.8,     # You can experiment with different colsample_bytree values
    'alpha': 0.1,      # L1 regularization (Lasso)
    'lambda': 0.1,     # L2 regularization (Ridge)
}

# Create and train the XGBoost model using XGBClassifier
#model = XGBClassifier(**params)
#model.fit(X_train, y_train)


#### Strategies to address data imbalance in the model

In [7]:
# Calculate class weights
class_weights = len(y_train) / np.bincount(y_train)

# Create and train the XGBoost model with class weights
model = XGBClassifier(scale_pos_weight=class_weights[0] / class_weights[1], **params)
model.fit(X_train, y_train)

XGBClassifier(alpha=0.1, base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=0.5, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None, lambda=0.1,
              learning_rate=0.1, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=6,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, ...)

In [8]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [9]:
from imblearn.over_sampling import SMOTE

In [10]:
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [11]:
model = XGBClassifier(scale_pos_weight=class_weights[0] / class_weights[1], **params)
model.fit(X_train_resampled, y_train_resampled)

XGBClassifier(alpha=0.1, base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=0.5, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None, lambda=0.1,
              learning_rate=0.1, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=6,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, ...)

In [12]:
# Predict the target variable for the test set
y_pred = model.predict(X_test)


In [13]:
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.97235


In [14]:
# Calculate the Mean Squared Error (MSE) score
mse_score = mean_squared_error(y_test, y_pred)
print(f"MSE Score: {mse_score}")

MSE Score: 0.02765


In [15]:
# Print the confusion matrix to see true positive, true negative, false positive, and false negative
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[18292     0]
 [  553  1155]]


In [16]:
# Perform cross-validation to estimate model performance
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Scores:")
print(cv_scores)
print(f"Mean CV Accuracy: {np.mean(cv_scores)}")

Cross-Validation Scores:
[0.97215 0.9726  0.96955 0.973   0.97205]
Mean CV Accuracy: 0.97187


In [17]:
from sklearn.metrics import classification_report

# Predict the target variable for the train set
y_train_pred = model.predict(X_train)

# Calculate precision, recall, and F1-score for the train set
train_report = classification_report(y_train, y_train_pred, target_names=['Non-Diabetes', 'Diabetes'])

print("Training Set Classification Report:")
print(train_report)


Training Set Classification Report:
              precision    recall  f1-score   support

Non-Diabetes       0.97      1.00      0.98     73208
    Diabetes       1.00      0.67      0.80      6792

    accuracy                           0.97     80000
   macro avg       0.99      0.83      0.89     80000
weighted avg       0.97      0.97      0.97     80000



In [18]:
# Predict the target variable for the test set
y_test_pred = model.predict(X_test)

# Calculate precision, recall, and F1-score for the test set
test_report = classification_report(y_test, y_test_pred, target_names=['Non-Diabetes', 'Diabetes'])

print("Test Set Classification Report:")
print(test_report)


Test Set Classification Report:
              precision    recall  f1-score   support

Non-Diabetes       0.97      1.00      0.99     18292
    Diabetes       1.00      0.68      0.81      1708

    accuracy                           0.97     20000
   macro avg       0.99      0.84      0.90     20000
weighted avg       0.97      0.97      0.97     20000



In [None]:
# Function to preprocess the user input data
def preprocess_user_data(user_data):
    # Encode gender
    user_data['gender'] = user_data['gender'].apply(lambda x: 1 if x.lower() == 'male' else 0)

    # Encode smoking_history
    smoking_mapping = {
        'never': 4,
        'No Info': 0,
        'current': 1,
        'former': 3,
        'ever': 2,
        'not current': 5
    }
    user_data['smoking_history'] = user_data['smoking_history'].map(smoking_mapping)

    return user_data

# Function to take input from the user and make predictions
def predict_diabetes(model):
    # Get user input for features
    gender = input("Enter gender (Male/Female): ")
    age = float(input("Enter age: "))
    hypertension = int(input("Enter hypertension (0 for No, 1 for Yes): "))
    heart_disease = int(input("Enter heart disease (0 for No, 1 for Yes): "))
    smoking_history = input("Enter smoking history (never/No Info/current/former/ever/not current): ")
    bmi = float(input("Enter BMI: "))
    hba1c_level = float(input("Enter HbA1c level: "))
    blood_glucose_level = int(input("Enter blood glucose level: "))

    # Create a DataFrame with the user input
    user_data = pd.DataFrame({
        'gender': [gender],
        'age': [age],
        'hypertension': [hypertension],
        'heart_disease': [heart_disease],
        'smoking_history': [smoking_history],
        'bmi': [bmi],
        'HbA1c_level': [hba1c_level],
        'blood_glucose_level': [blood_glucose_level]
    })

    # Preprocess user input
    user_data = preprocess_user_data(user_data)

    # Ensure the user input data has the same columns as the training data
    missing_cols = set(X_train.columns) - set(user_data.columns)
    for col in missing_cols:
        user_data[col] = 0

    user_data = user_data[X_train.columns]

    # Use the trained model to make predictions on the preprocessed user input
    prediction = model.predict_proba(user_data)[:, 1][0]

    # Interpret the prediction
    if prediction < 0.5:
        print("Based on the input, the person is predicted to NOT have diabetes.")
    else:
        print("Based on the input, the person is predicted to have diabetes.")

# Call the predict_diabetes function to make predictions
predict_diabetes(model)


In [None]:
predict_diabetes(model)

In [None]:
predict_diabetes(model)