# 5. Predictive Modeling


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.base import clone
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Location of the processed appointment dataset.
# NOTE(review): this is an absolute, machine-specific path — the notebook only
# runs on a machine where this exact file exists.
DATA_PATH = "C:/Research/Msc/CMM709/CAUSALITY-EXPLORE/data/processed/medical_appointment_no_show_final.csv"

final_dt = pd.read_csv(DATA_PATH)

In [None]:
# Display the loaded DataFrame as the cell output.
final_dt

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
final_dt.info()

## 5.1 Split the Dataset

In [None]:
# Target (y) is the `no_show` column; features (x) are every other column.
y = final_dt['no_show']
x = final_dt.drop(columns='no_show')

# One-hot encode the categorical features, dropping the first level of each.
X = pd.get_dummies(x, drop_first=True)

# Optional mean imputation of missing values (currently disabled):
# impute = SimpleImputer(strategy='mean')
# X = pd.DataFrame(impute.fit_transform(X), columns=X.columns)

# Stratified 80% / 20% train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the shape of each partition.
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

## 5.2 Train and Evaluate Models
<p>Train and evaluate three models:</p>
<ol>
  <li>Logistic Regression</li>
  <li>Random Forest</li>
  <li>XGBoost</li>
</ol>

In [None]:
from xgboost import XGBClassifier
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, classification_report

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Function to evaluate model performance
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on SMOTE-resampled training data and report test metrics.

    The training split is oversampled with SMOTE (``random_state=42``), the
    model is fitted on the resampled data, and its predictions on the
    untouched test split are scored. Accuracy, precision, recall, F1 and
    ROC AUC are printed, followed by the class counts before and after
    resampling and the classification report. The report and the confusion
    matrix are then drawn as two side-by-side heatmaps.

    Parameters
    ----------
    model
        Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train
        Training features and labels; resampled before fitting.
    X_test, y_test
        Held-out features and labels, used only for scoring.

    Returns
    -------
    The same ``model`` object that was passed in, fitted in place.
    """
    # Oversample on the training split only; the test split is left as-is.
    sampler = SMOTE(random_state=42)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

    # Fit on the balanced training data.
    model.fit(X_resampled, y_resampled)

    # Hard labels plus the probability of the positive class (column 1).
    predicted = model.predict(X_test)
    positive_proba = model.predict_proba(X_test)[:, 1]

    # Score the model on the test split.
    accuracy = accuracy_score(y_test, predicted)
    precision = precision_score(y_test, predicted)
    recall = recall_score(y_test, predicted)
    f1 = f1_score(y_test, predicted)
    roc_auc = roc_auc_score(y_test, positive_proba)

    cm = confusion_matrix(y_test, predicted)

    # Classification report as a DataFrame (one row per class / average).
    report_dict = classification_report(y_test, predicted, output_dict=True)
    class_report = pd.DataFrame(report_dict).transpose()

    # Headline metrics.
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")

    # Class distribution before and after resampling.
    print("Class Imbalance handling SMOTE\n")
    print("--------------------------------")
    print(f"Before SMOTE:", Counter(y_train))
    print(f"After SMOTE:", Counter(y_resampled))

    print("\n")
    print("Classification Report:")
    print("----------------------")
    print(class_report)

    # Side-by-side heatmaps: classification report (left), confusion matrix (right).
    print("\n")
    print("Confusion Matrix:")
    print("-----------------")
    _, (report_ax, matrix_ax) = plt.subplots(1, 2, figsize=(8, 4))

    sns.heatmap(class_report, annot=True, fmt='.2f', cmap='crest', cbar=False, ax=report_ax)
    report_ax.set_title('Classification Report')

    sns.heatmap(cm, annot=True, fmt='d', cmap='Greens', cbar=False, ax=matrix_ax)
    matrix_ax.set_title('Confusion Matrix')

    plt.show()

    return model

In [None]:
# Instantiate the three candidate classifiers, all seeded with random_state=42.
# NOTE(review): the first two use class_weight='balanced' and are also trained
# on SMOTE-resampled data inside evaluate_model — confirm both are intended.
logistic_regression_model = LogisticRegression(
    random_state=42,
    max_iter=10000,
    class_weight='balanced',
)
random_forest_classifier_model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight='balanced',
)
XGBClassifier_model = XGBClassifier(
    n_estimators=200,
    random_state=42,
)

### Evaluate Logistic Regression

In [None]:
# Train and score logistic regression on the full feature set.
# evaluate_model returns the estimator it was given, so `log_reg` refers to
# the same object as `logistic_regression_model`.
print("Logistic Regression Results:")
print("----------------------------")
log_reg = evaluate_model(
    model=logistic_regression_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

### Evaluate Random Forest Classifier

In [None]:
# Train and score the random forest on the full feature set.
# evaluate_model returns the estimator it was given, so `rf_classifier` refers
# to the same object as `random_forest_classifier_model`.
print("Evaluating Random Forest Model:")
print("-------------------------------")
rf_classifier = evaluate_model(
    model=random_forest_classifier_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

### Evaluate XGBoost Classifier

In [None]:
# Train and score XGBoost on the full feature set.
# evaluate_model returns the estimator it was given, so `xgboost_model` refers
# to the same object as `XGBClassifier_model`.
print("XGBoost Classifier Results:")
print("---------------------------")
xgboost_model = evaluate_model(
    model=XGBClassifier_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

## 5.3 Improve Model Performance

In [None]:
# Drop the one-hot encoded neighbourhood columns to reduce dimensionality.
# Start from the encoded feature matrix `X`, not from `final_dt`: `final_dt`
# still contains the `no_show` target, so building the features from it would
# leak the label into the model inputs.
neighbourhood_cols = [col for col in X.columns if col.startswith('neighbourhood_')]
X_reduced = X.drop(columns=neighbourhood_cols)

# Guard against target leakage in the reduced feature matrix.
assert 'no_show' not in X_reduced.columns, "target column leaked into the reduced features"

# Re-split the reduced data with the same settings as the full-feature split.
X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = train_test_split(
    X_reduced,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set shape: {X_train_reduced.shape}")
print(f"Test set shape: {X_test_reduced.shape}")


In [None]:
# Evaluate each model on the reduced feature set.
# evaluate_model fits the estimator it receives in place and returns that same
# object. Passing the original estimators would therefore silently re-fit the
# models trained on the full feature set (`log_reg`, `rf_classifier`,
# `xgboost_model`), so unfitted clones with identical parameters are used.
print("Logistic Regression Results (reduced features):")
print("-----------------------------------------------")
log_reg_reduced = evaluate_model(
    clone(logistic_regression_model),
    X_train_reduced, y_train_reduced, X_test_reduced, y_test_reduced,
)

print("Random Forest Results (reduced features):")
print("-----------------------------------------")
rf_classifier_reduced = evaluate_model(
    clone(random_forest_classifier_model),
    X_train_reduced, y_train_reduced, X_test_reduced, y_test_reduced,
)

print("XGBoost Classifier Results (reduced features):")
print("----------------------------------------------")
xgboost_model_reduced = evaluate_model(
    clone(XGBClassifier_model),
    X_train_reduced, y_train_reduced, X_test_reduced, y_test_reduced,
)

## 5.4 Hyperparameter Tuning
<p>

Use `GridSearchCV` and `RandomizedSearchCV` to tune the hyperparameters of the best-performing model, Random Forest; the same searches are also run for Logistic Regression and XGBoost.

</p>

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Hyperparameter search spaces, one per model.

# Logistic regression
param_grid_log_reg_a = dict(
    C=[0.01, 0.1, 1, 10, 100],       # regularization strength
    penalty=['l1', 'l2'],            # regularization type
    solver=['liblinear', 'saga'],    # solvers that support the l1 penalty
)

# Random forest
param_grid_random_forest_a = dict(
    n_estimators=[50, 100, 200],     # number of trees in the forest
    max_depth=[None, 10, 20, 30],    # maximum depth of each tree
    min_samples_split=[2, 5, 10],    # minimum samples needed to split an internal node
    min_samples_leaf=[1, 2, 4],      # minimum samples required at a leaf node
)

# XGBoost
param_grid_xgb_a = dict(
    n_estimators=[50, 100, 200],          # number of trees in the ensemble
    learning_rate=[0.01, 0.1, 0.2],       # step size shrinkage applied at each update
    max_depth=[3, 5, 7],                  # maximum depth of a tree
    subsample=[0.8, 0.9, 1.0],            # fraction of training rows sampled per tree
    colsample_bytree=[0.8, 0.9, 1.0],     # fraction of columns sampled per tree
)

#### 1. GridSearchCV

In [None]:
# Exhaustive 5-fold grid search for Logistic Regression, scored on accuracy.
# NOTE(review): the search is fitted on the un-resampled training split,
# whereas evaluate_model trains on SMOTE-resampled data — confirm that
# tuning without resampling (and with accuracy scoring) is intended.
print("Tuning Logistic Regression....")
grid_log_reg = GridSearchCV(
    estimator=logistic_regression_model,
    param_grid=param_grid_log_reg_a,
    scoring='accuracy',
    cv=5,
    n_jobs=10,
    verbose=3,
)
grid_log_reg.fit(X_train, y_train)
print(f"Best parameters for Logistic Regression: {grid_log_reg.best_params_}")
print(f"Best cross-validation score for Logistic Regression: {grid_log_reg.best_score_:.4f}")

In [None]:
# Exhaustive 5-fold grid search for Random Forest, scored on accuracy.
print("\nTuning Random Forest...")
grid_rf = GridSearchCV(
    estimator=random_forest_classifier_model,
    param_grid=param_grid_random_forest_a,
    scoring='accuracy',
    cv=5,
    n_jobs=10,
    verbose=3,
)
grid_rf.fit(X_train, y_train)
print(f"Best parameters for Random Forest: {grid_rf.best_params_}")
print(f"Best cross-validation score for Random Forest: {grid_rf.best_score_:.4f}")

In [None]:
# Exhaustive 5-fold grid search for XGBoost, scored on accuracy.
print("\nTuning XGBoost...")
grid_xgb = GridSearchCV(
    estimator=XGBClassifier_model,
    param_grid=param_grid_xgb_a,
    scoring='accuracy',
    cv=5,
    n_jobs=10,
    verbose=3,
)
grid_xgb.fit(X_train, y_train)
print(f"Best parameters for XGBoost: {grid_xgb.best_params_}")
print(f"Best cross-validation score for XGBoost: {grid_xgb.best_score_:.4f}")

#### 2. RandomizedSearchCV

In [None]:
# Randomized 5-fold search for Logistic Regression: 10 candidates sampled from
# the grid and scored on accuracy. random_state is fixed so the sampled
# candidates (and therefore the reported best parameters) are reproducible.
print("\nTuning Logistic Regression...")
random_log_reg = RandomizedSearchCV(
    logistic_regression_model,
    param_grid_log_reg_a,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    verbose=3,
    random_state=42,
)
random_log_reg.fit(X_train, y_train)
print("Best parameters for Logistic Regression:", random_log_reg.best_params_)
print("Best cross-validation score for Logistic Regression:", random_log_reg.best_score_)

In [None]:
# Randomized 5-fold search for Random Forest: 10 candidates sampled from the
# grid and scored on accuracy. random_state is fixed so the sampled candidates
# (and therefore the reported best parameters) are reproducible.
print("\nTuning Random Forest...")
random_rf = RandomizedSearchCV(
    random_forest_classifier_model,
    param_grid_random_forest_a,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    verbose=3,
    random_state=42,
)
random_rf.fit(X_train, y_train)
print("Best parameters for Random Forest:", random_rf.best_params_)
print("Best cross-validation score for Random Forest:", random_rf.best_score_)

In [None]:
# Randomized 5-fold search for XGBoost: 10 candidates sampled from the grid
# and scored on accuracy. random_state is fixed so the sampled candidates
# (and therefore the reported best parameters) are reproducible.
print("\nTuning XGBoost...")
random_xgb = RandomizedSearchCV(
    XGBClassifier_model,
    param_grid_xgb_a,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    verbose=3,
    random_state=42,
)
random_xgb.fit(X_train, y_train)
print("Best parameters for XGBoost:", random_xgb.best_params_)
print("Best cross-validation score for XGBoost:", random_xgb.best_score_)

#### 3. RandomizedSearchCV

In [None]:
# Randomized 5-fold search for Logistic Regression (10 sampled candidates,
# accuracy scoring). random_state is fixed so the result is reproducible.
# NOTE(review): this cell repeats the Logistic Regression search from the
# previous RandomizedSearchCV section and overwrites `random_log_reg` —
# confirm whether a different search was intended here.
print("\nTuning Logistic Regression...")
random_log_reg = RandomizedSearchCV(
    logistic_regression_model,
    param_grid_log_reg_a,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    verbose=3,
    random_state=42,
)
random_log_reg.fit(X_train, y_train)
print("Best parameters for Logistic Regression:", random_log_reg.best_params_)
print("Best cross-validation score for Logistic Regression:", random_log_reg.best_score_)

In [None]:
# Randomized 5-fold search for Random Forest (10 sampled candidates, accuracy
# scoring). random_state is fixed so the result is reproducible.
# NOTE(review): this cell repeats the Random Forest search from the previous
# RandomizedSearchCV section and overwrites `random_rf` — confirm whether a
# different search was intended here.
print("\nTuning Random Forest...")
random_rf = RandomizedSearchCV(
    random_forest_classifier_model,
    param_grid_random_forest_a,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    verbose=3,
    random_state=42,
)
random_rf.fit(X_train, y_train)
print("Best parameters for Random Forest:", random_rf.best_params_)
print("Best cross-validation score for Random Forest:", random_rf.best_score_)

In [None]:
# Randomized 5-fold search for XGBoost (10 sampled candidates, accuracy
# scoring). random_state is fixed so the result is reproducible.
# NOTE(review): this cell repeats the XGBoost search from the previous
# RandomizedSearchCV section. The result name `random_xgh` differs from the
# `random_xgb` used there — possibly a typo; it is kept unchanged in case it
# is referenced elsewhere.
print("\nTuning XGBoost...")
random_xgh = RandomizedSearchCV(
    XGBClassifier_model,
    param_grid_xgb_a,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    verbose=3,
    random_state=42,
)
random_xgh.fit(X_train, y_train)
print("Best parameters for XGBoost:", random_xgh.best_params_)
print("Best cross-validation score for XGBoost:", random_xgh.best_score_)