In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings('ignore')
import os

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_encoded = pd.read_csv('df_encoded.csv')

In [3]:
## Handle Class Imbalance 
df_encoded['survived'].value_counts(normalize=True)

from imblearn.over_sampling import SMOTE

X = df_encoded.drop('survived', axis=1)
y = df_encoded['survived']

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_res)  # or X if no SMOTE

# Optionally convert back to DataFrame
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_res, test_size=0.2, random_state=42, stratify=y_res)


In [5]:
print("Shape of X_train :",X_train.shape)
print("Shape of X_test:",X_test.shape)
print("Shape of y_train:",y_train.shape)
print("Shape of y_test:",y_test.shape)

Shape of X_train : (1110393, 14)
Shape of X_test: (277599, 14)
Shape of y_train: (1110393,)
Shape of y_test: (277599,)


In [7]:
# Create a dictionary to hold models and parameter grids
models_and_parameters = {
    "Logistic Regression": {
        "model": LogisticRegression(),
        "params": {
            'C': [0.01, 0.1, 1, 10],
            'solver': ['liblinear', 'lbfgs'],
            'class_weight': ['balanced', None]
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5],
            'class_weight': ['balanced']
        }
    },
    "SVM": {
        "model": SVC(probability=True),
        "params": {
            'C': [0.1, 1, 10],
            'kernel': ['rbf', 'linear'],
            'class_weight': ['balanced', None]
        }
    },
    "XGBoost": {
        "model": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        "params": {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5],
            'scale_pos_weight': [1, 3]
        }
    }
}

In [8]:
models_and_parameters

{'Logistic Regression': {'model': LogisticRegression(),
  'params': {'C': [0.01, 0.1, 1, 10],
   'solver': ['liblinear', 'lbfgs'],
   'class_weight': ['balanced', None]}},
 'Random Forest': {'model': RandomForestClassifier(random_state=42),
  'params': {'n_estimators': [100, 200],
   'max_depth': [None, 10, 20],
   'min_samples_split': [2, 5],
   'class_weight': ['balanced']}},
 'SVM': {'model': SVC(probability=True),
  'params': {'C': [0.1, 1, 10],
   'kernel': ['rbf', 'linear'],
   'class_weight': ['balanced', None]}},
 'XGBoost': {'model': XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=None, device=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric='logloss',
                feature_types=None, feature_weights=None, gamma=None,
                grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_r

In [None]:
# Set up cross-validation
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)  # n=2 because dataset is small

# To store results
results = {}

# Perform grid search for each model
for model_name, mp in models_and_parameters.items():
    print(f"\n🔍 Training and tuning {model_name}...")
    grid = GridSearchCV(mp["model"], mp["params"], cv=cv, scoring='f1', n_jobs=-1)
    grid.fit(X_train, y_train.values.ravel())
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1] if hasattr(best_model, 'predict_proba') else None

    results[model_name] = {
        "best_params": grid.best_params_,
        "classification_report": classification_report(y_test, y_pred, output_dict=True),
        "confusion_matrix": confusion_matrix(y_test, y_pred).tolist(),
        "roc_auc": roc_auc_score(y_test, y_proba) if y_proba is not None and len(set(y_test)) > 1 else None
    }

results



🔍 Training and tuning Logistic Regression...

🔍 Training and tuning Random Forest...
