In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Load the cleaned training set from the sibling data directory.
train_csv_path = os.path.join(os.pardir, "data", "train_cleaned.csv")
train_df = pd.read_csv(train_csv_path)

# Target is the 'Survived' column; every remaining column is a feature.
y = train_df.loc[:, 'Survived']
X = train_df.drop('Survived', axis=1)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; stratifying on y keeps the
# class proportions the same in both splits, and the fixed seed makes
# the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

# Element-wise comparison of the column labels of the two feature splits
# (displayed as the cell output).
X_train.columns == X_valid.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

In [4]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [5]:
# Define the three base models that are later combined into the voting
# ensemble. All of them use random_state=42 so repeated runs give the
# same fitted models.

# Linear baseline; max_iter is raised to 1000 iterations.
logreg = LogisticRegression(max_iter=1000, random_state=42)

# Random forest with explicitly pinned hyperparameters.
rf = RandomForestClassifier(
    n_estimators=497,
    max_depth=20,
    max_features='sqrt',
    min_samples_split=8,
    min_samples_leaf=3,
    bootstrap=True,
    random_state=42
)

# Gradient-boosted trees. `use_label_encoder` is intentionally not passed:
# current XGBoost releases no longer accept it and emit
# 'Parameters: { "use_label_encoder" } are not used.' when it is supplied.
xgb = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    eval_metric='logloss',
    random_state=42
)

In [6]:
# Combine the base models into a single voting classifier.
# Each entry is a (name, estimator) pair.
base_estimators = [('logreg', logreg), ('rf', rf), ('xgb', xgb)]

# voting='soft' averages the predicted class probabilities of the members.
ensemble = VotingClassifier(estimators=base_estimators, voting='soft')

In [7]:
# Fit the voting ensemble on the training split only; the validation
# split is kept aside for the evaluation cell.
ensemble.fit(X=X_train, y=y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [8]:
# Score the fitted ensemble on the held-out validation split.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = ensemble.predict(X_valid)

# Each metric takes (y_true, y_pred); compute and print them one after
# another under their own heading.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_valid, y_pred))

Accuracy: 0.8156424581005587
Confusion Matrix:
 [[97 13]
 [20 49]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.88      0.85       110
           1       0.79      0.71      0.75        69

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.82      0.81       179



In [9]:
# Predict on the test data and write the submission file.
# test_cleaned.csv supplies the model features; test.csv is read only for
# its PassengerId column.
# NOTE(review): assumes both files list passengers in the same row order — confirm.
test_cleaned = pd.read_csv(os.path.join(os.pardir, "data", "test_cleaned.csv"))
test_df = pd.read_csv(os.path.join(os.pardir, "data", "test.csv"))

# Feed the model exactly the columns it was trained on, in training order,
# so extra or reordered columns in the test file cannot break or skew the
# prediction. A column missing from the test file raises KeyError here.
test_preds_en = ensemble.predict(test_cleaned[X_train.columns])

submission_en = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': test_preds_en
})

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (no-op when it is already there).
submission_dir = os.path.join(os.pardir, "models", "ensemble")
os.makedirs(submission_dir, exist_ok=True)
submission_en.to_csv(os.path.join(submission_dir, "submission_en.csv"), index=False)