In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import optuna
import joblib

# Load data
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

# Handle missing values
for dataset in [train_df, test_df]:
    dataset['Age'] = dataset.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))

train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

train_df.drop(columns=['Cabin'], inplace=True)
test_df.drop(columns=['Cabin'], inplace=True)

# Encode categorical features
le = LabelEncoder()
for dataset in [train_df, test_df]:
    dataset['Sex'] = le.fit_transform(dataset['Sex'])
    dataset['Embarked'] = le.fit_transform(dataset['Embarked'])

# Feature Engineering
for dataset in [train_df, test_df]:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['IsAlone'] = np.where(dataset['FamilySize'] > 1, 0, 1)
    dataset['AgeBin'] = pd.cut(dataset['Age'], bins=[0, 12, 18, 50, 80], labels=[0, 1, 2, 3])
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4, labels=[0, 1, 2, 3])

train_df.drop(columns=['Name', 'Ticket', 'PassengerId'], inplace=True)
test_ids = test_df['PassengerId']
test_df.drop(columns=['Name', 'Ticket', 'PassengerId'], inplace=True)

# Split the data into features and target
X = train_df.drop(columns=['Survived'])
y = train_df['Survived']

# Standardize numerical features
scaler = StandardScaler()
X[['Age', 'Fare']] = scaler.fit_transform(X[['Age', 'Fare']])
test_df[['Age', 'Fare']] = scaler.transform(test_df[['Age', 'Fare']])

# Train-test split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42)

# Define the objective function for Optuna
def objective(trial):
    # Choose a model
    model_name = trial.suggest_categorical('model', ['LogisticRegression', 'RandomForest', 'GradientBoosting'])

    # Hyperparameters for each model
    if model_name == 'LogisticRegression':
        model = LogisticRegression(max_iter=1000)
        param = {
            'C': trial.suggest_loguniform('C', 1e-5, 1e5),
            'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear'])
        }
    elif model_name == 'RandomForest':
        model = RandomForestClassifier(random_state=42)
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'max_depth': trial.suggest_int('max_depth', 5, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4)
        }
    elif model_name == 'GradientBoosting':
        model = GradientBoostingClassifier(random_state=42)
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
            'max_depth': trial.suggest_int('max_depth', 3, 7),
            'subsample': trial.suggest_uniform('subsample', 0.7, 1.0)
        }
    
    # Set parameters and train model
    model.set_params(**param)
    model.fit(X_train, y_train)
    
    # Evaluate the model
    accuracy = accuracy_score(y_valid, model.predict(X_valid))
    
    return accuracy

# Create a study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Print the best parameters
print("Best parameters: ", study.best_params)

# Train the best model
best_model_name = study.best_params['model']
best_model = None

# Create the model based on the best model name
if best_model_name == 'LogisticRegression':
    best_model = LogisticRegression(max_iter=1000)
    best_model.set_params(
        C=study.best_params['C'], 
        solver=study.best_params['solver']
    )
elif best_model_name == 'RandomForest':
    best_model = RandomForestClassifier(random_state=42)
    best_model.set_params(
        n_estimators=study.best_params['n_estimators'],
        max_depth=study.best_params['max_depth'],
        min_samples_split=study.best_params['min_samples_split'],
        min_samples_leaf=study.best_params['min_samples_leaf']
    )
elif best_model_name == 'GradientBoosting':
    best_model = GradientBoostingClassifier(random_state=42)
    best_model.set_params(
        n_estimators=study.best_params['n_estimators'],
        learning_rate=study.best_params['learning_rate'],
        max_depth=study.best_params['max_depth'],
        subsample=study.best_params['subsample']
    )

# Train the best model
best_model.fit(X_train, y_train)

# Evaluate on validation set
y_valid_pred = best_model.predict(X_valid)
print("Validation Accuracy:", accuracy_score(y_valid, y_valid_pred))
print("Classification Report:\n", classification_report(y_valid, y_valid_pred))

# Predict on test set
y_test_pred = best_model.predict(test_df)

# Save predictions to a CSV file
submission = pd.DataFrame({"PassengerId": test_ids, "Survived": y_test_pred})
submission.to_csv("../results/titanic_submission.csv", index=False)

# Save the best model
joblib.dump(best_model, '../model/best_titanic_model.pkl')

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-12-18 15:22:44,136] A new study created in memory with name: no-name-34416dc4-012a-422b-a7d5-f38c9bcaad71
[I 2024-12-18 15:22:44,383] Trial 0 finished with value: 0.8071748878923767 and parameters: {'model': 'RandomForest', 'n_estimators': 256, 'max_depth': 7, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.8071748878923767.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.7, 1.0)
[I 2024-12-18 15:22:44,730] Trial 1 finished with value: 0.8161434977578476 and parameters: {'model': 'GradientBoosting', 'n_estimators': 172, 'learning_rate': 0.013994729535497262, 'max_depth': 6, 'subsample': 0.83127134038226}. Best is trial 1 with value: 0.8161434977578476.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.7, 1.0)
[I 2024-12-18 15:22:44,940] Trial 2 fin

Best parameters:  {'model': 'GradientBoosting', 'n_estimators': 161, 'learning_rate': 0.013583715227801349, 'max_depth': 6, 'subsample': 0.7945475301527058}
Validation Accuracy: 0.8430493273542601
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.92      0.88       134
           1       0.86      0.73      0.79        89

    accuracy                           0.84       223
   macro avg       0.85      0.82      0.83       223
weighted avg       0.84      0.84      0.84       223



['../model/best_titanic_model.pkl']