# Kaggle Titanic: 3 - Modelling and Predictions

## Imports

In [1]:
import pandas as pd
import numpy as np
import os
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import ks_2samp, gaussian_kde

import missingno as msno

from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, confusion_matrix,
                             classification_report, roc_auc_score,
                             roc_curve)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Add the "src" directory to the system path
project_root  = os.path.abspath('../')

if project_root not in sys.path:
    sys.path.append(project_root)

from src.utils import EnvironmentInfo, PathManager
from src.project_specific import run_dm_pipeline, plot_categorical_features_vs_target, plot_numerical_features_vs_target, chi2_categorical_features, chi2_all_categorical, analyze_correlation, cramers_v, chi_square_test_train, extract_most_represented_levels, one_hot_encode_drop_most_populated



## Working Environment

### Variables & Paths

In [None]:
# Resolve the project directories through the shared PathManager helper.
path_manager = PathManager(dataset_name="titanic")
print("\n" + str(path_manager))

# DATASET_PATH is where the raw CSV files are read from further down.
DATASET_PATH, WORKING_PATH = path_manager.dataset_path, path_manager.working_path

print(f"\nDATASET_PATH: {DATASET_PATH}")
print(f"WORKING_PATH: {WORKING_PATH}")

### Datasets

In [None]:
# Load the raw Kaggle CSV files from DATASET_PATH.
test = pd.read_csv(os.path.join(DATASET_PATH, "test.csv"))
train = pd.read_csv(os.path.join(DATASET_PATH, "train.csv"))

# Name -> frame lookup for both raw datasets.
datasets_dict = dict(train=train, test=test)

# Report each frame's memory footprint in MB; deep=True also counts the
# contents of object (string) columns.
BYTES_PER_MB = 1024**2
test_memory = test.memory_usage(deep=True).sum()
train_memory = train.memory_usage(deep=True).sum()
print(f"Test dataset memory usage: {test_memory / BYTES_PER_MB:.2f} MB")
print(f"Train dataset memory usage: {train_memory / BYTES_PER_MB:.2f} MB")

## Data Management

In [None]:
# Run the project's data-management pipeline on the raw frames.
# NOTE(review): the call passes (test, train) in that order — confirm it
# matches run_dm_pipeline's signature.
result = run_dm_pipeline(test, train)

# Pull the feature lists and the target name out of the pipeline result.
categorical_features, numerical_features, target = (
    result[key]
    for key in ("categorical_features", "numerical_features", "target")
)

In [None]:
# Keep only the modelling columns; the training frame additionally carries
# the target column.
feature_columns = categorical_features + numerical_features
test_df = result["test"][feature_columns]
train_df = result["train"][feature_columns + [target]]

## Modelling

### Logistic regression

In [None]:
# Build the modelling matrices in this cell so it runs on a fresh kernel:
# X_train_split / X_val_split / y_train_split / y_val_split are needed here
# but were otherwise only created later, in the Random Forest section.
# NOTE(review): `one_hot_enc_choose_cat` is not among the names imported from
# src.project_specific at the top of the notebook — confirm where it is defined.
encoded_train_df = one_hot_enc_choose_cat(train_df)

# Separate the features from the target
X_train = encoded_train_df.drop(columns=['Survived'])
y_train = encoded_train_df['Survived']

# Hold out 20% of the labelled rows for validation (fixed seed for reproducibility)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# fit
logreg = LogisticRegression(max_iter=500)
logreg.fit(X_train_split, y_train_split)

# predict
y_pred = logreg.predict(X_val_split)

# evaluate
print(f'Accuracy: {accuracy_score(y_val_split, y_pred)}')

print('Confusion Matrix:')
print(confusion_matrix(y_val_split, y_pred))

print('Classification Report:')
print(classification_report(y_val_split, y_pred))

#### Hyperparameter Tuning

- C (Inverse of Regularization Strength):
A smaller value of C applies stronger regularization, while a larger value reduces regularization.
The grid below searches powers of ten from 1e-4 to 1e2 (i.e. [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]) to explore a wide range of regularization strengths.

- penalty:
'l1' (Lasso) and 'l2' (Ridge) are commonly used penalties. By adding 'elasticnet', you can combine both L1 and L2 regularization (note: elasticnet requires the solver='saga').
'none' can be tested for a logistic regression without regularization.

- solver:
'liblinear': Good for small datasets and supports L1 and L2 penalties.
'saga': Supports both L1, L2, and elastic-net penalties, and is well-suited for large datasets.
'lbfgs': Supports L2 and is good for handling large datasets and multinomial classification.
'newton-cg': Works similarly to lbfgs and supports only the L2 penalty.

- class_weight:
Adding 'balanced' helps handle class imbalance by adjusting weights inversely proportional to class frequencies.

- max_iter:
Sometimes logistic regression models fail to converge with lower iteration limits. Testing a broader range of values ensures that your model is given enough iterations to converge.

In [None]:
# Search space for the logistic regression.
# Both listed solvers support the 'l1' and 'l2' penalties.
param_grid = dict(
    C=[10**exponent for exponent in range(-4, 3)],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
    # These values replace the max_iter set on the base estimator below
    max_iter=[5000, 10000, 50000],
)

# 5-fold cross-validated exhaustive search; error_score='raise' makes any
# failing parameter combination stop the search instead of being scored.
base_logreg = LogisticRegression(max_iter=50000)
grid_search = GridSearchCV(
    base_logreg,
    param_grid,
    cv=5,
    verbose=1,
    error_score='raise',
)
grid_search.fit(X_train_split, y_train_split)

In [None]:
# Show the parameter combination with the best cross-validated score.
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

In [None]:
# Refit with the selected hyperparameters on the training split only.
# X_train also contains the validation rows, so fitting on it would leak them
# into the evaluation below and inflate the reported metrics.
logreg_final = LogisticRegression(max_iter=5000, C=10, penalty='l2', solver='liblinear')
logreg_final.fit(X_train_split, y_train_split)

# Predict on the held-out validation split
y_pred_final = logreg_final.predict(X_val_split)

# evaluate
print(f'Accuracy: {accuracy_score(y_val_split, y_pred_final)}')

print('Confusion Matrix:')
print(confusion_matrix(y_val_split, y_pred_final))

print('Classification Report:')
print(classification_report(y_val_split, y_pred_final))

Interpretability

In [None]:
# First row of coef_: one coefficient per input feature
coefficients = logreg_final.coef_[0]

# One row per feature, ordered by signed coefficient: the most positive
# coefficient comes first and the most negative last (not an absolute-value sort).
feature_importance = (
    pd.DataFrame({'Feature': X_train.columns, 'Coefficient': coefficients})
    .sort_values(by='Coefficient', ascending=False)
)

print(feature_importance)

In [None]:
# Horizontal bar chart of the signed coefficients, one bar per feature
coef_fig, coef_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Coefficient', y='Feature', data=feature_importance, ax=coef_ax)
coef_ax.set_title('Logistic Regression Feature Importance (Coefficients)')
plt.show()

F1 score : 82 %  
Variables qui influent significativement sur la survie : Age, Pclass_3, Sexe_male, Title, SibSp, Parch.  
Les variables significatives sont différentes des features importantes pour la classification,
car la régression logistique capture bien les relations linéaires (les effets d'une variable quand les autres sont constantes), mais pas les relations complexes.

In [None]:
from sklearn.decomposition import PCA
import plotly.graph_objs as go

# Project the features onto their first two principal components so the
# decision boundary can be drawn in a plane.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_val_split_pca = pca.transform(X_val_split)

# Share of variance carried by each component, in percent (used in axis labels)
explained_variance = 100 * pca.explained_variance_ratio_

# Same hyperparameters as the final model, but fitted on the 2-D projection
logreg_final_pca = LogisticRegression(max_iter=5000, C=10, penalty='l2', solver='liblinear')
logreg_final_pca.fit(X_train_pca, y_train)

# Bounding box of the projected points, padded by 1 on every side
pc1, pc2 = X_train_pca[:, 0], X_train_pca[:, 1]
x_min, x_max = pc1.min() - 1, pc1.max() + 1
y_min, y_max = pc2.min() - 1, pc2.max() + 1

# Regular grid over the bounding box; the same axes feed the contour trace
GRID_STEP = 0.1
grid_x = np.arange(x_min, x_max, GRID_STEP)
grid_y = np.arange(y_min, y_max, GRID_STEP)
xx, yy = np.meshgrid(grid_x, grid_y)

# Predicted class at every grid node, reshaped back to the grid layout
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = logreg_final_pca.predict(grid_points).reshape(xx.shape)

# Filled contour showing the predicted class regions
contour = go.Contour(
    z=Z,
    x=grid_x,
    y=grid_y,
    colorscale='Viridis',
    opacity=0.7,
    showscale=False,
)

# Training points in PCA space, coloured by the true label
scatter = go.Scatter(
    x=pc1,
    y=pc2,
    mode='markers',
    marker={
        'color': y_train,
        'colorscale': 'Portland',
        'line': {'width': 1, 'color': 'black'},
        'size': 8,
    },
    text=[f'Survived: {s}' for s in y_train],
)

# Axis titles report how much variance each component explains
layout = go.Layout(
    title="Logistic Regression Decision Boundary (PCA Projection)",
    xaxis={'title': f"PCA Component 1 ({explained_variance[0]:.2f}% variance explained)"},
    yaxis={'title': f"PCA Component 2 ({explained_variance[1]:.2f}% variance explained)"},
    hovermode='closest',
    showlegend=False,
)

# Contour first so the markers are drawn on top of it
fig = go.Figure(data=[contour, scatter], layout=layout)
fig.show()


### Random Forest

Preprocessing

- Missing values : already dealt with
- One hot encoding
- feature scaling : not necessary
- Train-Test split

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# One-hot encode categorical features for train and test sets
encoded_train_df = one_hot_enc_choose_cat(train_df)
encoded_test_df = one_hot_enc_choose_cat(test_df)

# Split the target and features in the training data
X_train = encoded_train_df.drop(columns=['Survived'])
y_train = encoded_train_df['Survived']

X_test = encoded_test_df

# Split the training data into a train and validation set
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
# Baseline Random Forest with default hyperparameters and a fixed seed
rf_model = RandomForestClassifier(random_state=42)

# Train on the training split, then score the held-out validation rows
# (fit returns the fitted estimator, so the calls can be chained)
y_pred = rf_model.fit(X_train_split, y_train_split).predict(X_val_split)

In [None]:
# Validation metrics for the baseline Random Forest
val_accuracy = accuracy_score(y_val_split, y_pred)
print(f'Accuracy: {val_accuracy}')
print('Confusion Matrix:', confusion_matrix(y_val_split, y_pred), sep='\n')
print('Classification Report:', classification_report(y_val_split, y_pred), sep='\n')

Hyperparameter Tuning

In [None]:
# Random Forest search space
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Exhaustive 5-fold search using all available cores
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
).fit(X_train_split, y_train_split)

# Estimator refitted on the whole training split with the best parameters
best_rf_model = grid_search.best_estimator_

# Validation predictions from the tuned model
y_pred_best = best_rf_model.predict(X_val_split)

In [None]:
# Validation metrics for the tuned Random Forest
tuned_accuracy = accuracy_score(y_val_split, y_pred_best)
print(f'Accuracy after tuning: {tuned_accuracy}')

print('Confusion Matrix:')
print(confusion_matrix(y_val_split, y_pred_best))

print('Classification Report:')
print(classification_report(y_val_split, y_pred_best))

# Parameter combination selected by the grid search
best_hyperparameters = grid_search.best_params_
print(f'Best hyperparameters: {best_hyperparameters}')

Interpretability

Feature importance

In [None]:
# Importance scores exposed by the tuned forest, one value per feature
feature_importances = best_rf_model.feature_importances_

# Pair each score with its column name and rank from most to least important
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
)
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart, most important feature at the top
rf_fig, rf_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, ax=rf_ax)
rf_ax.set_title('Feature Importance in Random Forest Model')
plt.show()

### XGBoost

Preprocessing

- Missing values: already dealt with
- One hot encoding
- Feature scaling?
- Train-test split

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Rebuild the encoded frames for this section
encoded_train_df = one_hot_enc_choose_cat(train_df)
encoded_test_df = one_hot_enc_choose_cat(test_df)

# Features / target from the encoded training frame
TARGET_COLUMN = 'Survived'
X_train = encoded_train_df.drop(columns=[TARGET_COLUMN])
y_train = encoded_train_df[TARGET_COLUMN]

X_test = encoded_test_df

# 80/20 train/validation split (seeded)
split_parts = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train_split, X_val_split, y_train_split, y_val_split = split_parts

# Cast both target splits to int so the labels are numeric
y_train_split, y_val_split = y_train_split.astype(int), y_val_split.astype(int)


In [None]:
# Baseline gradient-boosted classifier: library defaults plus a fixed seed
xgb_model = xgb.XGBClassifier(random_state=42)

# Train on the training split only
xgb_model.fit(X_train_split, y_train_split)

# Class predictions for the held-out validation rows
y_pred = xgb_model.predict(X_val_split)

In [None]:
# Validation metrics for the baseline XGBoost model
xgb_accuracy = accuracy_score(y_val_split, y_pred)
xgb_confusion = confusion_matrix(y_val_split, y_pred)
xgb_report = classification_report(y_val_split, y_pred)

print(f'Accuracy: {xgb_accuracy}')
print('Confusion Matrix:')
print(xgb_confusion)
print('Classification Report:')
print(xgb_report)

In [None]:
from sklearn.model_selection import GridSearchCV

# XGBoost search space
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

# Exhaustive 5-fold search using all available cores
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_split, y_train_split)

# Estimator refitted with the best parameter combination
best_xgb_model = grid_search.best_estimator_

# Validation predictions from the tuned model
y_pred_best = best_xgb_model.predict(X_val_split)

# Validation metrics for the tuned model
print(f'Accuracy after tuning: {accuracy_score(y_val_split, y_pred_best)}')
print('Confusion Matrix:', confusion_matrix(y_val_split, y_pred_best), sep='\n')
print('Classification Report:', classification_report(y_val_split, y_pred_best), sep='\n')

# Parameter combination selected by the grid search
print(f'Best hyperparameters: {grid_search.best_params_}')


Interpretability

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Importance scores exposed by the tuned booster, one value per feature
feature_importance = best_xgb_model.feature_importances_

# Pair each score with its column name and rank from most to least important
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importance}
)
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart, most important feature at the top
xgb_fig, xgb_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, ax=xgb_ax)
xgb_ax.set_title('XGBoost Feature Importance')
plt.show()


### KNN

Preprocessing

- Missing values: already dealt with
- One hot encoding
- scaling
- Train-test split 

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# One-hot encode categorical features using the existing helper
encoded_train_df = one_hot_enc_choose_cat(train_df)
encoded_test_df = one_hot_enc_choose_cat(test_df)

# Standardise the numerical columns: the scaler is fitted on the training
# frame only and then reused, unchanged, on the test frame.
scaler = StandardScaler()
encoded_train_df[numerical_features] = scaler.fit_transform(encoded_train_df[numerical_features])
encoded_test_df[numerical_features] = scaler.transform(encoded_test_df[numerical_features])

# Split the target and features in the training data
X_train = encoded_train_df.drop(columns=['Survived'])
y_train = encoded_train_df['Survived'].astype(int)  # Ensure the target is an integer

# Plain float matrix of the scaled features. The following cells split
# `X_train_scaled` and index the result positionally (X_val_split[0]), which
# needs a NumPy array; the name was used there without ever being defined.
X_train_scaled = X_train.to_numpy(dtype=float)


In [None]:
# 80/20 train/validation split of the scaled features (seeded)
split_parts = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)
X_train_split, X_val_split, y_train_split, y_val_split = split_parts

# Baseline KNN classifier voting over the 5 nearest neighbours
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_split, y_train_split)

# Class predictions for the held-out validation rows
y_pred = knn_model.predict(X_val_split)

# Validation metrics for the baseline model
knn_accuracy = accuracy_score(y_val_split, y_pred)
print(f'Accuracy: {knn_accuracy}')
print('Confusion Matrix:', confusion_matrix(y_val_split, y_pred), sep='\n')
print('Classification Report:', classification_report(y_val_split, y_pred), sep='\n')

In [None]:
# KNN search space
param_grid = dict(
    n_neighbors=[3, 5, 7, 9, 11, 13],
    weights=['uniform', 'distance'],      # equal votes vs. distance-weighted votes
    metric=['euclidean', 'manhattan'],    # distance function between samples
)

# Exhaustive 5-fold search over a fresh classifier, using all available cores
knn_search_estimator = KNeighborsClassifier()
grid_search = GridSearchCV(
    estimator=knn_search_estimator,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_split, y_train_split)

# Estimator refitted with the best parameter combination
best_knn_model = grid_search.best_estimator_

# Validation predictions from the tuned model
y_pred_best = best_knn_model.predict(X_val_split)

# Validation metrics for the tuned model
tuned_knn_accuracy = accuracy_score(y_val_split, y_pred_best)
print(f'Accuracy after tuning: {tuned_knn_accuracy}')
print('Confusion Matrix:')
print(confusion_matrix(y_val_split, y_pred_best))
print('Classification Report:')
print(classification_report(y_val_split, y_pred_best))

# Parameter combination selected by the grid search
print(f'Best hyperparameters: {grid_search.best_params_}')

Interpretability

In [None]:
import lime
import lime.lime_tabular

# Tabular LIME explainer built from the training split; continuous features
# are discretised before the local surrogate model is fitted.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train_split,
    feature_names=X_train.columns,
    class_names=['Died', 'Survived'],
    discretize_continuous=True,
)

# First validation row, kept as a (1, n_features) array
first_val_row = X_val_split[0]
instance = first_val_row.reshape(1, -1)

# Explain the tuned KNN's predicted probabilities for that single row
exp = explainer.explain_instance(instance.flatten(), best_knn_model.predict_proba)
exp.show_in_notebook(show_all=False)

Améliorer le tuning, + écrire une synthèse de ce que fait chaque technique.

Comparer les performances de chaque modèle, et les feature importances.
Puis choisir le meilleur pour la soumission.

Voir SHAP values, ROC curves, + d'hyperparameter tuning



### Clustering ?  

Pour trouver plus d'informations sur les features qui séparent les groupes, ou sur la façon dont les groupes sont séparés.

