# Kaggle Titanic: 2 - Data Analysis

## Imports

In [1]:
import pandas as pd
import numpy as np
import os
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import ks_2samp, gaussian_kde

import missingno as msno

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, confusion_matrix,
                             classification_report, roc_auc_score,
                             roc_curve)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Add the "src" directory to the system path
project_root  = os.path.abspath('../')

if project_root not in sys.path:
    sys.path.append(project_root)

from src.utils import EnvironmentInfo, PathManager
from src.project_specific import create_data_dictionnary_df, DataOverview, run_dm_pipeline


## Working Environment

### Variables & Paths

In [2]:
# Resolve the dataset and working directories for the "titanic" dataset.
path_manager = PathManager(dataset_name="titanic")
print(f"\n{path_manager!s}")

# Expose both locations as notebook-level constants.
DATASET_PATH, WORKING_PATH = path_manager.dataset_path, path_manager.working_path

print(f"\nDATASET_PATH: {DATASET_PATH}\nWORKING_PATH: {WORKING_PATH}")

Running on local.
Dataset directory: c:\Users\Florent\Documents\data_science\kaggle-titanic\data
Working directory: c:\Users\Florent\Documents\data_science\kaggle-titanic\data\working

Dataset Path: c:\Users\Florent\Documents\data_science\kaggle-titanic\data
Working Path: c:\Users\Florent\Documents\data_science\kaggle-titanic\data\working

DATASET_PATH: c:\Users\Florent\Documents\data_science\kaggle-titanic\data
WORKING_PATH: c:\Users\Florent\Documents\data_science\kaggle-titanic\data\working


### Datasets

In [3]:
# Read the raw CSV files (test first, then train) from the dataset directory.
test, train = (
    pd.read_csv(os.path.join(DATASET_PATH, f"{split_name}.csv"))
    for split_name in ("test", "train")
)

datasets_dict = {"train": train, "test": test}

# Deep memory footprint in bytes (includes the contents of object columns).
test_memory = test.memory_usage(deep=True).sum()
train_memory = train.memory_usage(deep=True).sum()

bytes_per_mb = 1024**2
print(f"Test dataset memory usage: {test_memory / bytes_per_mb:.2f} MB")
print(f"Train dataset memory usage: {train_memory / bytes_per_mb:.2f} MB")

Test dataset memory usage: 0.13 MB
Train dataset memory usage: 0.28 MB


## Data Management

In [4]:
# Run the project's data-management pipeline and unpack what it returns.
result = run_dm_pipeline(test, train)

test_processed, train_processed = result["test"], result["train"]
categorical_features, numerical_features, target = (
    result[key] for key in ("categorical_features", "numerical_features", "target")
)

Target Variable: Survived
Target Variable 'Survived' not found in DataFrame test.
'Embarked' is S for passenger at index 152. Imputing 'Fare' with median fare 8.05 for 'Pclass' 3 and 'Embarked' S.
Remaining missing 'Fare' values in test dataset: 0
Remaining missing 'Embarked' values in train dataset: 0
Remaining missing 'Age' values in the combined dataset: 0

Categorical_features: ['Pclass', 'Sex', 'Embarked', 'Simplified_Title', 'Deck', 'Is_Alone']
Numerical features: ['PassengerId', 'Age', 'Fare', 'Family_Size']
target: Survived


## Bivariate Analysis vs Target

In [None]:
# Bivariate analysis: relationship of every feature with the target
print("\n=== Bivariate Analysis ===")

# Categorical features: count of each target class per level,
# with the levels ordered from most to least frequent.
for feature in categorical_features:
    level_order = train[feature].value_counts().index
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(data=train, x=feature, hue=target, order=level_order, ax=ax)
    ax.set_title(f'{feature} vs {target}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    ax.legend(title=target)
    plt.show()

# Numerical features: distribution of the feature within each target class.
for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=target, y=feature, data=train, ax=ax)
    ax.set_title(f'{feature} vs {target}')
    ax.set_xlabel(target)
    ax.set_ylabel(feature)
    plt.show()

#### Outlier Detection

In [None]:
# Outlier detection: one univariate boxplot per numerical feature
print("\n=== Outlier Detection ===")
for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=train[feature], ax=ax)
    ax.set_title(f'Boxplot of {feature}')
    ax.set_xlabel(feature)
    plt.show()

#### Correlation Analysis
##### Numerical variables

In [None]:
# Correlation analysis restricted to the declared numerical features
print("\n=== Correlation Analysis ===")
corr = train[numerical_features].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

##### Categorical variables

In [None]:
from scipy.stats import chi2_contingency

# The chi-square test of independence must be fed observed counts.
# Running it on row percentages (as this cell previously did) discards the
# sample size and makes the statistic and p-value meaningless.
pclass_sex_counts = pd.crosstab(train['Pclass'], train['Sex'])
chi2, p, dof, expected = chi2_contingency(pclass_sex_counts)

# Row percentages are kept for display only: share of each sex within a class.
pclass_sex_ct = pd.crosstab(train['Pclass'], train['Sex'], normalize='index') * 100
print(pclass_sex_ct)
print("Chi-square statistic:", chi2)
print("p-value:", p)


In [None]:
# Pclass x Deck contingency table (raw counts) and chi-square test of independence
pclass_deck_ct = pd.crosstab(index=train['Pclass'], columns=train['Deck'])
chi2, p, dof, expected = chi2_contingency(pclass_deck_ct)

print(pclass_deck_ct)
print(f"Chi-square statistic: {chi2}\np-value: {p}")

In [None]:
# Pclass x Embarked contingency table (raw counts) and chi-square test of independence
pclass_embarked_ct = pd.crosstab(index=train['Pclass'], columns=train['Embarked'])
chi2, p, dof, expected = chi2_contingency(pclass_embarked_ct)

print(pclass_embarked_ct)
print(f"Chi-square statistic: {chi2}\np-value: {p}")

In [None]:
# Deck x Embarked contingency table (raw counts) and chi-square test of independence
deck_embarked_ct = pd.crosstab(index=train['Deck'], columns=train['Embarked'])
chi2, p, dof, expected = chi2_contingency(deck_embarked_ct)

print(deck_embarked_ct)
print(f"Chi-square statistic: {chi2}\np-value: {p}")

- It seems that 3rd class passengers are more likely to have embarked at Southampton, and 1st class passengers are more likely to have embarked at Cherbourg.
- There appears to be a link between Pclass and Deck, but Deck may have too many categories (and therefore sparse cells) for the chi-square test to be reliable.

###### One hot encoding

In [211]:
# One-hot encode the categorical variables, dropping the first level of each
ohe_columns = ['Pclass', 'Sex', 'Embarked', 'Simplified_Title', 'Deck']
train_df, test_df = (
    pd.get_dummies(frame, columns=ohe_columns, drop_first=True)
    for frame in (train, test)
)

###### Correlations between the levels of the categorical variables

In [None]:
# Dummy-encode the categorical variables (first level of each one dropped).
# NOTE(review): without `columns=`, get_dummies only encodes object/string/category
# columns, so an integer-typed Pclass would stay a single numeric column — confirm dtype.
categorical_levels = train[['Pclass', 'Sex', 'Embarked', 'Simplified_Title', 'Deck']]
encoded_df = pd.get_dummies(categorical_levels, drop_first=True)

# Pairwise correlations between the encoded columns
correlation_matrix = encoded_df.corr()

print(correlation_matrix)

In [None]:
# Annotated heatmap of the dummy-variable correlation matrix
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
plt.show()

In [None]:
# Keep only strong correlations (absolute value of at least 0.6) between
# two *different* encoded columns.
# Every column correlates perfectly with itself, so the diagonal must be
# excluded: otherwise each row/column keeps its 1.0 entry and the dropna
# calls below never remove anything.
off_diagonal = ~np.eye(len(correlation_matrix), dtype=bool)
strong_mask = (correlation_matrix.abs() >= 0.6) & off_diagonal

# Drop rows and columns left entirely NaN (no strong correlation with any
# other column). Chained calls return a new frame instead of mutating a
# derived one in place.
filtered_corr_matrix = (
    correlation_matrix.where(strong_mask)
    .dropna(how='all', axis=0)
    .dropna(how='all', axis=1)
)

# Print the filtered correlation matrix
print(filtered_corr_matrix)

# Visualize the filtered correlation matrix, unless nothing passed the filter
if filtered_corr_matrix.empty:
    print("No pair of encoded columns has an absolute correlation of at least 0.6.")
else:
    plt.figure(figsize=(12, 8))
    sns.heatmap(filtered_corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
    plt.show()

#### Train vs Test

In [None]:
# Summary statistics of the numerical features, train vs test.
# Note: this rebinds `numerical_features`, replacing the list returned by the
# data-management pipeline for every cell that runs afterwards.
numerical_features = ['Age', 'Fare', 'Parch', 'SibSp']

train_summary, test_summary = (
    frame[numerical_features].describe() for frame in (train, test)
)

print(
    "Train Set Summary Statistics:",
    train_summary,
    "\nTest Set Summary Statistics:",
    test_summary,
    sep="\n",
)

In [None]:
from scipy.stats import ks_2samp

# Two-sample Kolmogorov-Smirnov test per numerical feature (missing values excluded)
for feature in numerical_features:
    train_values = train[feature].dropna()
    test_values = test[feature].dropna()
    ks_stat, p_value = ks_2samp(train_values, test_values)
    print("{} - KS Statistic: {}, P-value: {}".format(feature, ks_stat, p_value))

In [None]:
# Chi-square test comparing the level frequencies of one categorical feature
# between two data sets
def chi_square_test(train, test, feature):
    """Test whether `feature` has the same level distribution in `train` and `test`.

    Builds a 2-row contingency table (one row per data set, one column per
    level; levels absent from a data set count as 0) and runs
    `chi2_contingency` on it.

    Parameters:
        train, test: DataFrames that both contain the column `feature`.
        feature: name of the categorical column to compare.

    Returns:
        (chi2, p): the chi-square statistic and its p-value.
    """
    per_dataset_counts = [frame[feature].value_counts() for frame in (train, test)]

    # One row per data set; fillna(0) covers levels seen in only one of them.
    table = pd.DataFrame(per_dataset_counts, index=['Train', 'Test']).fillna(0)

    test_result = chi2_contingency(table)
    return test_result[0], test_result[1]

# Train-vs-test chi-square test for every categorical feature
for feature in categorical_features:
    chi2, p = chi_square_test(train=train, test=test, feature=feature)
    print("{} - Chi-Square: {}, P-value: {}".format(feature, chi2, p))

- Pclass and Embarked distributions are not the same between Test and Train.

#### Datasets for analysis

- numerical features:  ['PassengerId', 'Age', 'SibSp', 'Parch', 'Fare'] 
- categorical features:  ['Pclass', 'Simplified_Title', 'Sex', 'Deck', 'Embarked'] 
- target:  Survived

In [None]:
# Build the modelling frames from the selected feature columns.
# Explicit copies are taken because later cells assign into these frames
# (imputation); assigning into a bare column selection of `train`/`test` can
# raise SettingWithCopyWarning and leaves it unclear which object is modified.
model_columns = numerical_features + categorical_features
train_df = train[model_columns + ['Survived']].copy()
test_df = test[model_columns].copy()

print(train_df.head())
print(test_df.head())

##### One hot encoding

We can choose the most represented category as the level to drop.

In [None]:
# Level frequencies of each categorical variable, used to pick the level to drop
for column in ('Pclass', 'Embarked', 'Sex', 'Simplified_Title', 'Deck'):
    print(train[column].value_counts())

In [220]:
def one_hot_enc_choose_cat(
    df,
    columns=('Pclass', 'Sex', 'Embarked', 'Simplified_Title', 'Deck'),
    reference_levels=('Pclass_3', 'Sex_male', 'Simplified_Title_Commoner',
                      'Deck_Unknown', 'Embarked_S'),
):
    """One-hot encode categorical features and drop a chosen reference level of each.

    Unlike `drop_first=True`, the dropped level is picked explicitly (by
    default, the levels this notebook treats as the most represented ones).

    Parameters:
        df: DataFrame containing every column listed in `columns`.
        columns: names of the categorical columns to encode.
        reference_levels: names of the dummy columns (``<column>_<level>``)
            to remove after encoding.

    Returns:
        A new DataFrame in which `columns` are replaced by their dummy
        columns, minus `reference_levels`. `df` itself is not modified.

    Raises:
        KeyError: if a requested reference level is not among the encoded
            columns (for example when that level never occurs in `df`).
    """
    encoded_df = pd.get_dummies(df, columns=list(columns), drop_first=False)

    # Fail with an explicit message rather than pandas' generic "not found in axis".
    missing = [level for level in reference_levels if level not in encoded_df.columns]
    if missing:
        raise KeyError(f"Reference level column(s) not found after encoding: {missing}")

    return encoded_df.drop(columns=list(reference_levels))

In [221]:
# Encode train, then test, with the explicit-reference-level encoder
encoded_train_df, encoded_test_df = (
    one_hot_enc_choose_cat(frame) for frame in (train_df, test_df)
)

## Analyses

### Logistic Regression

In [222]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

#### Preprocessing

- Missing values
- Encoding categorical features
- Scaling numerical features
- Train-Test split for validation

##### Missing Values

In [223]:
# Learn every imputation value on the training set only and reuse it for the
# test set, so both sets are filled with the same values (consistent with the
# scaler below, which is also fitted on the training data only).
imputation_values = {
    'Age': train_df['Age'].median(),
    'Fare': train_df['Fare'].median(),
    'Embarked': train_df['Embarked'].mode()[0],
}

# fillna returns new frames, which avoids assigning into frames derived from
# `train` / `test` (SettingWithCopyWarning) and keeps this cell re-runnable.
train_df = train_df.fillna(value=imputation_values)
test_df = test_df.fillna(value=imputation_values)

In [None]:
# Remaining missing values per column, for each data set
for label, frame in (("train", train_df), ("test", test_df)):
    print(f"{label}:\n", frame.isna().sum(), '\n')

##### Encoding categorical features

In [225]:
# One-hot encoding.
# The training set defines the feature space: the first level of each
# variable is dropped there.
dummy_columns = ['Pclass', 'Sex', 'Embarked', 'Simplified_Title', 'Deck']
encoded_train_df = pd.get_dummies(train_df, columns=dummy_columns, drop_first=True)

# The test set is encoded with all its levels and then aligned to the training
# columns. Encoding it independently with drop_first=True could drop a
# different level or produce a different set of columns whenever a level is
# missing from one of the two sets.
feature_columns = [col for col in encoded_train_df.columns if col != 'Survived']
encoded_test_df = (
    pd.get_dummies(test_df, columns=dummy_columns, drop_first=False)
    .reindex(columns=feature_columns, fill_value=0)
)

#####  Scaling numerical features

In [226]:
# Fit the scaler on the training data only, then apply it to both sets.
scaler = StandardScaler().fit(encoded_train_df[numerical_features])

encoded_train_df[numerical_features] = scaler.transform(encoded_train_df[numerical_features])
encoded_test_df[numerical_features] = scaler.transform(encoded_test_df[numerical_features])

##### Train-Test split

In [227]:
# Separate the target from the features
y_train = encoded_train_df['Survived']
X_train = encoded_train_df.drop(columns=['Survived'])

# Hold out 30% of the training data for validation (fixed seed)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=42,
)

#### Modeling

In [None]:
# Fit the baseline model on the training split
logreg = LogisticRegression(max_iter=500).fit(X_train_split, y_train_split)

# Predict the held-out validation split
y_pred = logreg.predict(X_val_split)

# Report accuracy, confusion matrix and per-class metrics
print(
    f'Accuracy: {accuracy_score(y_val_split, y_pred)}',
    'Confusion Matrix:',
    confusion_matrix(y_val_split, y_pred),
    'Classification Report:',
    classification_report(y_val_split, y_pred),
    sep='\n',
)

##### Hyperparameter Tuning

- C (Inverse of Regularization Strength):
A smaller value of C applies stronger regularization, while a larger value reduces regularization.
The grid below searches powers of ten from 10⁻⁴ to 10² in order to explore a wide range of regularization strengths.

- penalty:
'l1' (Lasso) and 'l2' (Ridge) are commonly used penalties. By adding 'elasticnet', you can combine both L1 and L2 regularization (note: elasticnet requires the solver='saga').
'none' can be tested for a logistic regression without regularization.

- solver:
'liblinear': Good for small datasets and supports L1 and L2 penalties.
'saga': Supports both L1, L2, and elastic-net penalties, and is well-suited for large datasets.
'lbfgs': Supports L2 and is good for handling large datasets and multinomial classification.
'newton-cg': Works similarly to lbfgs and supports only the L2 penalty.

- class_weight:
Adding 'balanced' helps handle class imbalance by adjusting weights inversely proportional to class frequencies.

- max_iter:
Sometimes logistic regression models fail to converge with lower iteration limits. Testing a broader range of values ensures that your model is given enough iterations to converge.

In [None]:
# Search space: regularization strength, penalty type, solver and iteration cap.
# Both solvers listed here support the 'l1' and 'l2' penalties.
param_grid = dict(
    C=[10**(exponent) for exponent in range(-4, 3)],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
    max_iter=[5000, 10000, 50000],
)

# 5-fold cross-validated grid search; error_score='raise' surfaces any failing fit.
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=50000),
    param_grid=param_grid,
    cv=5,
    verbose=1,
    error_score='raise',
)
grid_search.fit(X_train_split, y_train_split)

In [None]:
# Show the hyperparameter combination selected by the grid search
print(f'Best Parameters: {grid_search.best_params_}')

In [None]:
# Final model with fixed hyperparameters.
# NOTE(review): these values are presumably the grid-search winners — confirm
# against `grid_search.best_params_`.
logreg_final = LogisticRegression(max_iter=5000, C=10, penalty='l2', solver='liblinear')

# Fit on the training split only. Fitting on the full `X_train` (which
# contains the validation rows) and then scoring on `X_val_split` would leak
# the validation data into the model and inflate every metric below.
logreg_final.fit(X_train_split, y_train_split)

# Predict on the validation set
y_pred_final = logreg_final.predict(X_val_split)

# evaluate
print(f'Accuracy: {accuracy_score(y_val_split, y_pred_final)}')

print('Confusion Matrix:')
print(confusion_matrix(y_val_split, y_pred_final))

print('Classification Report:')
print(classification_report(y_val_split, y_pred_final))

Interpretability

In [None]:
# Coefficients of the fitted model (single row for the binary problem)
coefficients = logreg_final.coef_[0]

# One row per feature, ordered from the largest to the smallest *signed*
# coefficient (not by absolute value).
feature_importance = (
    pd.DataFrame({'Feature': X_train.columns, 'Coefficient': coefficients})
    .sort_values(by='Coefficient', ascending=False)
)

print(feature_importance)

In [None]:
# Horizontal bar chart of the logistic-regression coefficients
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Coefficient', y='Feature', data=feature_importance, ax=ax)
ax.set_title('Logistic Regression Feature Importance (Coefficients)')
plt.show()

P-values  
scikit-learn does not report p-values for its logistic regression, so we use statsmodels to obtain them.

In [234]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np

In [None]:
# Convert every column to a numeric dtype that statsmodels accepts.
# Cast to float, not int: X_train already holds standardized numerical
# features, and astype(int) would truncate them (e.g. 0.7 -> 0), destroying
# most of their information. Boolean dummies become 0.0 / 1.0.
X_train = X_train.astype(float)
y_train = y_train.astype(int)

# Standardize all features (dummies included)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Rebuild a DataFrame with the original column names and row index, so the
# summary is readable and the rows stay aligned with y_train
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)

# Add a constant for the intercept
X_train_scaled_df = sm.add_constant(X_train_scaled_df)

# Fit the logistic regression model using the DataFrame with named columns.
# Stored under its own name so the data-management pipeline's `result`
# dictionary is not overwritten.
logit_model = sm.Logit(y_train, X_train_scaled_df)
logit_result = logit_model.fit(method='bfgs', maxiter=50000)

# Print the summary (includes p-values, coefficients, etc.)
print(logit_result.summary())

f1 score : 82%  
Variables qui influent significativement sur la survie : Age, Pclass_3, Sex_male, Title, SibSp, Parch.  
Les variables significatives sont différentes des features importantes pour la classification,
car la régression logistique capture bien les relations linéaires (l'effet d'une variable quand les autres sont constantes), mais pas les relations complexes.

In [None]:
from sklearn.decomposition import PCA
import plotly.graph_objs as go

# Project the feature matrix onto its first two principal components
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_val_split_pca = pca.transform(X_val_split)

# Share of the total variance carried by each component, in percent
explained_variance = 100 * pca.explained_variance_ratio_

# Logistic regression trained directly in the 2-D PCA space
logreg_final_pca = LogisticRegression(max_iter=5000, C=10, penalty='l2', solver='liblinear')
logreg_final_pca.fit(X_train_pca, y_train)

# Regular grid (step 0.1) covering the projected points with a margin of 1
pc1, pc2 = X_train_pca[:, 0], X_train_pca[:, 1]
x_min, x_max = pc1.min() - 1, pc1.max() + 1
y_min, y_max = pc2.min() - 1, pc2.max() + 1
grid_x = np.arange(x_min, x_max, 0.1)
grid_y = np.arange(y_min, y_max, 0.1)
xx, yy = np.meshgrid(grid_x, grid_y)

# Predicted class at every grid node, reshaped back to the grid layout
grid_points = np.column_stack([xx.ravel(), yy.ravel()])
Z = logreg_final_pca.predict(grid_points).reshape(xx.shape)

# Filled contour showing the predicted class regions
contour = go.Contour(
    z=Z,
    x=grid_x,
    y=grid_y,
    colorscale='Viridis',
    opacity=0.7,
    showscale=False,
)

# Training points in PCA space, coloured by the true label
marker_style = dict(
    color=y_train,
    colorscale='Portland',
    line=dict(width=1, color='black'),
    size=8,
)
scatter = go.Scatter(
    x=pc1,
    y=pc2,
    mode='markers',
    marker=marker_style,
    text=[f'Survived: {s}' for s in y_train],
)

# Axis titles report the variance explained by each component
layout = go.Layout(
    title="Logistic Regression Decision Boundary (PCA Projection)",
    xaxis=dict(title=f"PCA Component 1 ({explained_variance[0]:.2f}% variance explained)"),
    yaxis=dict(title=f"PCA Component 2 ({explained_variance[1]:.2f}% variance explained)"),
    hovermode='closest',
    showlegend=False,
)

# Contour first, points drawn on top of it
fig = go.Figure(data=[contour, scatter], layout=layout)
fig.show()


### Random Forest

Preprocessing

- Missing values : already dealt with
- One hot encoding
- feature scaling : not necessary
- Train-Test split

In [237]:
from sklearn.ensemble import RandomForestClassifier

In [238]:
# One-hot encode categorical features for train and test sets
encoded_train_df = one_hot_enc_choose_cat(train_df)
encoded_test_df = one_hot_enc_choose_cat(test_df)

# Split the target and features in the training data
X_train = encoded_train_df.drop(columns=['Survived'])
y_train = encoded_train_df['Survived']

# Align the test features with the training columns (same set, same order).
# The two frames are encoded independently, so a level present in only one of
# them would otherwise give the model a different feature layout at
# prediction time. Dummy columns missing from the test set are filled with 0.
X_test = encoded_test_df.reindex(columns=X_train.columns, fill_value=0)

# Split the training data into a train and validation set
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

Modelling

In [239]:
# Random Forest with default hyperparameters (fixed seed), trained on the training split
rf_model = RandomForestClassifier(random_state=42).fit(X_train_split, y_train_split)

# Validation-set predictions
y_pred = rf_model.predict(X_val_split)

In [None]:
# Validation metrics of the baseline Random Forest
print(
    f'Accuracy: {accuracy_score(y_val_split, y_pred)}',
    'Confusion Matrix:',
    confusion_matrix(y_val_split, y_pred),
    'Classification Report:',
    classification_report(y_val_split, y_pred),
    sep='\n',
)

Hyperparameter Tuning

In [None]:
# Random Forest search space
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# 5-fold cross-validated grid search, parallelised over all cores
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
).fit(X_train_split, y_train_split)

# Best estimator found by the search, and its validation-set predictions
best_rf_model = grid_search.best_estimator_
y_pred_best = best_rf_model.predict(X_val_split)

In [None]:
# Validation metrics of the tuned Random Forest, followed by the selected hyperparameters
print(
    f'Accuracy after tuning: {accuracy_score(y_val_split, y_pred_best)}',
    'Confusion Matrix:',
    confusion_matrix(y_val_split, y_pred_best),
    'Classification Report:',
    classification_report(y_val_split, y_pred_best),
    f'Best hyperparameters: {grid_search.best_params_}',
    sep='\n',
)

Interpretability

Feature importance

In [None]:
# Importances reported by the tuned Random Forest
feature_importances = best_rf_model.feature_importances_

# One row per feature, most important first
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
)
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the importances
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, ax=ax)
ax.set_title('Feature Importance in Random Forest Model')
plt.show()

### XGBoost

Preprocessing

- Missing values: already dealt with
- One hot encoding
- Feature scaling?
- Train-test split

In [244]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [245]:
# One-hot encode categorical features for train and test sets
encoded_train_df = one_hot_enc_choose_cat(train_df)
encoded_test_df = one_hot_enc_choose_cat(test_df)

# Split the target and features in the training data
X_train = encoded_train_df.drop(columns=['Survived'])
y_train = encoded_train_df['Survived']

# Align the test features with the training columns (same set, same order).
# The two frames are encoded independently, so a level present in only one of
# them would otherwise give the model a different feature layout at
# prediction time. Dummy columns missing from the test set are filled with 0.
X_test = encoded_test_df.reindex(columns=X_train.columns, fill_value=0)

# Split the training data into a train and validation set
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Ensure the target variable is numeric (integer)
y_train_split = y_train_split.astype(int)
y_val_split = y_val_split.astype(int)


In [246]:
# XGBoost classifier with default hyperparameters (fixed seed)
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train_split, y_train_split)

# Validation-set predictions
y_pred = xgb_model.predict(X_val_split)

In [None]:
# Validation metrics of the baseline XGBoost model
print(
    f'Accuracy: {accuracy_score(y_val_split, y_pred)}',
    'Confusion Matrix:',
    confusion_matrix(y_val_split, y_pred),
    'Classification Report:',
    classification_report(y_val_split, y_pred),
    sep='\n',
)

In [None]:
from sklearn.model_selection import GridSearchCV

# XGBoost search space
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

# 5-fold cross-validated grid search, parallelised over all cores
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
).fit(X_train_split, y_train_split)

# Best estimator found by the search, and its validation-set predictions
best_xgb_model = grid_search.best_estimator_
y_pred_best = best_xgb_model.predict(X_val_split)

# Validation metrics of the tuned model, followed by the selected hyperparameters
print(
    f'Accuracy after tuning: {accuracy_score(y_val_split, y_pred_best)}',
    'Confusion Matrix:',
    confusion_matrix(y_val_split, y_pred_best),
    'Classification Report:',
    classification_report(y_val_split, y_pred_best),
    f'Best hyperparameters: {grid_search.best_params_}',
    sep='\n',
)


Interpretability

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Importances reported by the tuned XGBoost model
# (rebinds `feature_importance`, previously the logistic-regression coefficient table)
feature_importance = best_xgb_model.feature_importances_

# One row per feature, most important first
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importance}
)
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the importances
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, ax=ax)
ax.set_title('XGBoost Feature Importance')
plt.show()


### KNN

Preprocessing

- Missing values: already dealt with
- One hot encoding
- scaling
- Train-test split 

In [250]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [251]:
# Encode train, then test, with the explicit-reference-level encoder
encoded_train_df, encoded_test_df = (
    one_hot_enc_choose_cat(frame) for frame in (train_df, test_df)
)

# Standardize the numerical features: fit on train only, apply to both sets
scaler = StandardScaler().fit(encoded_train_df[numerical_features])
encoded_train_df[numerical_features] = scaler.transform(encoded_train_df[numerical_features])
encoded_test_df[numerical_features] = scaler.transform(encoded_test_df[numerical_features])

# Separate the (integer) target from the features
y_train = encoded_train_df['Survived'].astype(int)
X_train = encoded_train_df.drop(columns=['Survived'])


In [None]:
# Split the training data into a train and validation set.
# Use the feature matrix prepared for KNN (`X_train`), not `X_train_scaled`:
# that array is left over from the statsmodels cell, was built from a
# different encoding, and does not exist on a fresh kernel unless that cell
# ran first. The frame is converted to a float array because the LIME cell
# below indexes the splits positionally (`X_val_split[0]`).
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train.to_numpy(dtype=float), y_train, test_size=0.2, random_state=42
)

# Initialize the KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)

# Fit the model on the training data
knn_model.fit(X_train_split, y_train_split)

# Predict on the validation set
y_pred = knn_model.predict(X_val_split)

# Evaluate the model
print(f'Accuracy: {accuracy_score(y_val_split, y_pred)}')
print('Confusion Matrix:')
print(confusion_matrix(y_val_split, y_pred))
print('Classification Report:')
print(classification_report(y_val_split, y_pred))

In [None]:
# KNN search space: neighbourhood size, vote weighting and distance metric
param_grid = dict(
    n_neighbors=[3, 5, 7, 9, 11, 13],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# 5-fold cross-validated grid search, parallelised over all cores
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
).fit(X_train_split, y_train_split)

# Best estimator found by the search, and its validation-set predictions
best_knn_model = grid_search.best_estimator_
y_pred_best = best_knn_model.predict(X_val_split)

# Validation metrics of the tuned model, followed by the selected hyperparameters
print(
    f'Accuracy after tuning: {accuracy_score(y_val_split, y_pred_best)}',
    'Confusion Matrix:',
    confusion_matrix(y_val_split, y_pred_best),
    'Classification Report:',
    classification_report(y_val_split, y_pred_best),
    f'Best hyperparameters: {grid_search.best_params_}',
    sep='\n',
)

Interpretability

In [None]:
import lime
import lime.lime_tabular

# LIME explainer built from the training split
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train_split,
    feature_names=X_train.columns,
    class_names=['Died', 'Survived'],
    discretize_continuous=True,
)

# First row of the validation split, kept as a 1 x n_features array
instance = X_val_split[0].reshape(1, -1)

# Explain the tuned KNN's predicted probabilities for that single row
exp = explainer.explain_instance(instance.flatten(), best_knn_model.predict_proba)
exp.show_in_notebook(show_all=False)

Améliorer le tuning, et écrire une synthèse de ce que fait chaque technique.

Comparer les perf de chaque modele, et les feature importance.
puis choisir le meilleur pour la soumission.

### Clustering ?  

Pour trouver plus d'informations sur les features qui séparent les groupes, ou sur la façon dont les groupes sont séparés.

Voir SHAP values, ROC curves, et davantage d'hyperparameter tuning.