In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Smote
from imblearn.over_sampling import SMOTE
from collections import Counter

# Machine Learning
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

import xgboost as xgb
import lightgbm as lgb

# Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# EDA with pandas-profiling
import ydata_profiling as pdp

# Monitoring
from tqdm import tqdm
import time

# Miscellaneous
import warnings
warnings.filterwarnings('ignore')

# Regex
import re

# Read the raw table from titanic/train.csv (path is relative to the working directory)
df = pd.read_csv("titanic/train.csv")

# Optional EDA step, left disabled: uncomment the two lines below to build a
# profiling report from `df` and save it as an HTML file.
# profile = pdp.ProfileReport(df, title="Pandas Profiling Report")
# profile.to_file("pandas_profiling_report.html")  # Save report as HTML

## Data Preparation

## Data Preparation

### Data Cleaning

In [None]:
# Start again from the raw CSV and discard the columns this notebook drops
# before any further cleaning.
unused_columns = ['PassengerId', 'Ticket', 'Cabin']
df = pd.read_csv("titanic/train.csv").drop(columns=unused_columns)


#### Missing Values

In [None]:
# Per-column missing-value summary: absolute count and share of rows.
missing_count = df.isnull().sum()
missing_percentage = (missing_count / len(df)) * 100

df_missing = pd.DataFrame({
    'Missing Count': missing_count,
    'Missing Percentage': missing_percentage
})

# Keep only the columns that actually have gaps, largest count first.
df_missing = df_missing[df_missing['Missing Percentage'] > 0]
df_missing = df_missing.sort_values(by='Missing Count', ascending=False)

print(df_missing)


# Impute 'Embarked' with its most frequent value.
# The filled column is assigned back instead of calling
# df['Embarked'].fillna(..., inplace=True): that form mutates a temporary
# selection, which recent pandas versions warn about and which is not
# guaranteed to write through to `df`.
most_frequent_embarked = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(most_frequent_embarked)

# Impute 'Age' with the column median, using the same assign-back pattern.
median_age = df['Age'].median()
df['Age'] = df['Age'].fillna(median_age)


#### Outliers

In [None]:
# Skewness of every numeric column, reported for inspection.
numeric_columns = df.select_dtypes(include=['number'])
skewness_values = numeric_columns.skew()

df_skew = pd.DataFrame({
    'Column': skewness_values.index,
    'Skewness': skewness_values.values
})
df_skew = df_skew.sort_values(by='Skewness', ascending=False)

print("Skewness of numeric columns:")
print(df_skew)

# Columns to cap: skewness above 1, excluding the label. 'Survived' is the
# prediction target (it is split off as `y` later in the notebook), so outlier
# handling must never rewrite it, even if its skewness crosses the threshold.
is_highly_skewed = df_skew['Skewness'] > 1
is_feature = df_skew['Column'] != 'Survived'
columns_to_cap = df_skew[is_highly_skewed & is_feature]['Column']

# Upper-cap each selected column at its own 95th percentile.
for col in columns_to_cap:
    upper_cap = df[col].quantile(0.95)
    df[col] = np.where(df[col] > upper_cap, upper_cap, df[col])

# Report the skewness of the capped columns again.
print("\nNew skewness after capping:")
print(df[columns_to_cap].skew())


#### Duplicates

In [None]:
# Flag rows that exactly repeat an earlier row, then show them.
is_duplicate = df.duplicated()
duplicate_rows = df.loc[is_duplicate]

print("Duplicate Rows:")
print(duplicate_rows)


### Feature Engineering

In [None]:
# Titles written without a trailing dot: they must end at a word boundary so
# that, for example, 'Col' does not fire inside a longer word.
plain_titles = [
    'Dr', 'Prof', 'MBA', 'MD', 'DDS', 'DVM', 'JD', 'LLD', 'Sir', 'Dame',
    'Lord', 'Lady', 'Baron', 'Baroness', 'Father', 'Sister', 'Capt', 'Col',
    'Major', 'Lt', 'Sgt', 'Admiral', 'General', 'Architect', 'Attorney',
]

# Abbreviations that end in a literal dot. A trailing \b after the dot only
# matches when a word character follows immediately, so "Rev. Thomas" (dot
# followed by a space) was never detected. The dot already terminates the
# token, so these alternatives take no trailing boundary.
dotted_titles = [
    r'Ph\.D\.', r'M\.Sc\.', r'B\.Sc\.', r'M\.A\.', r'B\.A\.', r'Rev\.', r'Eng\.',
]

# Non-capturing groups: only presence of a match is needed.
title_pattern = (
    r'\b(?:(?:' + '|'.join(plain_titles) + r')\b'
    r'|(?:' + '|'.join(dotted_titles) + r'))'
)

# 'Title' is 1 when the name contains any listed title and 0 otherwise.
# na=False makes a missing name count as "no title" instead of raising.
has_title = df['Name'].str.contains(title_pattern, regex=True, na=False)
df['Title'] = has_title.astype(int)

# The raw name is dropped once the flag has been derived from it.
df = df.drop(columns=['Name'])


In [None]:
# FamilySize = SibSp + Parch + 1 (the +1 counts the passenger themselves)
relatives_aboard = df['SibSp'].add(df['Parch'])
df['FamilySize'] = relatives_aboard.add(1)

# Show the two inputs next to the derived column to verify the result
family_preview = df[['SibSp', 'Parch', 'FamilySize']].head()
print(family_preview)


# SibSp and Parch stay in the frame alongside FamilySize; the drop is left disabled:
# df = df.drop(columns=['SibSp', 'Parch'])


### Data Transformation

##### Categorical to Numerical

In [None]:
# Treat 'Pclass' as a categorical label rather than a number, then list every
# categorical column for reference.
df['Pclass'] = df['Pclass'].astype('category')
categorical_columns = df.select_dtypes(include=['object', 'category']).columns

print("Categorical Variables:")
print(categorical_columns)

# Label encode the 'Sex' column
label_encoder = LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df['Sex'])

# One-hot encode 'Pclass' and 'Embarked', emitting 0/1 integers directly so the
# dummy columns never need a separate boolean-to-integer pass.
df = pd.get_dummies(df, columns=['Pclass', 'Embarked'], drop_first=False, dtype='int64')

# Cast any remaining boolean columns by dtype. This replaces the element-wise
# `applymap(lambda x: 1 if x is True ...)` scan: applymap is deprecated in
# recent pandas, and a dtype-based cast does not depend on how individual
# values are boxed.
bool_columns = df.select_dtypes(include='bool').columns
df = df.astype({column: 'int64' for column in bool_columns})

# Final guard: every column must be numeric (raises if one cannot be parsed).
df = df.apply(pd.to_numeric)



##### Normalization / Standardization

In [None]:
# Columns to standardize
numeric_columns = ['Age', 'Fare', 'FamilySize']

# Learn the scaling statistics from those columns, then write the
# standardized values back in place of the originals
scaler = StandardScaler()
scaler.fit(df[numeric_columns])
df[numeric_columns] = scaler.transform(df[numeric_columns])

In [None]:
# Inspect the encoded and standardized frame before the target is split off
df

#### SMOTE

In [None]:

# 'Survived' is taken as the label; every remaining column is a feature
y = df['Survived']
X = df.drop(columns=['Survived'])

print("Original class distribution:", Counter(y))

# Stratified 80/20 train/validation split with a fixed seed
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = split

# Oversample the training portion only; the validation set is left untouched
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("Class distribution after SMOTE:", Counter(y_train_smote))


##### 

## Model Selection

#### Initializing Machine Learning Models

In [None]:
# Candidate classifiers, keyed by the display name used in the results table.
# Every estimator that accepts a seed gets random_state=42 (the same seed as
# the train/validation split and SMOTE) so repeated runs of the notebook
# produce the same scores. KNeighborsClassifier takes no seed.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),  # Increased max_iter for convergence
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(probability=True, random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    # NOTE(review): `use_label_encoder` is kept as-is for compatibility; recent
    # XGBoost releases appear to ignore it - confirm against the installed version.
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42)
}

#### Cross-Validation

In [None]:
# Which cross_validate output feeds which pair of summary columns
metric_columns = [
    ('test_accuracy', 'Accuracy Mean', 'Accuracy Std'),
    ('test_roc_auc', 'ROC AUC Mean', 'ROC AUC Std'),
    ('test_f1', 'F1 Score Mean', 'F1 Score Std'),
]

# One summary row per model
results_list = []

for model_name, model in tqdm(models.items(), desc="Running models"):
    # A single 5-fold run scores all three metrics at once
    cv_results = cross_validate(model, X, y, cv=5, scoring=['accuracy', 'roc_auc', 'f1'])

    # Collapse the per-fold scores into mean and standard deviation
    row = {'Model': model_name}
    for result_key, mean_column, std_column in metric_columns:
        fold_scores = cv_results[result_key]
        row[mean_column] = fold_scores.mean()
        row[std_column] = fold_scores.std()
    results_list.append(row)

# Tabulate and print the comparison
results_df = pd.DataFrame(results_list)
print(results_df)

In [None]:
# Show the cross-validation summary as a rendered table (one row per model)
results_df

## Hyperparameter Tuning

#### SVM

In [None]:
# Parameter grid for SVM, written as one sub-grid per kernel.
# `gamma` only affects the RBF kernel; crossing it with kernel='linear' would
# fit the same linear model once per gamma value (12 redundant candidates,
# 60 redundant fits with cv=5) without changing the best score.
param_grid_svm = [
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]},
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
]

# 5-fold grid search scored on accuracy. probability=True is kept so the
# refitted best estimator still exposes predict_proba.
grid_search_svm = GridSearchCV(SVC(probability=True), param_grid_svm, cv=5, scoring='accuracy', verbose=2)

# Fit the model
grid_search_svm.fit(X, y)

# Best parameters and best score
print("Best parameters for SVM:", grid_search_svm.best_params_)
print("Best cross-validation accuracy for SVM:", grid_search_svm.best_score_)


#### Gradient Boosting

In [None]:
# Parameter grid for Gradient Boosting
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0]
}

# 5-fold grid search scored on accuracy.
# random_state is fixed because subsample < 1.0 draws a random subset of rows
# for every tree; without a seed, candidates are compared on different random
# draws and the reported best parameters can change from run to run.
grid_search_gb = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid_gb,
    cv=5,
    scoring='accuracy',
    verbose=2
)

# Fit the model
grid_search_gb.fit(X, y)

# Best parameters and best score
print("Best parameters for Gradient Boosting:", grid_search_gb.best_params_)
print("Best cross-validation accuracy for Gradient Boosting:", grid_search_gb.best_score_)


#### XGBoost

In [None]:
# `xgb` and `GridSearchCV` come from the notebook's import cell; they are not
# re-imported here.

# Parameter grid for XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# 5-fold grid search scored on accuracy. The seed is set explicitly (same
# value as the train/validation split) because row and column subsampling
# are random draws.
grid_search_xgb = GridSearchCV(
    xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid_xgb,
    cv=5,
    scoring='accuracy',
    verbose=2
)

# Fit the model
grid_search_xgb.fit(X, y)

# Best parameters and best score
print("Best parameters for XGBoost:", grid_search_xgb.best_params_)
print("Best cross-validation accuracy for XGBoost:", grid_search_xgb.best_score_)


#### LightGBM

In [None]:
# Parameter grid for LightGBM
# NOTE(review): with max_depth <= 5 a tree can hold at most 2**5 = 32 leaves,
# so num_leaves values of 40 and 50 likely behave the same as 31-32 here -
# consider smaller values if this axis is meant to matter.
param_grid_lgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'num_leaves': [31, 40, 50],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# 5-fold grid search scored on accuracy.
# subsample_freq=1 turns row bagging on at every iteration. LightGBM ignores
# `subsample` while subsample_freq is 0 (its default), which would make the
# three `subsample` values above produce identical models and triple the
# search for nothing. random_state pins the bagging/feature draws.
grid_search_lgb = GridSearchCV(
    lgb.LGBMClassifier(subsample_freq=1, random_state=42),
    param_grid_lgb,
    cv=5,
    scoring='accuracy',
    verbose=2
)

# Fit the model with GridSearchCV
grid_search_lgb.fit(X, y)

# Output the best parameters and best cross-validation accuracy
print("Best parameters for LightGBM:", grid_search_lgb.best_params_)
print("Best cross-validation accuracy for LightGBM:", grid_search_lgb.best_score_)

## Deep Learning

### Neural Network

In [None]:
# Seed TensorFlow's global random generator before any layer is created so
# that weight initialization and dropout masks are repeatable across runs.
# NOTE(review): full run-to-run determinism may also depend on the backend
# and hardware - confirm if exact reproducibility is required.
tf.random.set_seed(42)

# Feed-forward binary classifier: three Dense blocks (128 -> 64 -> 32 units),
# each followed by Batch Normalization and Dropout, then a single sigmoid unit.
# The input width is taken from the number of feature columns in X_train.
model = Sequential([
    # Input layer and first hidden layer
    Dense(128, input_dim=X_train.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.4),

    # Second hidden layer
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),

    # Third hidden layer (lighter dropout)
    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    # Output layer: one sigmoid unit
    Dense(1, activation='sigmoid'),
])

# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

In [None]:
# Stop when validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
# NOTE(review): training uses X_train / y_train; the SMOTE-resampled
# X_train_smote / y_train_smote built earlier are not used here - confirm
# which training set is intended.
history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=100,
                    batch_size=32,
                    callbacks=[early_stopping],
                    verbose=2)


# Predicted probability of the positive class on the validation set
y_prob_val = model.predict(X_val)

# Hard 0/1 labels at the 0.5 threshold (used for accuracy and F1)
y_pred_val = (y_prob_val > 0.5).astype("int32")

# Calculate evaluation metrics.
# ROC AUC is a ranking metric and must be given the continuous scores: fed
# thresholded 0/1 labels, it collapses to a single operating point and
# misreports the model's discrimination.
accuracy = accuracy_score(y_val, y_pred_val)
roc_auc = roc_auc_score(y_val, y_prob_val.ravel())
f1 = f1_score(y_val, y_pred_val)

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation ROC AUC: {roc_auc:.4f}")
print(f"Validation F1 Score: {f1:.4f}")
