In [2]:
import pandas as pd
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from pycaret.classification import *

In [6]:
titanic_data = pd.read_csv('titanic/train.csv')
test_data = pd.read_csv('titanic/test.csv')

# Get basic info and check for missing values
titanic_data.info()
titanic_data.describe()
titanic_data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## **Part1. Feature engineering**

### Step 1.1: Potential Features to Engineer

Based on domain knowledge and literature, here are feature engineering ideas for the Titanic dataset:

1.  **Family Size**: Combine `SibSp` (siblings/spouses aboard) and `Parch` (parents/children aboard) into a single feature. Larger families might have different survival rates compared to individuals traveling alone.
    
    **New Feature**: `FamilySize = SibSp + Parch + 1`
    
2.  **IsAlone**: Create a binary feature to indicate whether a passenger is traveling alone.
    
    **New Feature**: `IsAlone = 1 if FamilySize == 1 else 0`
    
3.  **Title**: Extract titles (Mr., Mrs., Miss, etc.) from the `Name` column, as these often reflect social status and gender.
    
    **New Feature**: Extract title from the `Name` column and group rare titles.
    
4.  **Age Group**: Bucket `Age` into categories such as child, young adult, middle-aged, and senior.
    
    **New Feature**: Categorical `AgeGroup`.
    
5.  **Fare Binning**: Bin the `Fare` column into categories to reduce variance and capture socioeconomic status.
    
    **New Feature**: Categorical `FareBand`.
    
6.  **Cabin Presence**: Create a binary feature to indicate whether a passenger's cabin information is known.
    
    **New Feature**: `HasCabin = 1 if Cabin is not null else 0`
    
7.  **Deck**: Extract the deck information from the `Cabin` column.
    
    **New Feature**: Extract first letter of the `Cabin` (e.g., 'C123' -> 'C').
    
8.  **Embarked**: One-hot encode `Embarked`, which represents the port of embarkation.

### Step 1.2 Apply Feature Engineering

Here’s the implementation of these feature engineering steps:

In [7]:
# Fill missing values in 'Age' and 'Embarked'
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].median())
titanic_data['Embarked'] = titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0])

# Create 'FamilySize' feature
titanic_data['FamilySize'] = titanic_data['SibSp'] + titanic_data['Parch'] + 1

# Create 'IsAlone' feature
titanic_data['IsAlone'] = 1  # Default to alone
titanic_data['IsAlone'].loc[titanic_data['FamilySize'] > 1] = 0

# Extract 'Title' feature from 'Name'
titanic_data['Title'] = titanic_data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
# Group rare titles
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
titanic_data['Title'] = titanic_data['Title'].replace(rare_titles, 'Rare')
titanic_data['Title'] = titanic_data['Title'].replace(['Mlle', 'Ms'], 'Miss')
titanic_data['Title'] = titanic_data['Title'].replace('Mme', 'Mrs')

# Create 'AgeGroup' feature
titanic_data['AgeGroup'] = pd.cut(titanic_data['Age'], bins=[0, 12, 18, 35, 60, 80], labels=['Child', 'Teenager', 'Young Adult', 'Middle Aged', 'Senior'])

# Bin 'Fare' into categories
titanic_data['FareBand'] = pd.qcut(titanic_data['Fare'], 4, labels=['Low', 'Medium', 'High', 'Very High'])

# Create 'HasCabin' feature
titanic_data['HasCabin'] = titanic_data['Cabin'].notnull().astype(int)

# Extract 'Deck' feature from 'Cabin'
titanic_data['Deck'] = titanic_data['Cabin'].str[0]
titanic_data['Deck'] = titanic_data['Deck'].fillna('Unknown')

# Drop irrelevant columns
titanic_data = titanic_data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# Convert categorical variables to strings
categorical_cols = ['Sex', 'Embarked', 'Title', 'AgeGroup', 'FareBand', 'Deck']
for col in categorical_cols:
    titanic_data[col] = titanic_data[col].astype(str)

# Display first few rows of the engineered dataset
titanic_data.head()


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,AgeGroup,FareBand,HasCabin,Deck
0,0,3,male,22.0,1,0,7.25,S,2,0,Mr,Young Adult,Low,0,Unknown
1,1,1,female,38.0,1,0,71.2833,C,2,0,Mrs,Middle Aged,Very High,1,C
2,1,3,female,26.0,0,0,7.925,S,1,1,Miss,Young Adult,Medium,0,Unknown
3,1,1,female,35.0,1,0,53.1,S,2,0,Mrs,Young Adult,Very High,1,C
4,0,3,male,35.0,0,0,8.05,S,1,1,Mr,Young Adult,Medium,0,Unknown


## **Part2. Model selection**

## 2.1 Top five algorithm

In [8]:
from pycaret.classification import setup, compare_models, pull

# PyCaret setup for classification
clf_setup = setup(data=titanic_data, target='Survived', session_id=42, verbose=True)

# Compare models and select the top 5
top_models = compare_models(n_select=5)

# Display the top 5 models
top_models_results = pull()  # Pull the comparison DataFrame
print("Top 5 Models:")
print(top_models_results.head(5))

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 15)"
4,Transformed data shape,"(891, 36)"
5,Transformed train set shape,"(623, 36)"
6,Transformed test set shape,"(268, 36)"
7,Numeric features,8
8,Categorical features,6
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8331,0.8669,0.7326,0.8169,0.77,0.64,0.6445,0.06
lightgbm,Light Gradient Boosting Machine,0.8267,0.8659,0.7328,0.8027,0.7628,0.6272,0.632,0.244
rf,Random Forest Classifier,0.8219,0.8596,0.7495,0.783,0.7626,0.6207,0.6242,0.083
ridge,Ridge Classifier,0.8218,0.8633,0.7534,0.7766,0.7634,0.6208,0.6224,0.044
lda,Linear Discriminant Analysis,0.8171,0.863,0.7451,0.7724,0.7569,0.6106,0.6125,0.034
lr,Logistic Regression,0.8154,0.8631,0.7451,0.7707,0.7549,0.6074,0.6103,0.737
ada,Ada Boost Classifier,0.8105,0.8341,0.7453,0.7622,0.75,0.5981,0.602,0.051
et,Extra Trees Classifier,0.8091,0.8342,0.7326,0.7638,0.745,0.593,0.5957,0.076
dt,Decision Tree Classifier,0.7722,0.7513,0.6947,0.7098,0.6991,0.5164,0.5191,0.041
nb,Naive Bayes,0.7576,0.8227,0.7196,0.6797,0.6942,0.4946,0.4999,0.049


Top 5 Models:
                                    Model  Accuracy     AUC  Recall   Prec.  \
gbc          Gradient Boosting Classifier    0.8331  0.8669  0.7326  0.8169   
lightgbm  Light Gradient Boosting Machine    0.8267  0.8659  0.7328  0.8027   
rf               Random Forest Classifier    0.8219  0.8596  0.7495  0.7830   
ridge                    Ridge Classifier    0.8218  0.8633  0.7534  0.7766   
lda          Linear Discriminant Analysis    0.8171  0.8630  0.7451  0.7724   

              F1   Kappa     MCC  TT (Sec)  
gbc       0.7700  0.6400  0.6445     0.060  
lightgbm  0.7628  0.6272  0.6320     0.244  
rf        0.7626  0.6207  0.6242     0.083  
ridge     0.7634  0.6208  0.6224     0.044  
lda       0.7569  0.6106  0.6125     0.034  


## 2.2 ensemble model

In [9]:
from pycaret.classification import ensemble_model, blend_models, pull

# Ensemble using bagging
best_bagging_element = top_models[0]
best_bagging_model = ensemble_model(estimator=best_bagging_element, method='Bagging', verbose=False)
best_bagging_results = pull()
best_bagging_accuracy = best_bagging_results.iloc[0]['Accuracy']

for i in range(5):
    bagging_element = top_models[i]
    bagging_model = ensemble_model(bagging_element, method='Bagging', verbose=False)
    bagging_results = pull()
    if bagging_results.iloc[0]['Accuracy'] > best_bagging_accuracy:
        best_bagging_element = bagging_element
        best_bagging_model = bagging_model
        best_bagging_results = pull()
        best_bagging_accuracy = best_bagging_results.iloc[0]['Accuracy']
print("Bagging Model Performance:")
print(best_bagging_results.head())
print(best_bagging_element)

# Ensemble using boosting
best_boosting_element = top_models[0]
best_boosting_model = ensemble_model(estimator=best_boosting_element, method='Boosting', verbose=False)
best_boosting_results = pull()
best_boosting_accuracy = best_boosting_results.iloc[0]['Accuracy']
for i in range(5):
    try:
        boosting_element = top_models[i]
        boosting_model = ensemble_model(estimator=boosting_element, method='Boosting', verbose=False)
        boosting_results = pull() 
        if boosting_results.iloc[0]['Accuracy'] > best_boosting_accuracy:
            best_boosting_element = boosting_element
            best_boosting_model = boosting_model
            best_boosting_results = pull()
            best_boosting_accuracy = best_boosting_results.iloc[0]['Accuracy']
    except:
        print(Exception)
        continue
print("\nBoosting Model Performance:")
print(best_boosting_results.head())
print(best_boosting_element)
        

# Ensemble using blending
blending_model = blend_models(top_models, verbose=False)
print("\nBlending Model Performance:")
blending_results = pull()
print(blending_results.head())


# Compare the three ensemble models
print("Comparing Ensemble Models:")
print(pd.DataFrame({
    'Bagging': best_bagging_results.iloc[0],
    'Boosting': best_boosting_results.iloc[0],
    'Blending': blending_results.iloc[0]
}))




Bagging Model Performance:
      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.8730  0.9092  0.8333  0.8333  0.8333  0.7308  0.7308
1       0.8571  0.9145  0.7917  0.8261  0.8085  0.6947  0.6951
2       0.8571  0.8734  0.7083  0.8947  0.7907  0.6845  0.6952
3       0.8710  0.9164  0.8261  0.8261  0.8261  0.7235  0.7235
4       0.8387  0.8586  0.6250  0.9375  0.7500  0.6379  0.6664
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='log_loss', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_samples_leaf=1,
                           min_samples_split=2, min_weight_fraction_leaf=0.0,
                           n_estimators=100, n_iter_no_change=None,
                           random_state=42, subsample=1.0, tol=0.0001,
                   

## **Part3. Hyper parameter optimization**

In [10]:
from pycaret.classification import tune_model

# Perform hyperparameter optimization on the bagged model
optimized_bagged_model = tune_model(best_bagging_model, optimize='Accuracy', n_iter=50)  # n_iter can be adjusted for more trials

# Display the optimized model
print("\nOptimized Bagged Model:")
print(optimized_bagged_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8571,0.9167,0.7917,0.8261,0.8085,0.6947,0.6951
1,0.8413,0.9049,0.75,0.8182,0.7826,0.658,0.6595
2,0.873,0.8884,0.75,0.9,0.8182,0.7219,0.7289
3,0.871,0.9231,0.8261,0.8261,0.8261,0.7235,0.7235
4,0.8548,0.8662,0.7083,0.8947,0.7907,0.6819,0.6928
5,0.9194,0.9079,0.8333,0.9524,0.8889,0.826,0.8306
6,0.8387,0.8794,0.75,0.8182,0.7826,0.6548,0.6564
7,0.7903,0.7796,0.625,0.7895,0.6977,0.5405,0.5491
8,0.871,0.9024,0.7917,0.8636,0.8261,0.7238,0.7256
9,0.7742,0.8454,0.6667,0.7273,0.6957,0.5167,0.5179


Fitting 10 folds for each of 50 candidates, totalling 500 fits

Optimized Bagged Model:
BaggingClassifier(bootstrap=True, bootstrap_features=False,
                  estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                       criterion='friedman_mse',
                                                       init=None,
                                                       learning_rate=0.1,
                                                       loss='log_loss',
                                                       max_depth=3,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                      

### Model Performance After Hyperparameter Optimization

#### Cross-Validation Metrics

The bagged ensemble model was evaluated using 10-fold cross-validation. The key metrics are:

| Metric | Mean | Std Deviation |
| --- | --- | --- |
| **Accuracy** | 0.8491 | 0.0397 |
| **AUC** | 0.8814 | 0.0408 |
| **Recall** | 0.7493 | 0.0636 |
| **Precision** | 0.8416 | 0.0603 |
| **F1 Score** | 0.7917 | 0.0557 |
| **Kappa** | 0.6742 | 0.0860 |
| **MCC** | 0.6779 | 0.0859 |

**Key Observations**:

-   The **Accuracy** of the optimized bagged model is **84.91%**, indicating good generalization across the folds.
-   The **AUC (88.14%)** highlights the model's excellent ability to distinguish between the classes.
-   The **Recall (74.93%)** shows that the model is fairly sensitive in identifying positive cases (e.g., survivors).
-   A **Precision (84.16%)** indicates that most of the predicted positive cases are correct.
-   **F1 Score (79.17%)** balances precision and recall effectively.

___

### Optimized Bagged Model Configuration

The optimized bagged model uses a `BaggingClassifier` with a base estimator of `GradientBoostingClassifier`. Key hyperparameters tuned during optimization include:

1.  **Base Estimator (`GradientBoostingClassifier`)**:
    
    -   Learning Rate: 0.1
    -   Max Depth: 3
    -   Number of Estimators: 100
    -   Loss Function: `log_loss`
    -   Subsample: 1.0
2.  **Bagging Hyperparameters**:
    
    -   `max_samples`: 0.8 (80% of the data used per bagging iteration).
    -   `max_features`: 0.7 (70% of features used per bagging iteration).
    -   Number of Bagging Estimators: 10.

These configurations ensure a balance between model complexity and generalization.

___

### Conclusion and Recommendations

1.  **Performance**: The optimized bagged ensemble model achieves high accuracy and AUC, indicating robust performance for the Titanic survival prediction task.
2.  **Hyperparameter Tuning Impact**: Fine-tuning improved the model's performance compared to default settings, especially by optimizing the bagging and base estimator parameters.
3.  **Future Enhancements**:
    -   Additional feature engineering may further improve model performance.
    -   Evaluate the optimized model on an unseen test set to confirm its generalization ability.

In [18]:
test_data = pd.read_csv('titanic/test.csv')

# Apply the same preprocessing and feature engineering steps to the test dataset
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch'] + 1
test_data['IsAlone'] = (test_data['FamilySize'] == 1).astype(int)
test_data['Title'] = test_data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
test_data['Title'] = test_data['Title'].replace(['Mlle', 'Ms'], 'Miss').replace('Mme', 'Mrs')
test_data['Title'] = test_data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don',
                                                 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
test_data['AgeGroup'] = pd.cut(test_data['Age'], bins=[0, 12, 18, 35, 60, 80], labels=['Child', 'Teenager', 'Young Adult', 'Middle Aged', 'Senior'])
test_data['FareBand'] = pd.qcut(test_data['Fare'], 4, labels=['Low', 'Medium', 'High', 'Very High'])
test_data['HasCabin'] = test_data['Cabin'].notnull().astype(int)
test_data['Deck'] = test_data['Cabin'].str[0].fillna('Unknown')

# Fill missing values
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].median())

# Drop unnecessary columns
test_data = test_data.drop(['Name', 'Ticket', 'Cabin'], axis=1)

# Ensure the test data matches the format of the training data
categorical_cols = ['Sex', 'Embarked', 'Title', 'AgeGroup', 'FareBand', 'Deck']
for col in categorical_cols:
    test_data[col] = test_data[col].astype(str)

# Make predictions using the optimized ensemble model
test_predictions = predict_model(optimized_bagged_model, data=test_data)

# Extract the 'Survived' column for submission
submission = test_data[['PassengerId']].copy()
submission['Survived'] = test_predictions['prediction_label']  # Assuming PyCaret outputs predictions in the 'Label' column

# Save the predictions to a CSV file
submission.to_csv('titanic_submission.csv', index=False)
print("Submission file created: titanic_submission.csv")

Submission file created: titanic_submission.csv
