In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline

from category_encoders import CatBoostEncoder
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import optuna


/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Import Dataset

In [23]:
df_train = pd.read_csv('/kaggle/input/titanic/train.csv')
df_train.head()
#df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [24]:
df_test = pd.read_csv('/kaggle/input/titanic/test.csv')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Setting Features and Label

In [25]:
X = df_train.drop(columns=['PassengerId', 'Name', 'Survived'])
y = df_train['Survived']
X.head()



Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,female,35.0,1,0,113803,53.1,C123,S
4,3,male,35.0,0,0,373450,8.05,,S


# Test Set# 

In [27]:
test_Id = df_test['PassengerId']
test_df = df_test.drop(columns=['PassengerId', 'Name'])
test_df.head()
#X_df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,male,34.5,0,0,330911,7.8292,,Q
1,3,female,47.0,1,0,363272,7.0,,S
2,2,male,62.0,0,0,240276,9.6875,,Q
3,3,male,27.0,0,0,315154,8.6625,,S
4,3,female,22.0,1,1,3101298,12.2875,,S


# Misisng Values

In [28]:
numerical_columns = X.select_dtypes(include=[np.number]).columns
categorical_columns = X.select_dtypes(include=['object']).columns

X[numerical_columns] = X[numerical_columns].fillna(-1)
X[categorical_columns] = X[categorical_columns].fillna("No Attribute")

test_df[numerical_columns] = test_df[numerical_columns].fillna(-1)
test_df[categorical_columns] = test_df[categorical_columns].fillna("No Attribute")



# Split The Training Data

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Defining Preprocessing Steps

In [17]:
numeric_transformers = Pipeline(steps=[('scaling', StandardScaler())])
categorical_transformers = Pipeline(steps=[('catboosting', CatBoostEncoder(cols=categorical_columns, random_state=0))])
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformers, numerical_columns),
                                               ('cat', categorical_transformers, categorical_columns)])


# Define Optuna

In [18]:
def objective(trial):
    xgb_params = {
        "learning_rate": trial.suggest_float("xgb_learning_rate",0.0001,0.1, log=True),
        "max_depth": trial.suggest_int("xgb_max_depth",3,12),
        "subsample": trial.suggest_float("xgb_subsample",0.5,1.0),
        "colsample_bytree": trial.suggest_float("xgb_colsample_bytree",0.5,1.0),
        "n_estimators": trial.suggest_int("xgb_n_estimators",50,300),
    }

    cat_params = {
        "learning_rate": trial.suggest_float("cat_learning_rate",0.0001,0.1, log=True),
        "depth": trial.suggest_int("cat_depth",3,10),
        "iterations": trial.suggest_int("cat_iterations",100,500),
        "l2_leaf_reg": trial.suggest_float("cat_l2_leaf_reg",0.0001,0.1, log=True),
        "subsample": trial.suggest_float("cat_subsample",0.5,1.0),
        "random_strength": trial.suggest_float("cat_random_strength",0.0001,0.1),
        
    }
    # Creating the Classifiers
    xgb = XGBClassifier(**xgb_params, use_label_encoder=False, eval_metric='logloss')
    cat = CatBoostClassifier(**cat_params, verbose=0)

    #Creating a Voting Classifier
    pipeline= Pipeline([('preprocessor', preprocessor),
                        ('voting_classifier', VotingClassifier(estimators=[('xgb',xgb),('cat',cat)], voting='hard'))])
    #Perfoming Cross_validation
    score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy').mean()
    return score



# Running Optuna Optimizer

In [19]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

#Getting the best parameters
best_params = study.best_params
print(best_params)

[I 2025-03-16 12:24:27,671] A new study created in memory with name: no-name-639b1d07-b84f-4423-a1a4-945a35a1c944
[I 2025-03-16 12:24:32,469] Trial 0 finished with value: 0.8145671230178271 and parameters: {'xgb_learning_rate': 0.09637188502123784, 'xgb_max_depth': 12, 'xgb_subsample': 0.8496111933914645, 'xgb_colsample_bytree': 0.9229776863096839, 'xgb_n_estimators': 227, 'cat_learning_rate': 0.0015784550158023034, 'cat_depth': 7, 'cat_iterations': 315, 'cat_l2_leaf_reg': 0.00015594707927699567, 'cat_subsample': 0.5743429424915091, 'cat_random_strength': 0.07888821468614414}. Best is trial 0 with value: 0.8145671230178271.
[I 2025-03-16 12:24:35,333] Trial 1 finished with value: 0.8047276666994977 and parameters: {'xgb_learning_rate': 0.007988972181996224, 'xgb_max_depth': 10, 'xgb_subsample': 0.7130754320146284, 'xgb_colsample_bytree': 0.9254627252780638, 'xgb_n_estimators': 235, 'cat_learning_rate': 0.0010893559454883712, 'cat_depth': 5, 'cat_iterations': 205, 'cat_l2_leaf_reg': 0.0

{'xgb_learning_rate': 0.04715531384031086, 'xgb_max_depth': 11, 'xgb_subsample': 0.6842012541367772, 'xgb_colsample_bytree': 0.7063265941151023, 'xgb_n_estimators': 264, 'cat_learning_rate': 0.002058033302219121, 'cat_depth': 10, 'cat_iterations': 316, 'cat_l2_leaf_reg': 0.05716294723972189, 'cat_subsample': 0.7103322975984013, 'cat_random_strength': 0.05574267025961141}


# Train With Best Parameters> 

In [22]:
best_xgb_params = {
        "learning_rate": study.best_params['xgb_learning_rate'],
        "max_depth": study.best_params['xgb_max_depth'],
        "subsample": study.best_params['xgb_subsample'],
        "colsample_bytree": study.best_params['xgb_colsample_bytree'],
        "n_estimators": study.best_params['xgb_n_estimators'],
}
best_cat_params = {
        "learning_rate": study.best_params['cat_learning_rate'],
        "depth": study.best_params['cat_depth'],
        "iterations": study.best_params['cat_iterations'],
        "l2_leaf_reg": study.best_params['cat_l2_leaf_reg'],
        "subsample": study.best_params['cat_subsample'],
        "random_strength": study.best_params['cat_random_strength'],
}    

#Final Model
xgb_2 = XGBClassifier(**best_xgb_params, use_label_encoder=False, eval_metric='logloss')
cat_2 = CatBoostClassifier(**best_cat_params, verbose=0)

pipeline_2 = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier",VotingClassifier(estimators=[('xgb', xgb_2),
                                               ('cat', cat_2)],
                                   voting='hard'))
])

pipeline_2.fit(X_train,y_train)
test_score = pipeline_2.score(X_test,y_test)
print(f"The Model Accuracy is {test_score}")

The Model Accuracy is 0.8268156424581006


# Predicitions For The Test set

In [30]:
y_pred = pipeline_2.predict(test_df)

result = pd.DataFrame()
result['PassengerId'] = test_Id
result['Survived'] = y_pred
result.to_csv('submission.csv', index=False)