### One Hot Encoding for Consistent Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [12]:
# Load the dataset
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv'
df = pd.read_csv(url)

# Extract features and target
X = df.drop(columns=['survived', 'alive','pclass'])
y = df['survived']
# declare pclass and survived as categorical
y = y.astype('category')

In [13]:
# One-hot encode categorical variables
X_encoded = pd.get_dummies(X, drop_first=True)

In [14]:
# Fill missing data using KNN imputation
imputer = KNNImputer(n_neighbors=5)  # You can set n_neighbors to the desired value
X_knn_imputed = pd.DataFrame(imputer.fit_transform(X_encoded), columns=X_encoded.columns)

In [20]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_knn_imputed, y, test_size=0.3, random_state=42, stratify=y)

# Initialize and train the model
model = LogisticRegression(max_iter=10000, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['Not Survived', 'Survived'])
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(report)


Accuracy: 0.8209
Classification Report:
              precision    recall  f1-score   support

Not Survived       0.85      0.85      0.85       165
    Survived       0.77      0.77      0.77       103

    accuracy                           0.82       268
   macro avg       0.81      0.81      0.81       268
weighted avg       0.82      0.82      0.82       268



In [16]:
from catboost import CatBoostClassifier

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_knn_imputed, y, test_size=0.3, random_state=42)

# Initialize and train the model
model = CatBoostClassifier(random_state=42, eval_metric='Logloss', verbose=0)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['Not Survived', 'Survived'])
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(report)


Accuracy: 0.8097
Classification Report:
              precision    recall  f1-score   support

Not Survived       0.80      0.90      0.85       157
    Survived       0.83      0.68      0.75       111

    accuracy                           0.81       268
   macro avg       0.82      0.79      0.80       268
weighted avg       0.81      0.81      0.81       268



In [17]:
from sklearn.impute import KNNImputer, SimpleImputer
from xgboost import XGBClassifier

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_knn_imputed, y, test_size=0.3, random_state=42)

# Initialize and train the model
model = XGBClassifier(random_state=42, eval_metric='logloss')
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['Not Survived', 'Survived'])
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(report)



Accuracy: 0.7836
Classification Report:
              precision    recall  f1-score   support

Not Survived       0.80      0.83      0.82       157
    Survived       0.75      0.71      0.73       111

    accuracy                           0.78       268
   macro avg       0.78      0.77      0.78       268
weighted avg       0.78      0.78      0.78       268



In [18]:
import lightgbm as lgb

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_knn_imputed, y, test_size=0.3, random_state=42, stratify=y)

# Initialize and train the model
model = lgb.LGBMClassifier(random_state=42, eval_metric='logloss', verbosity=-1)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['Not Survived', 'Survived'])
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(report)


[LightGBM] [Info] Number of positive: 239, number of negative: 384
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000177 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 233
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Accuracy: 0.7985
Classification Report:
              precision    recall  f1-score   support

Not Survived       0.81      0.87      0.84       165
    Survived       0.77      0.68      0.72       103

    accuracy                           0.80       268
   macro avg       0.79      0.78      0.78       268
weighted avg       0.80      0.80      0.80       268



In [19]:
from sklearn.ensemble import RandomForestClassifier

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_knn_imputed, y, test_size=0.3, random_state=42, stratify=y)

# Initialize and train the model
model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['Not Survived', 'Survived'])
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(report)



Accuracy: 0.8060
Classification Report:
              precision    recall  f1-score   support

Not Survived       0.82      0.88      0.85       165
    Survived       0.79      0.68      0.73       103

    accuracy                           0.81       268
   macro avg       0.80      0.78      0.79       268
weighted avg       0.80      0.81      0.80       268



### Summary of the machine learning Pupformance

In [10]:
# create a dataframe with the accuracy of the models, logistic regression, catboost, xgboost, lightgbm and random forest
df_summary = pd.DataFrame({'Model': ['Logistic Regression', 'CatBoost', 'XGBoost', 'LightGBM', 'Random Forest'],
                     'Accuracy_wo_OneHot': [0.8209, 0.8097, 0.8097, 0.8172, 0.8172],
                     'Accuracy_wt_OneHot': [0.8209, 0.8097, 0.7836, 0.7985, 0.8060],})

df_summary

Unnamed: 0,Model,Accuracy_wo_OneHot,Accuracy_wt_OneHot
0,Logistic Regression,0.8209,0.8209
1,CatBoost,0.8097,0.8097
2,XGBoost,0.8097,0.7836
3,LightGBM,0.8172,0.7985
4,Random Forest,0.8172,0.806
