##### Problem Definition

##### `Goal`: Predict whether a passenger survived the Titanic disaster (binary classification).

##### `Business Value`: Such modeling mimics real-world survival analysis → can inform safety planning, risk prediction.

##### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, classification_report,
    confusion_matrix, roc_curve
)

from xgboost import XGBClassifier
from xgboost import plot_importance


- Features include:

    - PassengerId, Name, Age, Sex, Pclass (ticket class), Fare, Cabin, Embarked, SibSp, Parch, Survived (target).

##### Data Collection

In [None]:
# Load Dataset: read the training CSV and preview the first rows
DATA_PATH = "dataset/train.csv"
df = pd.read_csv(DATA_PATH)
df.head()

##### Data Inspection

In [None]:
# Datatype info: print the per-column dtype and non-null count summary
df.info()

In [None]:
# Statistical Summary: (rows, columns) followed by descriptive statistics
print(f"Shape: {df.shape}")
df.describe()

In [None]:
# Checking Missing data
# One [column, null count, null percentage] row per column that has gaps
null_counts = df.isnull().sum()
n_rows = len(df)
missing_data = [
    [name, count, count / n_rows * 100]
    for name, count in null_counts.items()
    if count > 0
]

missing_df = pd.DataFrame(
    missing_data,
    columns=["Column", "Missing Values", "Percentage"],
)

missing_df

- Dataset contains 891 records with 12 columns
- Features include Passenger details (Name, Sex, Age, Fare, Pclass, etc.).
- Datatypes: mixture of numeric (int, float) and categorical (object).
- Missing values in Age, Cabin, and Embarked.

##### Data Preprocessing

In [None]:
# Handling Missing data
# Numeric columns: replace gaps with the column median
for numeric_col in ("Age", "Fare"):
    median_imputer = SimpleImputer(strategy="median")
    df[numeric_col] = median_imputer.fit_transform(df[[numeric_col]])

# Categorical column: replace gaps with the most frequent value
mode_imputer = SimpleImputer(strategy="most_frequent")
embarked_filled = mode_imputer.fit_transform(df[["Embarked"]])
df["Embarked"] = embarked_filled[:, 0]

# Checking missing data
df.isnull().sum()

In [None]:
# outliers

cat_col = "Sex"

# Numeric columns deliberately left out of the boxplots
exclude_cols = ["PassengerId", "Pclass", "Fare"]
numeric_names = df.select_dtypes(include="number").columns
num_cols = [name for name in numeric_names if name not in exclude_cols]

# Long format: one row per (Sex, numeric column, value); incomplete rows dropped
melted_df = df.melt(
    id_vars=cat_col,
    value_vars=num_cols,
    var_name="NumericColumn",
    value_name="Value",
)
melted_df = melted_df.dropna()

custom_palette = {'male': '#4E79A7', 'female': '#FF6B6B'}

plt.figure(figsize=(6, 4))
sns.boxplot(
    data=melted_df,
    x="NumericColumn",
    y="Value",
    hue=cat_col,
    palette=custom_palette,
)
plt.title(f"Boxplots of Numeric Columns grouped by {cat_col}")
plt.xticks(rotation=45)
plt.show()

- Missing values imputation
    - Age, Fare &rarr; median since they are numeric and skewed.
    - Embarked &rarr; most frequent since only 2 values missing.
- Outliers found in Age column for gender group Male

##### Feature Engineering

In [None]:
def extract_title(name: str):
    """Return the text between the first comma and the following period of a name.

    For example ``"Braund, Mr. Owen Harris"`` yields ``"Mr"``. Surrounding
    whitespace is stripped. When the pattern is not found, the literal string
    ``"None"`` is returned.
    """
    import re

    match = re.search(r",\s*([^\.]+)\.", name)
    if match is None:
        return "None"
    return match.group(1).strip()

df["Title"] = df["Name"].apply(extract_title)
title_map = {
    "Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs",
    "Lady": "Royalty", "Countess": "Royalty", "Sir": "Royalty",
    "Jonkheer": "Royalty", "Don": "Royalty", "Dona": "Royalty",
    "Dr": "Officer", "Rev": "Officer", "Col": "Officer",
    "Major": "Officer", "Capt": "Officer"
}
df["Title"] = df["Title"].replace(title_map)

# Family features (family size, alone indicator)
df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

# Age groups
def age_group(age):
    """Bucket an age as "Child" (< 18), "Adult" (< 60) or "Senior" (anything else)."""
    # Exclusive upper bounds, checked in ascending order
    for upper_bound, label in ((18, "Child"), (60, "Adult")):
        if age < upper_bound:
            return label
    return "Senior"

df["AgeGroup"] = df["Age"].apply(age_group)

# Simplify Cabin → deck letter
df["Deck"] = df["Cabin"].astype(str).str[0]
df["Deck"] = df["Deck"].replace("n", "Unknown")
df

- Title Extraction: passenger titles (Mr, Mrs, Miss, Master) from the Name column
- New Features:
    - FamilySize: captures group travel with siblings, spouse, parents and children.
    - IsAlone: binary indicator if the passenger was alone.
- Grouped Age into categories (child, adult, senior)
- Simplify Cabin by using only the first letter (deck)

##### Exploratory Data Analysis (EDA)

In [None]:
# Passenger distribution in each category

categories = ["Survived", "Pclass", "Sex", "SibSp", "Parch", "Embarked"]
palettes = ['Set2', 'husl', 'Set1', 'Set3', 'Paired', 'Pastel1']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

n_passengers = len(df)

# One count plot per category, each on its own axis with its own palette
for panel, category, palette_name in zip(axes, categories, palettes):
    ax = sns.countplot(
        data=df,
        x=category,
        hue=category,
        palette=palette_name,
        dodge=False,
        legend=False,
        ax=panel,
    )
    ax.set_title(f"Distribution of {category}")

    # Label every bar of positive height with its share of all passengers
    for bar in ax.patches:
        bar_height = bar.get_height()
        if not bar_height > 0:
            continue
        share = 100 * bar_height / n_passengers
        ax.annotate(
            f'{share:.1f}%',
            (bar.get_x() + bar.get_width() / 2., bar_height),
            ha='center', va='bottom', fontsize=8, color='black',
            xytext=(0, 3), textcoords='offset points',
        )

plt.tight_layout()
plt.show()



In [None]:
# Gender percentage

# Absolute counts and rounded percentage shares per sex, side by side
sex_counts = df["Sex"].value_counts()
sex_shares = df["Sex"].value_counts(normalize=True).mul(100).round(2)
gender_df = pd.concat(
    [sex_counts, sex_shares],
    axis=1,
    keys=["Total", "Percentage"],
).reset_index()
display(gender_df)

plt.figure(figsize=(5, 4))

ax = sns.barplot(data=gender_df, x='Sex', y='Percentage', hue='Sex', palette="Set1")

# Write "count (percent%)" just above each bar
for bar, count, share in zip(ax.patches, gender_df["Total"], gender_df["Percentage"]):
    label_x = bar.get_x() + bar.get_width() / 2.0
    label_y = bar.get_height() + 0.02
    ax.text(x=label_x, y=label_y, s=f"{count} ({share}%)", ha='center', va='bottom')

plt.title('Total and Percentage of Males vs Females')
plt.ylim(0, 110)
plt.ylabel("Percentage")
plt.xlabel("Sex")
plt.show()

- Out of 891 total passengers on board
    - The majority of passengers were men (65%).
    - Only 35% of passengers were female.

In [None]:
# Survival Rate Plot

def visualize_survival_rate(plot_df, attr, ax, order=None, xticks=None, title=None, palette=custom_palette):
    """Draw a bar chart of survivor counts per category of ``attr`` on ``ax``.

    Each bar is labelled ``"<count> (<pct>%)"``, where ``pct`` is that
    category's share of all survivors (rows with ``Survived == 1``). It is
    NOT the survival rate within the category.

    Parameters
    ----------
    plot_df : DataFrame containing a ``Survived`` column and the ``attr`` column.
    attr : name of the column whose categories are plotted on the x axis.
    ax : matplotlib Axes to draw on.
    order : optional sequence fixing the left-to-right category order.
        When omitted, categories are shown in sorted order.
    xticks : optional ``(positions, labels)`` pair used to relabel the x axis.
    title : optional axes title.
    palette : palette forwarded to ``seaborn.barplot``.
    """
    survivors = plot_df.loc[plot_df["Survived"] == 1, attr]
    counts = survivors.value_counts()
    n_survivors = counts.sum()

    # Fix the category order once and use it for BOTH the bars and the labels.
    # value_counts() orders by frequency, which need not match ``order`` (or
    # the order the bars are drawn in), so zipping the patches against the
    # frequency-ordered stats could put a label on the wrong bar.
    if order is not None:
        plot_order = list(order)
    else:
        plot_order = counts.sort_index().index.tolist()
    counts = counts.reindex(plot_order, fill_value=0)

    if n_survivors:
        shares = (counts / n_survivors * 100).round(2)
    else:
        # No survivors at all: avoid dividing by zero
        shares = counts.astype(float)

    survival_stats = pd.DataFrame({
        attr: plot_order,
        "Total": counts.to_numpy(),
        "Percentage": shares.to_numpy(),
    })

    sns.barplot(x=attr, y='Total', data=survival_stats, palette=palette, order=plot_order, ax=ax)

    for bar, total, perc in zip(ax.patches, survival_stats['Total'], survival_stats['Percentage']):
        ax.text(
            x=bar.get_x() + bar.get_width() / 2,
            y=bar.get_height() + 1,
            s=f"{total} ({perc}%)",
            ha='center',
            va='bottom',
        )

    ax.set_title(title)
    ax.set_ylabel("Number of Survivors")
    ax.set_xlabel(attr)
    if xticks is not None:
        positions, labels = xticks
        ax.set_xticks(positions)
        ax.set_xticklabels(labels)

    # Leave headroom for the labels; keep a non-degenerate axis when all bars are zero
    tallest = counts.max() if len(counts) else 0
    ax.set_ylim(0, max(tallest, 1) * 1.15)


In [None]:
# Visualization
import warnings
warnings.filterwarnings("ignore")

fig, axes = plt.subplots(3, 2, figsize=(14, 15))
axes = axes.flatten()

# Alone vs Family indicator (1 when FamilySize is exactly 1)
df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

# One entry per panel: keyword arguments forwarded to visualize_survival_rate.
# The "Sex" panel omits `palette` and therefore uses the function's default.
panel_specs = [
    dict(attr="Sex", title="Survival Rate by Gender"),
    dict(attr="Pclass", title="Survival Rate by Ticket Class", palette="Set3"),
    dict(attr="AgeGroup", order=["Child","Adult","Senior"],
         title="Survival Rate by Age Group", palette='Set2'),
    dict(attr="Embarked", title="Survival Rate by Embarkation Port", palette="Set1"),
    dict(attr="IsAlone", xticks=([0,1], ["With Family","Alone"]),
         title="Survival Rate: Alone vs With Family", palette="husl"),
]
for panel_ax, spec in zip(axes, panel_specs):
    visualize_survival_rate(df, ax=panel_ax, **spec)

# Hide the last empty subplot
axes[5].axis('off')

plt.tight_layout()

In [None]:
# Drop the raw Cabin column from the working frame
df = df.drop("Cabin", axis=1)

- Cabin &rarr; dropped due to excessive missing values.

In [None]:
# Correlation Heatmap
plt.figure(figsize=(8,6))

corr_cols = ["Survived", "Sex", "Pclass", "Fare", "Age", "SibSp", "Parch"]
corr_input = df[corr_cols].copy()

# Sex is still a text column when this cell runs (it is label-encoded only in
# the later "Encode categorical variables" cell), and DataFrame.corr() needs
# numeric input. Encode it on a copy as female=0, male=1 so the correlation
# sign matches the interpretation written below the plot. The dtype check
# keeps the cell re-runnable after Sex has already been encoded in df.
if not pd.api.types.is_numeric_dtype(corr_input["Sex"]):
    corr_input["Sex"] = corr_input["Sex"].map({"female": 0, "male": 1})

corr = corr_input.corr()

sns.heatmap(corr, annot=True, cmap="coolwarm", center=0, linewidths=0.5)
plt.title("Correlation Heatmap with Survival")
plt.show()

# Rank features by their correlation with the target
surv_corr = corr["Survived"].sort_values(ascending=False)
print("Correlation with survival:\n", surv_corr)

- Sex &rarr; strong negative correlation (since male=1, female=0 → being male reduces survival chance).
- Pclass &rarr; negative correlation (1st class is the lowest Pclass value, so a lower Pclass number → better survival).
- Fare &rarr; positive correlation (wealthier passengers had better odds).
- Other features (Age, SibSp, Parch) have weaker correlations

In [None]:
# Encode categorical variables

# Sex becomes an integer code (overwrites the column in df)
sex_encoder = LabelEncoder()
df["Sex"] = sex_encoder.fit_transform(df["Sex"])

features = [
    "Pclass", "Sex", "Age", "Fare", "Embarked",
    "FamilySize", "Title", "AgeGroup", "Deck",
]

# Modelling frame: selected features plus the target, one-hot encoded with
# the first level of each encoded column dropped
model_columns = features + ["Survived"]
df_model = pd.get_dummies(df[model_columns].copy(), drop_first=True)

display(df_model)

# Features & target
y = df_model["Survived"]
X = df_model.drop("Survived", axis=1)

# Train/test split: 80/20, stratified on the target, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


- Survival by gender: Women survived more than men.
- Survival by class: Higher class → higher chance of survival.
- Age distribution: Children had better survival odds.
- Heatmap of correlations: Sex, Pclass, Fare strongly correlated with survival.

##### Logistic Regression

In [None]:
# Scale & train logistic regression
# The scaler is fit on the training split only and then applied to both splits
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_scaled, y_train)

y_pred = logreg.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Confusion matrix
class_labels = ["Died", "Survived"]
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6,5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    cbar=False,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title("Confusion Matrix - Logistic Regression")
plt.show()

# Feature importance: coefficients sorted ascending, negative in red, others in green
coefficients = pd.Series(logreg.coef_[0], index=X_train.columns).sort_values()
bar_colors = ["red" if weight < 0 else "green" for weight in coefficients]

plt.figure(figsize=(8,6))
coefficients.plot(kind="barh", color=bar_colors)
plt.title("Feature Importance (Logistic Regression Coefficients)")
plt.xlabel("Coefficient Value")
plt.show()

##### Random Forest

In [None]:
# Fit a depth-limited random forest on the unscaled training features
rf = RandomForestClassifier(n_estimators=400, max_depth=6, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# Confusion Matrix
outcome_labels = ["Died", "Survived"]
cm = confusion_matrix(y_test, y_pred_rf)
plt.figure(figsize=(6,5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="PuRd",
    xticklabels=outcome_labels,
    yticklabels=outcome_labels,
    cbar=False,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title("Random Forest Confusion Matrix")
plt.show()


# Feature importance: the ten largest importances, in descending order
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
importances = importances.sort_values(ascending=False)
top_importances = importances.head(10)
top_importances.plot(kind="barh")
plt.title("Top Feature Importances (Random Forest)")
plt.show()

##### XGBoost

In [None]:
# Gradient-boosted trees with a fixed seed, evaluated on the held-out split
xgb_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    eval_metric="logloss",
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

y_pred_xgb = xgb.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))

# Confusion matrix
outcome_names = ["Died", "Survived"]
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
sns.heatmap(
    cm_xgb,
    annot=True,
    fmt="d",
    cmap="Oranges",
    xticklabels=outcome_names,
    yticklabels=outcome_names,
    cbar=False,
)
plt.title("Confusion Matrix - XGBoost")
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Compare models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier

models = {
    # Logistic regression gets the same scaling step as the dedicated
    # logistic-regression cell; the tree-based models are fit on the
    # features as they are.
    'Logistic Regression': Pipeline([
        ("scaler", StandardScaler(with_mean=False)),
        ("clf", LogisticRegression(max_iter=200, class_weight='balanced')),
    ]),
    'Decision Tree': DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=10, min_samples_leaf=10, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=200, max_depth=6, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42, eval_metric="logloss"),
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # All three class-specific metrics use the same default averaging
    # (positive class = survived), so precision, recall and F1 in a row
    # are directly comparable.
    results.append([
        name,
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
    ])

# Rank the models by accuracy, best first. Sorting the raw list would order
# the rows by model name, because the name is the first element of each row.
results_df = (
    pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])
    .sort_values("Accuracy", ascending=False)
    .reset_index(drop=True)
)
results_df

### Inference

- The Titanic dataset contains demographic and ticket-related passenger information.
- Missing values were mainly in `Age`, `Embarked`, and `Cabin`. We imputed `Age` with the median, 
  `Embarked` with the mode, and dropped `Cabin` due to excessive missingness.
- Engineered features `FamilySize` and `IsAlone` added insights into group vs. solo travel.
- EDA revealed:
  - Women had much higher survival rates than men.
  - Children had much higher survival rates, as they were given high priority during the evacuation of the sinking ship.
  - Higher passenger classes (1st class) had better survival chances.
  - Being alone decreased the chance of survival; thus, family size influenced survival.
- Logistic Regression achieved a reasonable accuracy (~78–82%), showing clear patterns in the data.
- Future improvements could include more sophisticated models (Random Forest, Gradient Boosting) 
  and tuning hyperparameters for higher accuracy.

- Future improvements tried:
    - Random Forest improved performance, capturing nonlinear interactions and giving better recall for survivors.
    - XGBoost provides the best balance of accuracy and generalization on Titanic data.