# Titanic Survival Prediction

This notebook performs data preprocessing, training, and evaluation of multiple machine learning models on the Titanic dataset.

## Data Loading

We start by loading the Titanic dataset into Pandas DataFrames.

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Read the training and test splits of the Titanic data from the local input directory
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./input/train.csv', './input/test.csv')
)

# Report the (rows, columns) shape of each split as a quick sanity check
for label, frame in (("Train Data Shape:", train_df), ("Test Data Shape:", test_df)):
    print(label, frame.shape)

## Data Preprocessing

We impute missing values, engineer new features, and encode categorical columns to prepare the dataset for machine learning models.

In [None]:
# Preprocess both splits using statistics learned from the training split only.
#
# Imputation values (Age median per Pclass, Embarked mode, Fare median) are
# computed once from train_df and reused for test_df, so both splits are
# transformed identically. Age is deliberately NOT grouped by 'Survived':
# the target does not exist for test_df, and using it to build a training
# feature leaks the label into the model's inputs.
age_median_by_pclass = train_df.groupby('Pclass')['Age'].median()
overall_age_median = train_df['Age'].median()
embarked_mode = train_df['Embarked'].mode()[0]
fare_median = train_df['Fare'].median()

AGE_BIN_EDGES = [0, 12, 18, 35, 60, 100]
AGE_BIN_LABELS = ['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']
RARE_TITLES = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
TITLE_SYNONYMS = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
CATEGORICAL_COLUMNS = ['Sex', 'Embarked', 'AgeBin', 'Title']

# Apply the same transformations to both datasets
for df in [train_df, test_df]:
    # Missing values: class-specific Age median first, then the overall
    # training median in case a Pclass has no known ages in train_df.
    df['Age'] = df['Age'].fillna(df['Pclass'].map(age_median_by_pclass)).fillna(overall_age_median)
    df['Embarked'] = df['Embarked'].fillna(embarked_mode)
    df['Fare'] = df['Fare'].fillna(fare_median)

    # Engineered features. include_lowest=True keeps an age of exactly 0 in
    # the first bin instead of leaving it unbinned (NaN).
    df['AgeBin'] = pd.cut(df['Age'], bins=AGE_BIN_EDGES, labels=AGE_BIN_LABELS, include_lowest=True)
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Title is the word immediately preceding the first '.' in Name.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(RARE_TITLES, 'Rare')
    df['Title'] = df['Title'].replace(TITLE_SYNONYMS)

# Encode categorical columns with ONE encoder per column, fitted on the labels
# of both splits. Fitting a separate encoder on each split (as integer codes
# are assigned by sorted label order) can map the same category to different
# codes in train and test whenever the two splits do not contain exactly the
# same set of labels.
for column in CATEGORICAL_COLUMNS:
    label_encoder = LabelEncoder()
    label_encoder.fit(pd.concat([train_df[column], test_df[column]]).astype(str))
    for df in [train_df, test_df]:
        df[column] = label_encoder.transform(df[column].astype(str))

## Feature Selection

We drop the unused columns so that only the relevant features remain.

In [None]:
# Drop unused columns, ensuring PassengerId is retained in test_df for submission purposes.
# Non-inplace drops with errors='ignore' keep this cell safe to re-run.
UNUSED_COLUMNS = ['Ticket', 'Cabin', 'Name']
train_df = train_df.drop(columns=UNUSED_COLUMNS, errors='ignore')
test_df = test_df.drop(columns=UNUSED_COLUMNS, errors='ignore')

# Define training and test datasets.
# PassengerId is a row identifier, not a feature: it is excluded from X_train
# as well as X_test. Previously it stayed in X_train while X_test received an
# all-zero PassengerId column, so the model was fitted on a column whose test
# values bore no relation to the training values.
X_train = train_df.drop(columns=["Survived", "PassengerId"])
Y_train = train_df["Survived"]

# Fail loudly if a feature is absent from the test set instead of silently
# fabricating a constant column for it.
missing_in_test = X_train.columns.difference(test_df.columns)
if len(missing_in_test) > 0:
    raise ValueError(f"test_df is missing feature columns: {list(missing_in_test)}")

# Select the same features in the same order as X_train
X_test = test_df[X_train.columns]

# Verify matching columns
print("Columns in X_train:", X_train.columns)
print("Columns in X_test:", X_test.columns)

## Model Training and Evaluation

We train a Decision Tree classifier and report its accuracy on the training data (an optimistic estimate, since the model has already seen these rows).

In [None]:
# Fit a depth-limited decision tree; the fixed random_state keeps the fitted tree reproducible
decision_tree = DecisionTreeClassifier(max_depth=5, random_state=42)
decision_tree = decision_tree.fit(X_train, Y_train)

# Score the fitted tree on the same rows it was trained on
train_pred = decision_tree.predict(X_train)
train_accuracy = accuracy_score(Y_train, train_pred)
print(f"Training Accuracy: {train_accuracy:.2f}")

## Comparing Multiple Models

We train and compare multiple models to evaluate performance.

In [None]:
# Compare several classifiers by mean cross-validated accuracy.
#
# Scoring a model on the rows it was fitted on rewards memorisation (an
# unpruned decision tree or a random forest scores close to 100%), so the
# ranking would say nothing about generalisation. Each model is therefore
# scored on held-out folds instead. With an integer `cv` and a classifier,
# cross_val_score uses stratified, unshuffled folds, so the splits are the
# same on every run.
SEED = 42      # shared random_state for every estimator that accepts one
CV_FOLDS = 5   # number of cross-validation folds

models = {
    "Support Vector Machines": SVC(),
    "KNN": KNeighborsClassifier(n_neighbors=3),
    # Larger max_iter: the default may stop before converging on unscaled features
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=SEED),
    "Naive Bayes": GaussianNB(),
    "Perceptron": Perceptron(random_state=SEED),
    "Stochastic Gradient Decent": SGDClassifier(random_state=SEED),
    "Linear SVC": LinearSVC(random_state=SEED),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED)
}

# Model name -> mean held-out accuracy, as a percentage rounded to 2 decimals
accuracies = {}

for name, model in models.items():
    fold_scores = cross_val_score(model, X_train, Y_train, cv=CV_FOLDS, scoring="accuracy")
    accuracies[name] = round(fold_scores.mean() * 100, 2)
    # cross_val_score fits clones of the estimator; fit the original on the
    # full training set so every entry in `models` is left fitted, as before.
    model.fit(X_train, Y_train)

# Store model performance in a DataFrame, best score first
models_df = pd.DataFrame(list(accuracies.items()), columns=["Model", "Score"])
models_df = models_df.sort_values(by="Score", ascending=False)

# Display results
print(models_df)

## Predictions and Submission

We generate predictions on the test set with the depth-limited Decision Tree trained above and save them for submission.

In [None]:
# Score the held-out test features with the fitted decision tree
test_pred = decision_tree.predict(X_test)

# Assemble the two-column submission table (passenger id + predicted label)
# and write it to disk without the DataFrame index
submission = pd.DataFrame({"PassengerId": test_df["PassengerId"]})
submission["Survived"] = test_pred
submission.to_csv("submission.csv", index=False)

print("Submission file 'submission.csv' created successfully!")