# Titanic Survival Prediction - Inferential Analysis

Comparing multiple classification methods:
- Logistic Regression
- K-Nearest Neighbors (k=4, k=5)
- Random Forest (2 configurations)
- Support Vector Machine (Linear, RBF)

Each method tested on:
- All features
- Selected features: Sex, Age, Pclass, Fare

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

## Data Loading

In [2]:
# Load the pre-imputed train and test tables (paths are relative to the
# notebook's working directory).
train_df = pd.read_csv('data/train_imputed_method2.csv')
test_df = pd.read_csv('data/test_imputed_method2.csv')

# Make sure the folder that later cells write prediction CSVs to exists.
os.makedirs('output', exist_ok=True)

print(f"Train data: {train_df.shape}")
print(f"Test data: {test_df.shape}")
# Last expression of the cell: shows the first training rows.
train_df.head()

Train data: (891, 14)
Test data: (418, 13)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,HasCabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,0


## Feature Engineering

In [3]:
def prepare_features(train, test):
    """Add encoded and family-based feature columns to both frames.

    Both DataFrames are modified in place: each gains the columns
    ``Sex_encoded``, ``Embarked_encoded``, ``Title_encoded``, ``FamilySize``
    and ``IsAlone``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that contain at least the columns ``Sex``, ``Embarked``,
        ``Title``, ``SibSp`` and ``Parch``.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(all_features, selected_features)``: the column names of the full
        feature set and of the reduced one. Besides the columns created
        here, the lists name ``Pclass``, ``Age``, ``Fare`` and ``HasCabin``,
        which this function does not create or check for.
    """
    sex_map = {'male': 0, 'female': 1}
    embarked_map = {'C': 0, 'Q': 1, 'S': 2}
    # Several titles deliberately share a code (e.g. Mlle/Ms with Miss,
    # Mme with Mrs, Capt with Col).
    title_map = {
        'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Dr': 4, 'Rev': 5,
        'Col': 6, 'Major': 7, 'Mlle': 1, 'Countess': 8, 'Ms': 1,
        'Lady': 8, 'Jonkheer': 9, 'Don': 9, 'Dona': 8, 'Mme': 2,
        'Capt': 6, 'Sir': 9
    }

    for df in (train, test):
        # Sex values missing from sex_map are left as NaN (no fallback).
        df['Sex_encoded'] = df['Sex'].map(sex_map)
        # Missing or unmapped ports fall back to 2, the code used for 'S'.
        df['Embarked_encoded'] = df['Embarked'].map(embarked_map).fillna(2)
        # Titles missing from title_map get the catch-all code 10.
        df['Title_encoded'] = df['Title'].map(title_map).fillna(10)
        # FamilySize is SibSp + Parch (the passenger is not counted), so
        # IsAlone flags the rows where it equals 0.
        df['FamilySize'] = df['SibSp'] + df['Parch']
        df['IsAlone'] = (df['FamilySize'] == 0).astype(int)

    all_features = ['Pclass', 'Sex_encoded', 'Age', 'SibSp', 'Parch', 'Fare',
                    'Embarked_encoded', 'HasCabin', 'Title_encoded',
                    'FamilySize', 'IsAlone']

    selected_features = ['Sex_encoded', 'Age', 'Pclass', 'Fare']

    return all_features, selected_features

# Adds the engineered columns to train_df and test_df in place and returns
# the two lists of feature column names.
all_features, selected_features = prepare_features(train_df, test_df)

# Training labels, and the test-set passenger ids written into each
# prediction file by the model cells.
y = train_df['Survived']
test_ids = test_df['PassengerId']

print(f"All features ({len(all_features)}): {all_features}")
print(f"Selected features ({len(selected_features)}): {selected_features}")

0
1
All features (11): ['Pclass', 'Sex_encoded', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_encoded', 'HasCabin', 'Title_encoded', 'FamilySize', 'IsAlone']
Selected features (4): ['Sex_encoded', 'Age', 'Pclass', 'Fare']


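## Train-Validation Split

Each model section below makes its own stratified 80/20 train/validation split (`random_state=42`); the cell here only creates the shared `results` list that those sections append to.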

In [4]:
# Shared list of per-model result dicts; every model cell appends to it and
# the summary cell turns it into a DataFrame.
results = []

## 1. Logistic Regression

In [5]:
# Logistic regression on each feature set: score on a held-out validation
# split, then refit on all training rows and predict the test set.
for feature_set_name, features in [('All', all_features), ('Selected', selected_features)]:
    X_train = train_df[features]
    X_test = test_df[features]

    # Stratified 80/20 split with a fixed seed.
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y, test_size=0.2, random_state=42, stratify=y
    )

    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train_split, y_train_split)

    val_pred = model.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)

    # Refit on the complete training data before predicting the test set.
    model.fit(X_train, y)
    test_pred = model.predict(X_test)

    filename = f'output/LogisticRegression_{feature_set_name}.csv'
    pd.DataFrame({'PassengerId': test_ids, 'Survived': test_pred}).to_csv(filename, index=False)

    # Drop any row left by an earlier run of this cell so that re-running it
    # does not duplicate entries in the summary.
    results[:] = [row for row in results if row['File'] != filename]
    results.append({
        'Method': 'Logistic Regression',
        'Features': feature_set_name,
        'Val_Accuracy': val_acc,
        'Predicted_Survived': test_pred.sum(),
        'File': filename
    })

    print(f"Logistic Regression - {feature_set_name} Features")
    print(f"Validation Accuracy: {val_acc:.4f}")
    # Denominator comes from the predictions instead of a hard-coded 418.
    print(f"Predicted survivors: {test_pred.sum()}/{len(test_pred)}\n")

Logistic Regression - All Features
Validation Accuracy: 0.7989
Predicted survivors: 159/418

Logistic Regression - Selected Features
Validation Accuracy: 0.7821
Predicted survivors: 157/418



## 2. K-Nearest Neighbors

In [6]:
# K-nearest neighbours for k in {4, 5} on each feature set: score on a
# held-out validation split, then refit on all training rows and predict
# the test set.
for k in [4, 5]:
    for feature_set_name, features in [('All', all_features), ('Selected', selected_features)]:
        X_train = train_df[features]
        X_test = test_df[features]

        # Stratified 80/20 split with a fixed seed.
        X_train_split, X_val, y_train_split, y_val = train_test_split(
            X_train, y, test_size=0.2, random_state=42, stratify=y
        )

        # Scaling statistics come from the training part only; the
        # validation rows are transformed with them.
        scaler = StandardScaler()
        X_train_split_scaled = scaler.fit_transform(X_train_split)
        X_val_scaled = scaler.transform(X_val)

        model = KNeighborsClassifier(n_neighbors=k)
        model.fit(X_train_split_scaled, y_train_split)

        val_pred = model.predict(X_val_scaled)
        val_acc = accuracy_score(y_val, val_pred)

        # Final model: a fresh scaler fitted on all training rows, instead
        # of silently re-fitting the one used for validation.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model.fit(X_train_scaled, y)
        test_pred = model.predict(X_test_scaled)

        filename = f'output/KNN_k{k}_{feature_set_name}.csv'
        pd.DataFrame({'PassengerId': test_ids, 'Survived': test_pred}).to_csv(filename, index=False)

        # Drop any row left by an earlier run of this cell so that
        # re-running it does not duplicate entries in the summary.
        results[:] = [row for row in results if row['File'] != filename]
        results.append({
            'Method': f'KNN (k={k})',
            'Features': feature_set_name,
            'Val_Accuracy': val_acc,
            'Predicted_Survived': test_pred.sum(),
            'File': filename
        })

        print(f"KNN (k={k}) - {feature_set_name} Features")
        print(f"Validation Accuracy: {val_acc:.4f}")
        # Denominator comes from the predictions instead of a hard-coded 418.
        print(f"Predicted survivors: {test_pred.sum()}/{len(test_pred)}\n")

KNN (k=4) - All Features
Validation Accuracy: 0.8101
Predicted survivors: 141/418

KNN (k=4) - Selected Features
Validation Accuracy: 0.7989
Predicted survivors: 143/418

KNN (k=5) - All Features
Validation Accuracy: 0.8101
Predicted survivors: 170/418

KNN (k=5) - Selected Features
Validation Accuracy: 0.8156
Predicted survivors: 164/418



## 3. Random Forest

In [7]:
# Two random-forest hyper-parameter sets; 'Balanced' uses more and deeper
# trees with smaller split/leaf minimums than 'Conservative'.
rf_configs = [
    {'name': 'Conservative', 'params': {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 20,
                                        'min_samples_leaf': 5, 'random_state': 42}},
    {'name': 'Balanced', 'params': {'n_estimators': 150, 'max_depth': 7, 'min_samples_split': 10,
                                    'min_samples_leaf': 3, 'random_state': 42}}
]

# For every configuration and feature set: score on a held-out validation
# split, then refit on all training rows and predict the test set.
for config in rf_configs:
    for feature_set_name, features in [('All', all_features), ('Selected', selected_features)]:
        X_train = train_df[features]
        X_test = test_df[features]

        # Stratified 80/20 split with a fixed seed.
        X_train_split, X_val, y_train_split, y_val = train_test_split(
            X_train, y, test_size=0.2, random_state=42, stratify=y
        )

        model = RandomForestClassifier(**config['params'])
        model.fit(X_train_split, y_train_split)

        val_pred = model.predict(X_val)
        val_acc = accuracy_score(y_val, val_pred)

        # Refit on the complete training data before predicting the test set.
        model.fit(X_train, y)
        test_pred = model.predict(X_test)

        filename = f'output/RandomForest_{config["name"]}_{feature_set_name}.csv'
        pd.DataFrame({'PassengerId': test_ids, 'Survived': test_pred}).to_csv(filename, index=False)

        # Drop any row left by an earlier run of this cell so that
        # re-running it does not duplicate entries in the summary.
        results[:] = [row for row in results if row['File'] != filename]
        results.append({
            'Method': f'Random Forest ({config["name"]})',
            'Features': feature_set_name,
            'Val_Accuracy': val_acc,
            'Predicted_Survived': test_pred.sum(),
            'File': filename
        })

        print(f"Random Forest ({config['name']}) - {feature_set_name} Features")
        print(f"Validation Accuracy: {val_acc:.4f}")
        # Denominator comes from the predictions instead of a hard-coded 418.
        print(f"Predicted survivors: {test_pred.sum()}/{len(test_pred)}\n")

Random Forest (Conservative) - All Features
Validation Accuracy: 0.8156
Predicted survivors: 161/418

Random Forest (Conservative) - Selected Features
Validation Accuracy: 0.7765
Predicted survivors: 141/418

Random Forest (Balanced) - All Features
Validation Accuracy: 0.8101
Predicted survivors: 158/418

Random Forest (Balanced) - Selected Features
Validation Accuracy: 0.8045
Predicted survivors: 143/418



## 4. Support Vector Machine

In [8]:
# Two SVM variants that differ only in kernel (the RBF one also sets gamma).
svm_configs = [
    {'name': 'Linear', 'params': {'kernel': 'linear', 'C': 1.0, 'random_state': 42}},
    {'name': 'RBF', 'params': {'kernel': 'rbf', 'C': 1.0, 'gamma': 'scale', 'random_state': 42}}
]

# For every configuration and feature set: score on a held-out validation
# split, then refit on all training rows and predict the test set.
for config in svm_configs:
    for feature_set_name, features in [('All', all_features), ('Selected', selected_features)]:
        X_train = train_df[features]
        X_test = test_df[features]

        # Stratified 80/20 split with a fixed seed.
        X_train_split, X_val, y_train_split, y_val = train_test_split(
            X_train, y, test_size=0.2, random_state=42, stratify=y
        )

        # Scaling statistics come from the training part only; the
        # validation rows are transformed with them.
        scaler = StandardScaler()
        X_train_split_scaled = scaler.fit_transform(X_train_split)
        X_val_scaled = scaler.transform(X_val)

        model = SVC(**config['params'])
        model.fit(X_train_split_scaled, y_train_split)

        val_pred = model.predict(X_val_scaled)
        val_acc = accuracy_score(y_val, val_pred)

        # Final model: a fresh scaler fitted on all training rows, instead
        # of silently re-fitting the one used for validation.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model.fit(X_train_scaled, y)
        test_pred = model.predict(X_test_scaled)

        filename = f'output/SVM_{config["name"]}_{feature_set_name}.csv'
        pd.DataFrame({'PassengerId': test_ids, 'Survived': test_pred}).to_csv(filename, index=False)

        # Drop any row left by an earlier run of this cell so that
        # re-running it does not duplicate entries in the summary.
        results[:] = [row for row in results if row['File'] != filename]
        results.append({
            'Method': f'SVM ({config["name"]})',
            'Features': feature_set_name,
            'Val_Accuracy': val_acc,
            'Predicted_Survived': test_pred.sum(),
            'File': filename
        })

        print(f"SVM ({config['name']}) - {feature_set_name} Features")
        print(f"Validation Accuracy: {val_acc:.4f}")
        # Denominator comes from the predictions instead of a hard-coded 418.
        print(f"Predicted survivors: {test_pred.sum()}/{len(test_pred)}\n")

SVM (Linear) - All Features
Validation Accuracy: 0.7709
Predicted survivors: 149/418

SVM (Linear) - Selected Features
Validation Accuracy: 0.7765
Predicted survivors: 152/418

SVM (RBF) - All Features
Validation Accuracy: 0.8212
Predicted survivors: 161/418

SVM (RBF) - Selected Features
Validation Accuracy: 0.7989
Predicted survivors: 136/418



## Results Summary

In [9]:
# Build the summary table: one row per fitted model, best validation
# accuracy first, with the accuracy rendered as a 4-decimal string.
results_df = (
    pd.DataFrame(results)
    .sort_values('Val_Accuracy', ascending=False)
    .assign(Val_Accuracy=lambda frame: frame['Val_Accuracy'].map(lambda acc: f"{acc:.4f}"))
)

banner = "=" * 80
print("\n" + banner)
print("INFERENTIAL ANALYSIS SUMMARY")
print(banner)
print(results_df.to_string(index=False))
print("\nAll predictions saved to output/ folder")

# Persist the same table next to the prediction files, then display it.
results_df.to_csv('output/summary_results.csv', index=False)
results_df


INFERENTIAL ANALYSIS SUMMARY
                      Method Features Val_Accuracy  Predicted_Survived                                          File
                   SVM (RBF)      All       0.8212                 161                        output/SVM_RBF_All.csv
                   KNN (k=5) Selected       0.8156                 164                    output/KNN_k5_Selected.csv
Random Forest (Conservative)      All       0.8156                 161      output/RandomForest_Conservative_All.csv
                   KNN (k=4)      All       0.8101                 141                         output/KNN_k4_All.csv
                   KNN (k=5)      All       0.8101                 170                         output/KNN_k5_All.csv
    Random Forest (Balanced)      All       0.8101                 158          output/RandomForest_Balanced_All.csv
    Random Forest (Balanced) Selected       0.8045                 143     output/RandomForest_Balanced_Selected.csv
         Logistic Regression      

Unnamed: 0,Method,Features,Val_Accuracy,Predicted_Survived,File
12,SVM (RBF),All,0.8212,161,output/SVM_RBF_All.csv
5,KNN (k=5),Selected,0.8156,164,output/KNN_k5_Selected.csv
6,Random Forest (Conservative),All,0.8156,161,output/RandomForest_Conservative_All.csv
2,KNN (k=4),All,0.8101,141,output/KNN_k4_All.csv
4,KNN (k=5),All,0.8101,170,output/KNN_k5_All.csv
8,Random Forest (Balanced),All,0.8101,158,output/RandomForest_Balanced_All.csv
9,Random Forest (Balanced),Selected,0.8045,143,output/RandomForest_Balanced_Selected.csv
0,Logistic Regression,All,0.7989,159,output/LogisticRegression_All.csv
3,KNN (k=4),Selected,0.7989,143,output/KNN_k4_Selected.csv
13,SVM (RBF),Selected,0.7989,136,output/SVM_RBF_Selected.csv
