# Titanic - Kaggle Project

## Import libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


## 1. Data Preprocessing Helpers
### Helper functions:

#### handle_missing_values

#### encode_categorical

#### scale_numeric

#### drop_unnecessary

#### handle_outliers

#### preprocess_data

#### split_data

In [None]:


def handle_missing_values(df):
    """Fill or derive replacements for missing values without touching the input.

    Each step runs only when the relevant column is present:

    * ``Cabin`` is replaced by ``Cabin_Letter`` (first character of the cabin
      string); rows without a cabin get the placeholder letter ``'n'``.
    * ``Embarked`` gaps are filled with the most frequent value. If the column
      has no observed value at all it is left as is.
    * ``Age`` gaps are filled with the median age of the rows sharing the same
      ``Pclass`` / ``Sex`` (whichever of those two columns exist).
    * Numeric columns other than ``Survived`` / ``PassengerId`` that still
      contain gaps are completed with a 5-neighbour ``KNNImputer``. This runs
      whether or not an ``Age`` column exists, so e.g. a missing fare is
      filled as well. When nothing is missing the imputer is skipped and
      integer columns keep their dtype.

    Args:
        df: Raw passenger DataFrame.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    df_processed = df.copy()

    # Cabin -> first letter, then drop the original column.
    if 'Cabin' in df_processed.columns:
        cabin = df_processed['Cabin']
        first_letter = cabin.astype(str).str[0]
        # Tag missing cabins with 'n' explicitly instead of relying on NaN
        # being stringified to 'nan' by astype(str).
        df_processed['Cabin_Letter'] = first_letter.where(cabin.notna(), 'n')
        df_processed = df_processed.drop(columns='Cabin')

    # Embarked -> most frequent value (mode() is empty when all are missing).
    if 'Embarked' in df_processed.columns:
        embarked_mode = df_processed['Embarked'].mode()
        if not embarked_mode.empty:
            df_processed['Embarked'] = df_processed['Embarked'].fillna(
                embarked_mode.iloc[0]
            )

    # Age -> median of the passenger's (Pclass, Sex) group, computed in one
    # vectorized pass. Rows whose group has no known age stay NaN here and
    # are left to the KNN step below.
    if 'Age' in df_processed.columns:
        group_keys = [k for k in ('Pclass', 'Sex') if k in df_processed.columns]
        if group_keys:
            group_median = df_processed.groupby(group_keys)['Age'].transform('median')
            df_processed['Age'] = df_processed['Age'].fillna(group_median)

    # KNN imputation for whatever numeric gaps remain. Target and identifier
    # columns are excluded. Columns with no observed value are excluded too:
    # KNNImputer discards such columns by default, which would make the
    # assignment below fail on a shape mismatch.
    numeric_cols = [
        col
        for col in df_processed.select_dtypes(include=[np.number]).columns
        if col not in ('Survived', 'PassengerId')
        and df_processed[col].notna().any()
    ]
    if numeric_cols and df_processed[numeric_cols].isna().any().any():
        knn_imputer = KNNImputer(n_neighbors=5)
        df_processed[numeric_cols] = knn_imputer.fit_transform(
            df_processed[numeric_cols]
        )

    return df_processed


def encode_categorical(df):
    """One-hot encode the categorical feature columns that are present.

    Looks for ``Sex``, ``Embarked`` and ``Cabin_Letter``; each one found is
    expanded into indicator columns with the first category dropped. When
    none of them exists, ``df`` is returned unchanged.
    """
    present = [
        name for name in ('Sex', 'Embarked', 'Cabin_Letter') if name in df.columns
    ]
    if not present:
        return df
    return pd.get_dummies(df, columns=present, drop_first=True)


def scale_numeric(df, *, scaler=None, fit=True):
    """Standardize the numeric feature columns and return a new DataFrame.

    Every numeric column except ``Survived`` and ``PassengerId`` is passed
    through the scaler. The input frame is never modified; when there is
    nothing to scale it is returned unchanged.

    Args:
        df: DataFrame whose numeric feature columns should be scaled.
        scaler: Optional object exposing ``fit_transform`` / ``transform``
            (e.g. a ``StandardScaler``). Passing the same instance for the
            training frame and later frames lets them share one set of
            statistics. Defaults to a fresh ``StandardScaler``.
        fit: When True (default) the scaler is fitted on ``df`` before
            transforming. Pass False together with an already fitted
            ``scaler`` to reuse its statistics.

    Returns:
        A DataFrame with the numeric feature columns replaced by their
        scaled values.

    Raises:
        ValueError: If ``fit`` is False and no ``scaler`` was supplied.
    """
    numeric_cols = [
        col
        for col in df.select_dtypes(include=[np.number]).columns
        if col not in ('Survived', 'PassengerId')
    ]
    if not numeric_cols:
        return df

    if scaler is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted scaler")
        scaler = StandardScaler()

    features = df[numeric_cols]
    scaled = scaler.fit_transform(features) if fit else scaler.transform(features)

    # Write into a copy so the caller's frame keeps its original values.
    result = df.copy()
    result[numeric_cols] = scaled
    return result


def drop_unnecessary(df, cols):
    """Return ``df`` without the listed columns, ignoring names it lacks."""
    present = [name for name in cols if name in df.columns]
    return df.drop(labels=present, axis='columns')


def preprocess_data(df, is_train=True):
    """Run the preprocessing helpers in order and return the resulting frame.

    Order: missing-value handling, one-hot encoding, numeric scaling, then
    removal of the ``PassengerId``, ``Name`` and ``Ticket`` columns.

    Args:
        df: Raw passenger DataFrame.
        is_train: Not read anywhere in the body; training and test frames
            go through exactly the same steps.

    Returns:
        The preprocessed DataFrame.
    """
    steps = (handle_missing_values, encode_categorical, scale_numeric)
    for step in steps:
        df = step(df)
    return drop_unnecessary(df, ['PassengerId', 'Name', 'Ticket'])


def split_data(df, target_column='Survived', test_size=0.2, random_state=42):
    """Split a labelled frame into stratified train/validation parts.

    Args:
        df: DataFrame holding the features and the target column.
        target_column: Name of the label column; it is removed from the
            feature matrix and used for stratification.
        test_size: Share (or absolute number) of rows held out, forwarded to
            ``train_test_split``. Defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 42.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split``.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    if target_column not in df.columns:
        raise KeyError(f"target column {target_column!r} not found in DataFrame")

    X = df.drop(columns=[target_column])
    y = df[target_column]
    return train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )



## 2. Load and Preprocess Data

In [11]:
# Raw Kaggle CSVs; both paths are relative to the current working directory.
df_train = pd.read_csv('Desktop/coding/AI exercise/ML_titanic_task/train.csv')
df_test = pd.read_csv('Desktop/coding/AI exercise/ML_titanic_task/test.csv')

# Clean the labelled data, then hold out a validation split
# (both helpers are called with their defaults: is_train=True, 'Survived').
df_train_clean = preprocess_data(df_train)
X_train, X_val, y_train, y_val = split_data(df_train_clean)


## 3. Model Training

### Model training and evaluation:

#### Decision Tree & Random Forest

#### Cross Validation

#### Grid Search

In [13]:
# Full labelled data, reused by cross-validation and grid search below.
all_features = df_train_clean.drop(columns=['Survived'])
all_labels = df_train_clean['Survived']

# Decision Tree: fit on the training split, score on the validation split.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_preds = dt_model.predict(X_val)
dt_accuracy = accuracy_score(y_val, dt_preds)
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")

# Random Forest: same protocol as the decision tree.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_preds = rf_model.predict(X_val)
rf_accuracy = accuracy_score(y_val, rf_preds)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")

# 5-fold cross-validation of the random forest over all labelled rows.
cv_scores = cross_val_score(rf_model, all_features, all_labels, cv=5)
print(f"Cross-Validation Accuracies: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f}")

# Grid search over forest size and depth, also on all labelled rows.
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10]}
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(all_features, all_labels)
print(f"Best Params: {grid_search.best_params_}")
print(f"Best CV Accuracy: {grid_search.best_score_:.4f}")

# Estimator with the best parameters found by the search; used for the submission.
best_model = grid_search.best_estimator_

Decision Tree Accuracy: 0.7933
Random Forest Accuracy: 0.8045
Cross-Validation Accuracies: [0.77653631 0.78651685 0.84269663 0.74719101 0.83707865]
Mean CV Accuracy: 0.7980
Best Params: {'max_depth': 5, 'n_estimators': 200}
Best CV Accuracy: 0.8306


## 4. Prediction & Submission

In [20]:
# Run the unlabelled test frame through the same preprocessing helpers.
df_test_clean = preprocess_data(df_test, is_train=False)

# Give the test frame exactly the training feature columns, in the same
# order; columns the test frame does not have are created and filled with 0.
feature_columns = df_train_clean.columns.drop('Survived')
df_test_clean = df_test_clean.reindex(columns=feature_columns, fill_value=0)

# Predict with the estimator selected by the grid search.
predictions = best_model.predict(df_test_clean)

# Write the submission: original passenger ids paired with the predictions.
submission = pd.DataFrame({"PassengerId": df_test["PassengerId"], "Survived": predictions})
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv created successfully")


✅ submission.csv created successfully
