In [None]:
# Kaggle Titanic Challenge

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import os

In [None]:
# Walk the data folder and report, for every directory visited, its sub-folders and files

for dirname, dirs, files in os.walk('../data'):
    # One print call with a newline separator yields the same three output lines
    print(
        f"Current Directory: {dirname}",
        f"Subdirectories: {dirs}",
        f"Files: {files}",
        sep="\n",
    )

In [None]:
# Same walk, but print one full path per file instead of a per-directory summary

for dirname, dirs, files in os.walk('../data'):
    full_paths = [os.path.join(dirname, filename) for filename in files]
    for full_path in full_paths:
        print(full_path)

In [None]:
# Read the training CSV into a DataFrame and preview its first five rows

train_file = '../data/titanic_train.csv'
train_df = pd.read_csv(filepath_or_buffer=train_file)
train_df.head(n=5)

In [None]:
# Read the test CSV into a DataFrame and preview its first five rows

test_file = '../data/titanic_test.csv'
test_df = pd.read_csv(filepath_or_buffer=test_file)
test_df.head()  # head() defaults to five rows

In [None]:
# Compare the (rows, columns) shape of the two frames

print(
    f"Training DF: {train_df.shape}",
    f"    Test DF: {test_df.shape}",
    sep="\n",
)

In [None]:
# Count the null cells in each column of the training frame

null_mask = train_df.isnull()
total = null_mask.sum()
# Show the container type first, then the per-column counts
print(type(total), total, sep="\n")

In [None]:
# Function to identify missing data (will call it later...)

def missing_data(data):
    """Summarise missing values per column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``data`` and three rows: ``Total`` (number of
        null cells), ``Percent`` (null cells as a percentage of the row count)
        and ``Types`` (the column dtype rendered as a string).
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    # The mask itself contains no nulls, so count() on it is the number of rows
    null_share = null_counts / null_mask.count() * 100
    summary = pd.concat([null_counts, null_share], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    # Flip so the statistics become rows and the inspected columns stay as columns
    return summary.T

In [None]:
# Non-null counts per column: first as the raw result, then wrapped in a DataFrame

total = train_df.count()
print(total, type(total), sep="\n")

tt = pd.DataFrame(data=total)
print(tt, type(tt), sep="\n")

In [None]:
# Give the single column of `tt` the label "Total" (replacing the default numeric label).
# NOTE: this relabels the existing `tt` object in place rather than creating a new frame,
# so any earlier reference to `tt` sees the new label as well.

tt.columns = ["Total"]
print(tt)

In [None]:
# Show the missing-value summary for the training frame (rendered as the cell's output)

missing_data(data=train_df)

In [None]:
# Share of female passengers in the training set who survived

# Pick rows and column in one .loc call instead of chaining [] onto the row filter
women = train_df.loc[train_df["Sex"] == 'female', 'Survived']
# Mean of the Survived column = surviving fraction (assumes Survived is coded 0/1 - confirm).
# Unlike sum()/len(), an empty selection gives NaN instead of raising ZeroDivisionError.
rate_women = women.mean()
# rate_women stays a fraction; format it as a percentage so the value matches the "%" label
print(f"% of women who survived: {rate_women:.2%}")

In [None]:
# Share of male passengers in the training set who survived

# Pick rows and column in one .loc call instead of chaining [] onto the row filter
men = train_df.loc[train_df["Sex"] == 'male', 'Survived']
# Mean of the Survived column = surviving fraction (assumes Survived is coded 0/1 - confirm).
# Unlike sum()/len(), an empty selection gives NaN instead of raising ZeroDivisionError.
rate_men = men.mean()
# rate_men stays a fraction; format it as a percentage so the value matches the "%" label
print(f"% of men who survived: {rate_men:.2%}")

In [None]:
# Baseline model: a random forest trained on four raw passenger attributes

y = train_df["Survived"]                        # target variable (did the passenger survive)
features = ["Pclass", "Sex", "SibSp", "Parch"]  # attributes from the input dataset

# One-hot encode the non-numeric attributes so the forest receives numeric input only
X = pd.get_dummies(train_df[features])
X_test = pd.get_dummies(test_df[features])
# get_dummies runs on each frame independently, so a category present in only one of
# them would leave the two matrices with different columns. Align the test matrix to
# the training layout: dummies missing from test become 0, extras are dropped.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

# Header must match the source column name "PassengerId" (it was misspelled "PassengerID")
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': predictions})
# Create the output folder if it is missing so to_csv does not fail on a fresh checkout
os.makedirs('../output', exist_ok=True)
output.to_csv('../output/submission.csv', index=False)
print("Your submission was successfully saved!")

In [None]:

# === Improved Random Forest with simple feature engineering & CV ===
# This cell writes its own submission file, but note that it rebinds X, y, X_test and model from the baseline cell above.
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# --- Preconditions: this cell relies on frames created by earlier cells ---
_notebook_names = globals()
assert "train_df" in _notebook_names, "Expected 'train_df' to be defined earlier in the notebook."
assert "test_df" in _notebook_names, "Expected 'test_df' to be defined earlier in the notebook."

# --- Minimal feature engineering ---
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with three derived columns added.

    Added columns:
      * ``FamilySize`` - ``SibSp + Parch + 1``, with missing counts treated as 0.
      * ``IsAlone``    - 1 where ``FamilySize`` equals 1, otherwise 0.
      * ``NameLength`` - number of characters in ``Name``; a missing name counts as 0.

    Args:
        df: Frame containing ``SibSp``, ``Parch`` and ``Name`` columns.

    Returns:
        A new DataFrame; ``df`` itself is not modified.

    Raises:
        KeyError: If any of the three required columns is absent.
    """
    out = df.copy()
    # Family size & IsAlone
    out["FamilySize"] = out["SibSp"].fillna(0) + out["Parch"].fillna(0) + 1
    out["IsAlone"] = (out["FamilySize"] == 1).astype(int)
    # Name length. Blank out missing names first: astype(str) alone would turn NaN
    # into the literal "nan" and report a length of 3 for a row with no name.
    out["NameLength"] = out["Name"].fillna("").astype(str).str.len()
    return out

# Derive the extra columns for both frames with the same function
train_fe = add_features(train_df)
test_fe = add_features(test_df)

# --- Column roles ---
target_col = "Survived"
numeric_features = ["Age", "Fare", "Pclass", "SibSp", "Parch", "FamilySize", "IsAlone", "NameLength"]
categorical_features = ["Sex", "Embarked"]

# Model inputs: numeric columns first, then the categorical ones
model_columns = numeric_features + categorical_features
X, X_test = train_fe[model_columns], test_fe[model_columns]
y = train_fe[target_col]

# --- Preprocessing: impute per column type, and encode the categorical columns ---
# Numeric columns: fill gaps with the column median
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode
# (handle_unknown="ignore" so a category not seen during fit does not raise)
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

column_steps = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocess = ColumnTransformer(transformers=column_steps)

# --- Random forest settings ---
forest_settings = dict(
    n_estimators=400,
    max_depth=None,        # no cap on tree depth
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
rf = RandomForestClassifier(**forest_settings)

# Preprocessing and classifier chained so both are fitted together
model = Pipeline([("prep", preprocess), ("rf", rf)])

# --- 5-fold cross-validated accuracy as a quick quality check ---
cv_scores = cross_val_score(model, X, y, cv=5, scoring="accuracy", n_jobs=-1)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f"CV accuracy (mean ± std over 5 folds): {cv_mean:.4f} ± {cv_std:.4f}")

# --- Train on every training row, then label the test rows ---
model.fit(X, y)
raw_pred = model.predict(X_test)
test_pred = raw_pred.astype(int)

# --- Assemble the two-column submission table and write it to disk ---
assert "PassengerId" in test_df.columns, "Expected 'PassengerId' in test_df for submission."

submission = pd.DataFrame({"PassengerId": test_df["PassengerId"], "Survived": test_pred})

out_csv = "submission_rf_tuned.csv"
# index=False keeps the row index out of the file
submission.to_csv(path_or_buf=out_csv, index=False)
print(f"Submission file written: {out_csv}")
