In [2]:


import os, re, io, joblib, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# One seed for the whole notebook: it is passed to the train/test split,
# the CV splitter and the random forest further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Show wide frames in full instead of eliding the middle columns.
pd.options.display.max_columns = 200


In [3]:

# google.colab is only importable inside Colab. Catch ImportError specifically
# (not a bare except) and remember that interactive upload is unavailable.
try:
    from google.colab import files
except ImportError:
    files = None
else:
    print("If you want, upload a file now (e.g., Titanic-Dataset (2).csv).")

df = None

# Candidate locations, checked in order; the first existing file wins.
# The relative names let the notebook run outside Colab as well.
local_paths = [
    "/content/Titanic-Dataset (2).csv",
    "/content/titanic.csv",
    "Titanic-Dataset (2).csv",
    "titanic.csv",
]

for p in local_paths:
    if os.path.exists(p):
        df = pd.read_csv(p)
        print(f"Loaded from {p}")
        break

# Fall back to an interactive upload, but only where the Colab helper exists.
if df is None and files is not None:
    try:
        uploaded = files.upload()
        if uploaded:  # an empty dict means the upload dialog was cancelled
            fname = next(iter(uploaded))
            df = pd.read_csv(io.BytesIO(uploaded[fname]))
            print(f"Loaded from uploaded file: {fname}")
    except Exception as exc:
        # Keep the underlying reason visible instead of masking it.
        raise SystemExit(f"Could not load the uploaded file: {exc}") from exc

if df is None:
    raise SystemExit("No file provided. Please upload your Titanic CSV.")

print(df.head(3))
print(df.shape, "rows x columns")


If you want, upload a file now (e.g., Titanic-Dataset (2).csv).


Saving Titanic-Dataset (2).csv to Titanic-Dataset (2).csv
Loaded from uploaded file: Titanic-Dataset (2).csv
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
(891, 12) rows x columns


In [None]:

# Quick structural overview of the raw frame.
print("\nColumns:", df.columns.tolist())
print("\nInfo:")
df.info()  # writes to stdout itself and returns None, so it is not wrapped in print()

# Canonical column spellings, keyed by the lower-cased header.
COLUMN_MAP = {
    'survived': 'Survived',
    'pclass': 'Pclass',
    'sex': 'Sex',
    'age': 'Age',
    'sibsp': 'SibSp',
    'parch': 'Parch',
    'fare': 'Fare',
    'embarked': 'Embarked',
    'cabin': 'Cabin',
    'ticket': 'Ticket',
    'name': 'Name',
}

# The rename dict must be keyed by the header as it actually appears in the
# frame; only the COLUMN_MAP lookup uses the normalised (stripped, lower-cased)
# form. Headers without an entry are kept unchanged.
df = df.rename(columns={c: COLUMN_MAP.get(str(c).strip().lower(), c) for c in df.columns})


assert 'Survived' in df.columns, "Couldn't find a 'Survived' column. Please rename your target to 'Survived'."


# Cast the target to int unless it already has an integer dtype.
if not pd.api.types.is_integer_dtype(df['Survived']):
    df['Survived'] = df['Survived'].astype(int)

print("\nTarget distribution:")
print(df['Survived'].value_counts(dropna=False))


# Share of missing values per column, largest first.
missing = df.isna().mean().sort_values(ascending=False)
print("\nMissingness (top 15):")
print((missing * 100).round(1).head(15).astype(str) + "%")


In [None]:
# ==== Feature Engineering ====
def engineer_features(X: pd.DataFrame) -> pd.DataFrame:
    """Derive the modelling columns from raw passenger records.

    Parameters
    ----------
    X : pd.DataFrame
        Raw passenger data. Any of Name, Cabin, Ticket, Embarked, Sex, Age,
        Fare, SibSp, Parch, Pclass that is absent is treated as all-missing.
        The input frame is not modified.

    Returns
    -------
    pd.DataFrame
        Same index as ``X`` with columns Pclass, Sex, Age, SibSp, Parch, Fare,
        Embarked, Title, Deck, FamilySize, IsAlone, TicketPrefix.

    Notes
    -----
    Missing text values (NaN *and* None) are handled before any string
    conversion. Converting first would turn them into the literals ``'nan'`` /
    ``'None'``, which then look like real cabin letters, ticket prefixes or
    ports and make training rows disagree with hand-built prediction rows.
    """
    X = X.copy()
    # Ensure needed columns exist, create empties if missing (robustness)
    for col in ['Name','Cabin','Ticket','Embarked','Sex','Age','Fare','SibSp','Parch','Pclass']:
        if col not in X.columns:
            X[col] = np.nan

    def _text(col: str) -> pd.Series:
        """Column as stripped strings, with missing values mapped to ''."""
        s = X[col]
        return s.astype(object).where(s.notna(), '').astype(str).str.strip()

    # Title: the token between the comma and the first period of
    # "Surname, Title. Given names".
    title_map = {
        'Mlle':'Miss','Ms':'Miss','Mme':'Mrs','Lady':'Royalty','Countess':'Royalty','Dona':'Royalty',
        'Sir':'Royalty','Don':'Royalty','Jonkheer':'Royalty','Capt':'Officer','Col':'Officer',
        'Major':'Officer','Dr':'Officer','Rev':'Officer'
    }
    title = (
        _text('Name')
        .str.extract(r',\s*([^\.]+)\.', expand=False)
        .str.strip()
        # "the Countess" carries an article that would defeat the map lookup.
        .str.replace(r'^the\s+', '', regex=True)
        .replace(title_map)
        .fillna('Unknown')
    )

    # Deck: first character of the cabin code; 'U' when missing or not a letter.
    cabin_initial = _text('Cabin').str[:1]
    deck = cabin_initial.where(cabin_initial.str.match(r'[A-Za-z]'), 'U')

    # Family features
    family_size = X['SibSp'].fillna(0).astype(float) + X['Parch'].fillna(0).astype(float) + 1
    is_alone = (family_size == 1).astype(int)

    # Ticket prefix: what remains after dropping digits, dots, slashes and
    # whitespace; 'NONE' for purely numeric or missing tickets.
    ticket_prefix = (
        _text('Ticket')
        .str.replace(r'\d', '', regex=True)
        .str.replace(r'[\./\s]+', '', regex=True)
        .str.upper()
        .replace('', 'NONE')
    )

    # Embarked: keep real values as strings, expose missing ones as NaN so the
    # downstream imputer can fill them.
    port = _text('Embarked')
    embarked = port.where(port != '')

    # Build final modeling frame
    out = pd.DataFrame({
        'Pclass': X['Pclass'],
        'Sex': X['Sex'],
        'Age': X['Age'],
        'SibSp': X['SibSp'],
        'Parch': X['Parch'],
        'Fare': X['Fare'],
        'Embarked': embarked,
        'Title': title,
        'Deck': deck,
        'FamilySize': family_size,
        'IsAlone': is_alone,
        'TicketPrefix': ticket_prefix
    })

    return out

# Sanity check: run the feature builder on the first ten rows and show a few.
engineered_preview = engineer_features(df.iloc[:10])
engineered_preview.head()


In [None]:

# Target vector and raw (un-engineered) feature frame.
y = df['Survived'].astype(int)
X_full = df.drop(columns=['Survived'])

# 80/20 hold-out, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)

X_train.shape, X_test.shape


In [None]:


# Column groups as produced by engineer_features.
categorical_features = ['Sex','Embarked','Title','Deck','TicketPrefix']
numeric_features = ['Age','SibSp','Parch','Fare','FamilySize','IsAlone','Pclass']

# Numeric branch: median imputation followed by standardisation.
numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation, then dense one-hot encoding
# that ignores categories not seen during fit.
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocess = ColumnTransformer([
    ('num', numeric_pipe, numeric_features),
    ('cat', categorical_pipe, categorical_features),
])

# Full model: feature engineering -> preprocessing -> logistic regression.
logreg_clf = LogisticRegression(max_iter=1000)
logreg_pipeline = Pipeline([
    ('feat', FunctionTransformer(engineer_features, validate=False).set_output(transform="pandas")),
    ('preprocess', preprocess),
    ('model', logreg_clf),
])

# 5-fold stratified CV on the training split only, scored by ROC AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
cv_auc = cross_val_score(
    logreg_pipeline,
    X_train,
    y_train,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)

print("LogReg CV AUC:", cv_auc.round(3), "mean:", cv_auc.mean().round(3))


LogReg CV AUC: [0.864 0.857 0.911 0.855 0.877] mean: 0.873


In [None]:

# Fit on the full training split, then score the held-out test split.
logreg_pipeline.fit(X_train, y_train)

proba = logreg_pipeline.predict_proba(X_test)[:, 1]  # probability of class 1
pred = logreg_pipeline.predict(X_test)

print("\nClassification report (LogReg):\n")
print(classification_report(y_test, pred, digits=3))
print("ROC AUC (test):", roc_auc_score(y_test, proba).round(3))

# Confusion matrix at the pipeline's default decision rule.
ConfusionMatrixDisplay.from_estimator(logreg_pipeline, X_test, y_test)
plt.title("Confusion Matrix – Logistic Regression")
plt.show()

# ROC curve from the predicted probabilities.
RocCurveDisplay.from_predictions(y_test, proba)
plt.title("ROC – Logistic Regression")
plt.show()


In [None]:


# Base forest; the grid below tunes depth, split/leaf sizes and max_features.
rf = RandomForestClassifier(n_estimators=500, random_state=RANDOM_STATE, n_jobs=-1)

# Same feature-engineering and preprocessing steps as the logistic model.
rf_pipeline = Pipeline([
    ('feat', FunctionTransformer(engineer_features, validate=False).set_output(transform="pandas")),
    ('preprocess', preprocess),
    ('rf', rf),
])

# Keys use the '<step>__<param>' form so they reach the 'rf' pipeline step.
param_grid = {
    'rf__max_depth': [None, 6, 10],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2],
    'rf__max_features': ['sqrt', 'log2'],
}

grid = GridSearchCV(
    rf_pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=0,
)
grid.fit(X_train, y_train)

print("Best AUC (CV):", grid.best_score_.round(3))
print("Best params:", grid.best_params_)

# Best pipeline found by the search, evaluated on the held-out test split.
best_rf = grid.best_estimator_
proba_rf = best_rf.predict_proba(X_test)[:, 1]
pred_rf = best_rf.predict(X_test)

print("\nClassification report (RandomForest):\n")
print(classification_report(y_test, pred_rf, digits=3))
print("ROC AUC (test):", roc_auc_score(y_test, proba_rf).round(3))

ConfusionMatrixDisplay.from_estimator(best_rf, X_test, y_test)
plt.title("Confusion Matrix – Random Forest")
plt.show()

RocCurveDisplay.from_predictions(y_test, proba_rf)
plt.title("ROC – Random Forest")
plt.show()


In [None]:

# Pull the fitted preprocessing step and its one-hot encoder out of the
# tuned pipeline so the encoded column names can be reconstructed.
prep = best_rf['preprocess']
ohe = prep.named_transformers_['cat']['onehot']

# Order mirrors the ColumnTransformer: numeric block first, then one-hot block.
cat_ohe_names = ohe.get_feature_names_out(categorical_features).tolist()
feature_names = numeric_features + cat_ohe_names

importances = best_rf['rf'].feature_importances_
fi = pd.DataFrame(
    {'feature': feature_names, 'importance': importances}
).sort_values(by='importance', ascending=False)

fi.head(20)


In [None]:

# Persist the tuned pipeline and immediately reload it as a round-trip check.
# NOTE: the pipeline references engineer_features, so that function must be
# defined (or importable) in any session that loads this file. Only load
# pickles from sources you trust.
joblib.dump(best_rf, "titanic_survival_pipeline.pkl")
print("Saved to titanic_survival_pipeline.pkl")
loaded = joblib.load("titanic_survival_pipeline.pkl")

# Name must follow the dataset's "Surname, Title. Given names" layout:
# engineer_features takes the title from between the comma and the period,
# so a name without the comma silently becomes Title='Unknown'.
one_passenger = pd.DataFrame([{
    "Pclass": 3,
    "Sex": "male",
    "Age": 22,
    "SibSp": 1,
    "Parch": 0,
    "Fare": 7.25,
    "Embarked": "S",
    "Cabin": None,
    "Ticket": "A/5 21171",
    "Name": "Braund, Mr. Owen Harris"
}])

pred_class = loaded.predict(one_passenger)[0]
pred_prob = loaded.predict_proba(one_passenger)[0,1]
print(f"Predicted Survived: {pred_class}  |  Probability: {pred_prob:.3f}")


In [None]:
def eval_model(name, pipe, X_tr, y_tr, X_te, y_te):
    """Summarise a fitted classifier's test-set performance as one dict.

    Labels are obtained by thresholding the class-1 probability at 0.5.
    ``X_tr`` and ``y_tr`` are accepted but not used.

    Returns a dict with keys: model, auc, precision, recall, f1, accuracy
    (precision/recall/f1 refer to class 1).
    """
    positive_scores = pipe.predict_proba(X_te)[:, 1]
    hard_labels = (positive_scores >= 0.5).astype(int)

    summary = classification_report(y_te, hard_labels, digits=3, output_dict=True)
    positive_class = summary['1']

    metrics = {'model': name, 'auc': roc_auc_score(y_te, positive_scores)}
    metrics['precision'] = positive_class['precision']
    metrics['recall'] = positive_class['recall']
    metrics['f1'] = positive_class['f1-score']
    metrics['accuracy'] = summary['accuracy']
    return metrics

# Side-by-side test metrics for both fitted models, one row per model.
rf_metrics = eval_model("RandomForest", best_rf, X_train, y_train, X_test, y_test)
logreg_metrics = eval_model("LogReg", logreg_pipeline, X_train, y_train, X_test, y_test)

pd.DataFrame([logreg_metrics, rf_metrics]).round(3)


In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Logistic regression on the held-out split.
y_proba = logreg_pipeline.predict_proba(X_test)[:, 1]
y_pred = logreg_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_proba))

# Tuned random forest on the same split.
y_proba_rf = best_rf.predict_proba(X_test)[:, 1]
y_pred_rf = best_rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))
print("AUC:", roc_auc_score(y_test, y_proba_rf))


In [None]:
# Single hand-built passenger scored by both models.
# Name must follow the dataset's "Surname, Title. Given names" layout:
# engineer_features takes the title from between the comma and the period,
# so a name without the comma silently becomes Title='Unknown'.
new_passenger = pd.DataFrame([{
    "Pclass": 3,
    "Sex": "male",
    "Age": 22,
    "SibSp": 1,
    "Parch": 0,
    "Fare": 7.25,
    "Embarked": "S",
    "Cabin": None,
    "Ticket": "A/5 21171",
    "Name": "Braund, Mr. Owen Harris"
}])

print("LogReg prediction:", logreg_pipeline.predict(new_passenger)[0])
print("LogReg survival probability:", logreg_pipeline.predict_proba(new_passenger)[0,1])

print("RandomForest prediction:", best_rf.predict(new_passenger)[0])
print("RandomForest survival probability:", best_rf.predict_proba(new_passenger)[0,1])


In [None]:
# Hypothetical third-class girl travelling with family, scored by both models.
# Name uses the dataset's "Surname, Title. Given names" layout so that
# engineer_features can extract the title ('Miss'); without the comma the
# title would fall back to 'Unknown'.
child_passenger = pd.DataFrame([{
    "Pclass": 3,
    "Sex": "female",
    "Age": 8,
    "SibSp": 1,
    "Parch": 2,
    "Fare": 21.0,
    "Embarked": "S",
    "Cabin": None,
    "Ticket": "347742",
    "Name": "Girl, Miss. Little"
}])

print("LogReg:", logreg_pipeline.predict(child_passenger)[0],
      "Prob:", logreg_pipeline.predict_proba(child_passenger)[0,1])

print("RandomForest:", best_rf.predict(child_passenger)[0],
      "Prob:", best_rf.predict_proba(child_passenger)[0,1])
