In [93]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split

# ===============================
# Read the cleaned CSV files
# ===============================
train = pd.read_csv(
    '/media/prince/5A4E832F4E83034D/TItanic Project/cleaned.csv'
)
test = pd.read_csv(
    '/media/prince/5A4E832F4E83034D/TItanic Project/cleaned_test.csv'
)

# Separate the target column from the remaining feature columns.
y = train["Survived"]
X = train.drop("Survived", axis=1)

# Independent copy, so later work on X_test cannot modify `test`.
X_test = test.copy(deep=True)

# ===============================
# Hold out a validation set
# ===============================
# 20% of the rows go to validation; the split is stratified on the target
# and uses a fixed seed so reruns produce the same partition.
(X_train, X_val,
 y_train, y_val) = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# ===============================
# Feature column groups
# ===============================
# Columns routed through the categorical imputer.
cat_cols = [
    'Sex',
    'Embarked',
]

# Columns routed through the numeric imputer.
num_cols = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

# ===============================
# Fill missing values
# ===============================
# Both imputers are fitted on the training split only; the fill values they
# compute there are then applied to the training, validation and test rows.
num_imputer = SimpleImputer(strategy="median")
num_imputer.fit(X_train[num_cols])

cat_imputer = SimpleImputer(strategy="most_frequent")
cat_imputer.fit(X_train[cat_cols])

X_train_num, X_val_num, X_test_num = (
    num_imputer.transform(frame[num_cols])
    for frame in (X_train, X_val, X_test)
)

X_train_cat, X_val_cat, X_test_cat = (
    cat_imputer.transform(frame[cat_cols])
    for frame in (X_train, X_val, X_test)
)

# ===============================
# One-hot encode the categoricals
# ===============================
# Dense output is requested so the result can be stacked with the numeric
# arrays later. The encoder is fitted on the imputed training categoricals
# only; handle_unknown="ignore" keeps transform from raising on a category
# that was not present during fitting.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
ohe.fit(X_train_cat)

X_train_cat, X_val_cat, X_test_cat = map(
    ohe.transform,
    (X_train_cat, X_val_cat, X_test_cat),
)

# ===============================
# Standardise numeric features (single pass)
# ===============================
# Scaling statistics come from the imputed training split only and are
# reused unchanged for the validation and test splits.
scaler = StandardScaler().fit(X_train_num)

X_train_num, X_val_num, X_test_num = [
    scaler.transform(block)
    for block in (X_train_num, X_val_num, X_test_num)
]

# ===============================
# Assemble the design matrices
# ===============================
# Scaled numeric columns come first, followed by the one-hot columns; the
# same column order is used for every split.
X_train_final = np.concatenate((X_train_num, X_train_cat), axis=1)
X_val_final = np.concatenate((X_val_num, X_val_cat), axis=1)
X_test_final = np.concatenate((X_test_num, X_test_cat), axis=1)

# ===============================
# Fit the classifier
# ===============================
# Logistic regression with the iteration cap raised to 1000, trained on the
# training split only.
model = LogisticRegression(max_iter=1000).fit(X_train_final, y_train)

# ===============================
# Score on the validation split
# ===============================
# Column 1 of predict_proba belongs to the second entry of model.classes_
# (presumably Survived == 1 -- confirm against the label encoding).
y_val_prob = model.predict_proba(X_val_final)[:, 1]
y_val_pred = model.predict(X_val_final)

# Accuracy uses the hard labels; log loss uses the predicted probabilities.
loss = log_loss(y_true=y_val, y_pred=y_val_prob)
accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)

print(f"Validation Accuracy: {accuracy * 100:.2f}%")
print(f"Validation Log Loss: {loss:.4f}")

# ===============================
# Predict on the test set
# ===============================
# Keep both the column-1 probabilities and the hard class labels; neither is
# written to disk in this cell.
test_probabilities = model.predict_proba(X_test_final)[:, 1]
test_predictions = model.predict(X_test_final)

print("Test predictions generated successfully.")


Validation Accuracy: 82.12%
Validation Log Loss: 0.4632
Test predictions generated successfully.
