In [None]:
# Cell 1: Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

In [None]:
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Download the Titanic passenger CSV and load it into the working DataFrame.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Sanity check: first rows, then the (rows, columns) shape.
print(df.head(), f"\nShape: {df.shape}", sep="\n")

In [None]:
# Target balance: number of passengers per value of 'Survived'.
sns.countplot(data=df, x='Survived').set_title(
    "Distribution of Survival (0=Died, 1=Survived)"
)
plt.show()

In [None]:
# Survival rate split by the 'Sex' column.
sns.barplot(data=df, x='Sex', y='Survived').set_title("Survival Rate by Gender")
plt.show()
# Observation: Females had a much higher chance of survival.

In [None]:
# 3. DATA PREPROCESSING
# A. Drop columns that are not used as model features.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

# B. Handle Missing Values
# Age: replace missing values with the column median.
# NOTE(review): the imputer is fitted on the full table before the train/test
# split, so test rows influence the median — consider fitting on the training
# split only.
imputer_age = SimpleImputer(strategy='median')
df['Age'] = imputer_age.fit_transform(df[['Age']])

# Embarked: replace missing values with the most frequent value.
# Assign the result back instead of calling fillna(inplace=True) on the
# df['Embarked'] selection: that chained in-place form emits a FutureWarning
# in pandas 2.x and leaves df unchanged once Copy-on-Write is enabled.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# C. Categorical Encoding (Text -> Numbers)
# A single encoder is re-fitted per column, so after the loop `le` holds the
# mapping of the last column ('Embarked') only.
# For 'Sex' the resulting codes are male/female -> 1/0.
le = LabelEncoder()
for col in ('Sex', 'Embarked'):
    df[col] = le.fit_transform(df[col])

# Preview the preprocessed table under a banner line.
print("\n--- Cleaned Data ---", df.head(), sep="\n")

In [None]:
# Target is the 'Survived' column; every remaining column is a feature.
y = df['Survived']
X = df.drop(columns='Survived')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# A. Logistic Regression (Baseline)
# fit() returns the estimator itself, so construction and training are chained.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
acc_log = accuracy_score(y_true=y_test, y_pred=log_reg.predict(X_test))

# B. Random Forest
# 100 trees with a fixed seed; fit() returns the estimator itself.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
acc_rf = accuracy_score(y_true=y_test, y_pred=rf.predict(X_test))

# C. XGBoost
# `use_label_encoder` is deprecated and no longer used by current XGBoost
# releases (passing it only produces an "unused parameter" warning), so it is
# omitted. `random_state` is pinned to match the seeded split and random forest.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
acc_xgb = accuracy_score(y_test, xgb_model.predict(X_test))

In [None]:
# One row per model: display name and its accuracy on the held-out test split.
results = pd.DataFrame(
    [
        ('Logistic Regression', acc_log),
        ('Random Forest', acc_rf),
        ('XGBoost', acc_xgb),
    ],
    columns=['Model', 'Accuracy'],
)

# Best model first.
print(
    "\n--- Model Leaderboard ---",
    results.sort_values('Accuracy', ascending=False),
    sep="\n",
)

In [None]:
# Feature importance plot for the fitted XGBoost model.
from xgboost import plot_importance

plot_importance(booster=xgb_model)
plt.show()