# EDA (Exploratory Data Analysis)

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

### Load Dataset

In [2]:
train_df = pd.read_csv("../datasets/train.csv")

# Check for missing values
print(train_df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


### Select Features

- Dropped `PassengerId` and `Ticket` because they likely don’t provide predictive value for the model.  
- Dropped `Cabin` due to a large number of missing values.  
- Kept features that are likely informative: `Pclass`, `Sex`, `Age`, `SibSp`, `Parch`, `Fare`, `Embarked`, and derived `Title` from the Name column.


In [3]:
# Fill missing values in "Age" with the median for simplicity
train_df["Age"] = train_df["Age"].fillna(train_df["Age"].median())

# Extract title (e.g., Mr, Mrs, Miss) from "Name" for feature engineering
train_df["Title"] = train_df["Name"].str.extract(r",\s*([^\.]+)\.")

# Group rare or variant titles to reduce category cardinality
train_df["Title"] = (
    train_df["Title"]
    .replace(["Mlle", "Ms"], "Miss")
    .replace(["Mme"], "Mrs")
    .replace(
        ["Lady", "Countess", "Capt", "Col", "Don", "Dr", 
         "Major", "Rev", "Sir", "Jonkheer", "Dona"],
        "Rare"
    )
)

# Encode "Sex" as numeric (0 = female, 1 = male)
le = LabelEncoder()
train_df["Sex"] = le.fit_transform(train_df["Sex"])

# One-hot encode both "Embarked" and "Title"
train_df = pd.get_dummies(train_df, columns=["Embarked", "Title"], drop_first=True, dtype=int)

# Select final features for modeling
X = train_df[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"] + 
             [col for col in train_df.columns if col.startswith("Embarked_")] + 
             [col for col in train_df.columns if col.startswith("Title_")]].copy()

# Convert to numpy arrays
X = X.values
y = train_df["Survived"].values


### Reveal shape of first examples

In [4]:
# Reveal shape and first few examples
print(f"shapes: \n X -> {X.shape}, y -> {y.shape}")
print(f"\nexamples: \n{X[:5]}")
print(f"\ntargets: \n{y[:5]}")

shapes: 
 X -> (891, 13), y -> (891,)

examples: 
[[ 3.      1.     22.      1.      0.      7.25    0.      1.      0.
   1.      0.      0.      0.    ]
 [ 1.      0.     38.      1.      0.     71.2833  0.      0.      0.
   0.      1.      0.      0.    ]
 [ 3.      0.     26.      0.      0.      7.925   0.      1.      1.
   0.      0.      0.      0.    ]
 [ 1.      0.     35.      1.      0.     53.1     0.      1.      0.
   0.      1.      0.      0.    ]
 [ 3.      1.     35.      0.      0.      8.05    0.      1.      0.
   1.      0.      0.      0.    ]]

targets: 
[0 1 1 1 0]
