In [None]:
"""
Titanic Data Analysis and JSON Export
Author: Elsa Bakiu
Description: Analyze Titanic passenger data, engineer features, and export to JSON
"""

import pandas as pd
import numpy as np
import json
from pathlib import Path

# Set up paths
DATA_DIR = Path("data")
CSV_FILE = DATA_DIR / "titanic.csv"
JSON_FILE = DATA_DIR / "titanic_data.json"

# Create data directory if it doesn't exist
DATA_DIR.mkdir(exist_ok=True)

print("Project setup complete!")
print(f"Data directory: {DATA_DIR}")
print(f"CSV file location: {CSV_FILE}")


Project setup complete!
Data directory: data
CSV file location: data/titanic.csv


In [5]:
df = pd.read_csv(CSV_FILE)
print(f"Dataset loaded successfully! Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst few rows:")
print(df.head())


Dataset loaded successfully! Shape: (891, 12)

Columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

First few rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 175

In [7]:
df = pd.read_csv(CSV_FILE)
# Select numeric columns only
numeric_df = df.select_dtypes(include="number")
# Calculate statistics for each numeric column
stats = numeric_df.agg(["mean", "median", "std"])
# Print the transposed statistics for better readability
stats_t = stats.T
print(stats_t)


                   mean    median         std
PassengerId  446.000000  446.0000  257.353842
Survived       0.383838    0.0000    0.486592
Pclass         2.308642    3.0000    0.836071
Age           29.699118   28.0000   14.526497
SibSp          0.523008    0.0000    1.102743
Parch          0.381594    0.0000    0.806057
Fare          32.204208   14.4542   49.693429


In [14]:
df = pd.read_csv(CSV_FILE)
# Count missing values
print("\n" + "="*50)
print("MISSING VALUES ANALYSIS")
print("="*50)
 
missing_data = {}
 
for col in df.columns:
    missing_count = df[col].isna().sum()
    missing_percent = (missing_count / len(df)) * 100

    missing_data[col] = {
        "missing_count": missing_count,
        "missing_percent": round(missing_percent, 2)
    }

# Print missing data summary
for col, stats in missing_data.items():
    print(f"{col}: {stats['missing_count']} missing values ({stats['missing_percent']}%)")


MISSING VALUES ANALYSIS
PassengerId: 0 missing values (0.0%)
Survived: 0 missing values (0.0%)
Pclass: 0 missing values (0.0%)
Name: 0 missing values (0.0%)
Sex: 0 missing values (0.0%)
Age: 177 missing values (19.87%)
SibSp: 0 missing values (0.0%)
Parch: 0 missing values (0.0%)
Ticket: 0 missing values (0.0%)
Fare: 0 missing values (0.0%)
Cabin: 687 missing values (77.1%)
Embarked: 2 missing values (0.22%)


In [16]:
# Create a copy of the dataframe for feature engineering
df_features = df.copy()
 
# Feature 1: Family Size
df_features['FamilySize'] = df_features['FamilySize'] = df_features['SibSp'] + df_features['Parch'] + 1
print(df_features[['SibSp', 'Parch', 'FamilySize']].head(10))
 
# Feature 2: Is Alone
df_features['IsAlone'] = df_features['IsAlone'] = (df_features['FamilySize'] == 1).astype(int)
print(df_features[['FamilySize', 'IsAlone']].head(10))
 
# Feature 3: Age Groups
def categorize_age(age):
    """Categorize age into groups"""
    if pd.isna(age):
        return 'Unknown'
    elif age < 18:
        return 'name your category'
    elif age < 30:
        return 'name your category'
    elif age < 50:
        return 'name your category'
    else:
        return 'name your category'
 
df_features['AgeGroup'] = df_features['AgeGroup'] = df_features['Age'].apply(categorize_age)
print(df_features[['Age', 'AgeGroup']].head(10))
 
# Analyze feature differences between survivors and non-survivors
print("\n" + "="*50)
print("FEATURE ANALYSIS: SURVIVED vs NOT SURVIVED")
print("="*50)
 
# Here is an example to get you started:
print("\nFamily Size by Survival:")
family_survival = df_features.groupby('Survived')['FamilySize'].agg(['mean', 'median', 'std'])
print(family_survival)
 
# Statistical test: Do these features help differentiate?
print("\n" + "="*50)
print("FEATURE DIFFERENTIATION ANALYSIS")
print("="*50)
 
survived = df_features[df_features['Survived'] == 1]
not_survived = df_features[df_features['Survived'] == 0]
 
print("\nFamily Size:")
print(f"  Survived mean: {survived['FamilySize'].mean():.2f}")
print(f"  Not Survived mean: {not_survived['FamilySize'].mean():.2f}")
print(f"  Difference: {abs(survived['FamilySize'].mean() - not_survived['FamilySize'].mean()):.2f}")

   SibSp  Parch  FamilySize
0      1      0           2
1      1      0           2
2      0      0           1
3      1      0           2
4      0      0           1
5      0      0           1
6      0      0           1
7      3      1           5
8      0      2           3
9      1      0           2
   FamilySize  IsAlone
0           2        0
1           2        0
2           1        1
3           2        0
4           1        1
5           1        1
6           1        1
7           5        0
8           3        0
9           2        0
    Age            AgeGroup
0  22.0  name your category
1  38.0  name your category
2  26.0  name your category
3  35.0  name your category
4  35.0  name your category
5   NaN             Unknown
6  54.0  name your category
7   2.0  name your category
8  27.0  name your category
9  14.0  name your category

FEATURE ANALYSIS: SURVIVED vs NOT SURVIVED

Family Size by Survival:
              mean  median       std
Survived                

In [25]:
import json
from datetime import datetime

import pandas as pd


# -----------------------------
# 1) Load CSV into df
# -----------------------------
df = pd.read_csv(CSV_FILE)


# -----------------------------
# 2) Create df_engineered
#    - FamilySize = SibSp + Parch + 1
#    - IsAlone = 1 if FamilySize == 1 else 0
#    - Title extracted from Name
# -----------------------------
df_engineered = df.copy()

# FamilySize
df_engineered["FamilySize"] = (
    df_engineered["SibSp"].fillna(0).astype(int)
    + df_engineered["Parch"].fillna(0).astype(int)
    + 1
)

# IsAlone
df_engineered["IsAlone"] = (df_engineered["FamilySize"] == 1).astype(int)


def extract_title(name):
    if not isinstance(name, str) or "," not in name or "." not in name:
        return None
    try:
        return name.split(",")[1].split(".")[0].strip()
    except Exception:
        return None


df_engineered["Title"] = df_engineered["Name"].apply(extract_title)


# -----------------------------
# 3) Passenger + TitanicDataset classes
# -----------------------------
class Passenger:
    """
    Represents a passenger with all their information.
    """

    def __init__(
        self,
        passenger_id,
        name,
        age,
        sex,
        survived,
        pclass,
        fare,
        embarked=None,
        sibsp=None,
        parch=None,
        family_size=None,
        is_alone=None,
        title=None,
    ):
        # Core identifiers
        self.passenger_id = int(passenger_id) if pd.notna(passenger_id) else None
        self.name = str(name) if pd.notna(name) else None

        # Survival and class
        self.survived = int(survived) if pd.notna(survived) else None
        self.pclass = int(pclass) if pd.notna(pclass) else None

        # Demographics
        self.age = float(age) if pd.notna(age) else None
        self.sex = str(sex) if pd.notna(sex) else None

        # Ticket / travel
        self.fare = float(fare) if pd.notna(fare) else None
        self.embarked = str(embarked) if pd.notna(embarked) else None

        # Family components
        self.sibsp = int(sibsp) if pd.notna(sibsp) else None
        self.parch = int(parch) if pd.notna(parch) else None

        # Engineered features
        self.family_size = int(family_size) if pd.notna(family_size) else None
        self.is_alone = int(is_alone) if pd.notna(is_alone) else None
        self.title = str(title) if pd.notna(title) else None

    def to_dict(self):
        """Convert passenger to dictionary for JSON serialization."""
        return {
            "passenger_id": self.passenger_id,
            "name": self.name,
            "age": self.age,
            "sex": self.sex,
            "survived": self.survived,
            "pclass": self.pclass,
            "fare": self.fare,
            "embarked": self.embarked,
            "sibsp": self.sibsp,
            "parch": self.parch,
            "family_size": self.family_size,
            "is_alone": self.is_alone,
            "title": self.title,
        }


class TitanicDataset:
    """
    Represents the entire Titanic dataset with methods for analysis and JSON export.
    """

    def __init__(self, dataframe: pd.DataFrame):
        if dataframe is None or dataframe.empty:
            raise ValueError("TitanicDataset requires a non-empty DataFrame.")

        self.dataframe = dataframe.copy()
        self.passengers = []
        self._create_passengers()

    def _create_passengers(self):
        """Create Passenger objects from dataframe."""
        self.passengers = []
        for idx, row in self.dataframe.iterrows():
            passenger = Passenger(
                passenger_id=row.get("PassengerId", idx),
                name=row.get("Name", None),
                age=row.get("Age", None),
                sex=row.get("Sex", None),
                survived=row.get("Survived", None),
                pclass=row.get("Pclass", None),
                fare=row.get("Fare", None),
                embarked=row.get("Embarked", None),
                sibsp=row.get("SibSp", None),
                parch=row.get("Parch", None),
                family_size=row.get("FamilySize", None),
                is_alone=row.get("IsAlone", None),
                title=row.get("Title", None),
            )
            self.passengers.append(passenger)

    # ---------- Missing values ----------

    def get_missing_values_summary(self):
        """
        Returns:
        { "ColumnName": {"missing_count": int, "missing_percent": float}, ... }
        """
        df = self.dataframe
        missing_count = df.isna().sum()
        missing_percent = (df.isna().mean() * 100).round(2)

        summary = {}
        for col in df.columns:
            summary[col] = {
                "missing_count": int(missing_count[col]),
                "missing_percent": float(missing_percent[col]),
            }
        return summary

    def get_columns_with_most_missing(self, top_n=10):
        """
        Returns list sorted by missing_percent descending:
        [(col, missing_count, missing_percent), ...]
        """
        summary = self.get_missing_values_summary()
        ranked = sorted(
            [(c, v["missing_count"], v["missing_percent"]) for c, v in summary.items()],
            key=lambda x: x[2],
            reverse=True,
        )
        return ranked[:top_n]

    # ---------- Numeric stats ----------

    def get_numeric_stats(self, round_to=3):
        """
        mean, median, std for each numeric column.
        Returns:
        { "colA": {"mean": .., "median": .., "std": ..}, ... }
        """
        num_df = self.dataframe.select_dtypes(include="number")
        if num_df.empty:
            return {}

        stats = num_df.agg(["mean", "median", "std"]).T.round(round_to)

        out = {}
        for col in stats.index:
            out[col] = {
                "mean": None if pd.isna(stats.loc[col, "mean"]) else float(stats.loc[col, "mean"]),
                "median": None if pd.isna(stats.loc[col, "median"]) else float(stats.loc[col, "median"]),
                "std": None if pd.isna(stats.loc[col, "std"]) else float(stats.loc[col, "std"]),
            }
        return out

    # ---------- Summary stats ----------

    def get_summary_stats(self):
        total_passengers = len(self.passengers)

        survived_count = sum(1 for p in self.passengers if p.survived == 1)
        did_not_survive = sum(1 for p in self.passengers if p.survived == 0)

        ages = [p.age for p in self.passengers if p.age is not None]
        fares = [p.fare for p in self.passengers if p.fare is not None]
        family_sizes = [p.family_size for p in self.passengers if p.family_size is not None]

        average_age = round(sum(ages) / len(ages), 3) if ages else None
        average_fare = round(sum(fares) / len(fares), 3) if fares else None
        average_family_size = round(sum(family_sizes) / len(family_sizes), 3) if family_sizes else None

        survival_rate = None
        if "Survived" in self.dataframe.columns and not self.dataframe["Survived"].isna().all():
            survival_rate = float(self.dataframe["Survived"].mean())

        return {
            "total_passengers": total_passengers,
            "survived": survived_count,
            "did_not_survive": did_not_survive,
            "survival_rate": survival_rate,
            "average_age": average_age,
            "average_fare": average_fare,
            "average_family_size": average_family_size,
        }

    # ---------- JSON export ----------

    def to_json(self, filename="titanic_data.json", indent=2):
        data = {
            "metadata": {
                "dataset_name": "Titanic Passenger Dataset",
                "export_date": datetime.now().isoformat(),
                "rows": int(self.dataframe.shape[0]),
                "columns": int(self.dataframe.shape[1]),
                "column_names": list(self.dataframe.columns),
                "summary_stats": self.get_summary_stats(),
                "missing_values": self.get_missing_values_summary(),
                "top_missing_columns": [
                    {"column": c, "missing_count": mc, "missing_percent": mp}
                    for (c, mc, mp) in self.get_columns_with_most_missing(top_n=10)
                ],
                "numeric_column_stats": self.get_numeric_stats(round_to=3),
                "engineered_features": {
                    "FamilySize": "SibSp + Parch + 1",
                    "IsAlone": "1 if FamilySize == 1 else 0",
                    "Title": "Extracted from Name (text between ',' and '.')",
                },
            },
            "passengers": [p.to_dict() for p in self.passengers],
        }

        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=indent, ensure_ascii=False)

        print(f"Data exported to {filename}")
        return data


# -----------------------------
# 4) Create dataset, print stats, optional JSON export
# -----------------------------
dataset = TitanicDataset(df_engineered)

print("\n" + "=" * 50)
print("BASIC DATASET INFO")
print("=" * 50)
print("Rows:", dataset.dataframe.shape[0])
print("Columns:", dataset.dataframe.shape[1])

print("\n" + "=" * 50)
print("SUMMARY STATS")
print("=" * 50)
summary = dataset.get_summary_stats()
for k, v in summary.items():
    print(f"{k}: {v}")

print("\n" + "=" * 50)
print("TOP MISSING COLUMNS")
print("=" * 50)
for col, mc, mp in dataset.get_columns_with_most_missing(top_n=10):
    if mc > 0:
        print(f"{col}: {mc} missing ({mp}%)")

#Export to JSON
dataset.to_json(JSON_FILE)



BASIC DATASET INFO
Rows: 891
Columns: 15

SUMMARY STATS
total_passengers: 891
survived: 342
did_not_survive: 549
survival_rate: 0.3838383838383838
average_age: 29.699
average_fare: 32.204
average_family_size: 1.905

TOP MISSING COLUMNS
Cabin: 687 missing (77.1%)
Age: 177 missing (19.87%)
Embarked: 2 missing (0.22%)
Data exported to data/titanic_data.json


{'metadata': {'dataset_name': 'Titanic Passenger Dataset',
  'export_date': '2026-02-03T18:09:39.010376',
  'rows': 891,
  'columns': 15,
  'column_names': ['PassengerId',
   'Survived',
   'Pclass',
   'Name',
   'Sex',
   'Age',
   'SibSp',
   'Parch',
   'Ticket',
   'Fare',
   'Cabin',
   'Embarked',
   'FamilySize',
   'IsAlone',
   'Title'],
  'summary_stats': {'total_passengers': 891,
   'survived': 342,
   'did_not_survive': 549,
   'survival_rate': 0.3838383838383838,
   'average_age': 29.699,
   'average_fare': 32.204,
   'average_family_size': 1.905},
  'missing_values': {'PassengerId': {'missing_count': 0,
    'missing_percent': 0.0},
   'Survived': {'missing_count': 0, 'missing_percent': 0.0},
   'Pclass': {'missing_count': 0, 'missing_percent': 0.0},
   'Name': {'missing_count': 0, 'missing_percent': 0.0},
   'Sex': {'missing_count': 0, 'missing_percent': 0.0},
   'Age': {'missing_count': 177, 'missing_percent': 19.87},
   'SibSp': {'missing_count': 0, 'missing_percent': 

In [26]:
# Additional validation: Load and inspect JSON
with open(JSON_FILE, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Print summary of JSON data and verify content
print("\n" + "=" * 50)
print("JSON DATA INSPECTION")
print("=" * 50)
print("Metadata:")
for k, v in json_data.get("metadata", {}).items():
    print(f"  {k}: {v}")    


JSON DATA INSPECTION
Metadata:
  dataset_name: Titanic Passenger Dataset
  export_date: 2026-02-03T18:09:39.010376
  rows: 891
  columns: 15
  column_names: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'FamilySize', 'IsAlone', 'Title']
  summary_stats: {'total_passengers': 891, 'survived': 342, 'did_not_survive': 549, 'survival_rate': 0.3838383838383838, 'average_age': 29.699, 'average_fare': 32.204, 'average_family_size': 1.905}
  missing_values: {'PassengerId': {'missing_count': 0, 'missing_percent': 0.0}, 'Survived': {'missing_count': 0, 'missing_percent': 0.0}, 'Pclass': {'missing_count': 0, 'missing_percent': 0.0}, 'Name': {'missing_count': 0, 'missing_percent': 0.0}, 'Sex': {'missing_count': 0, 'missing_percent': 0.0}, 'Age': {'missing_count': 177, 'missing_percent': 19.87}, 'SibSp': {'missing_count': 0, 'missing_percent': 0.0}, 'Parch': {'missing_count': 0, 'missing_percent': 0.0}, 'Ticket': {'missing_count