In [None]:
import json  # not used in this cell; kept because a later cell calls json.load
from pathlib import Path

import pandas as pd

# Purpose of this cell: load the processed train/test feature tables, remove
# the engineered feature columns listed in `cols_to_drop`, and write the
# reduced tables to ../data/processed_v2/.
train = pd.read_csv("../data/processed/train_features.csv")
test = pd.read_csv("../data/processed/test_features.csv")

# -------------------------------------------------------------
# 1. DEFINE THE DROP LIST *FIRST*
# -------------------------------------------------------------
cols_to_drop = [
    # Categorical engineered features
    "Ticket_strong",
    "TicketPrefix",
    "CabinDeck",
    "AgeBand",
    "FareBand",
    "Sex_Pclass",
    "Sex_AgeBand",
    "Embarked_Pclass",
    "Title_AgeBand",
    "Pclass_FareBand",

    # Numeric engineered features
    # NOTE(review): "AgeMissing" was restored so this list matches both the
    # `keys_to_drop` list used for the metadata and the output recorded for
    # this cell (19 columns). If AgeMissing is meant to be kept as a feature,
    # remove it here and from `keys_to_drop` — confirm with the author.
    "AgeMissing",
    "FareMissing",
    "CabinKnown",
    "FamilySize",
    "IsAlone",
    "LogFare",
    "FarePerPerson",
    "TicketGroupSize",
    "TicketGroupFarePerPerson"
]

# -------------------------------------------------------------
# 2. NOW compute how many actually exist in the dataframe
# -------------------------------------------------------------
# The train frame is the reference: only columns present there are dropped.
actual_dropped = [c for c in cols_to_drop if c in train.columns]

print(f"Requested to drop: {len(cols_to_drop)}")
print(f"Actually dropped:  {len(actual_dropped)}")
print("Columns that will be dropped:")
for c in actual_dropped:
    print(f"  - {c}")

# A column found in train may be absent from test; report it instead of
# letting DataFrame.drop raise KeyError halfway through the cell.
missing_in_test = [c for c in actual_dropped if c not in test.columns]
if missing_in_test:
    print(f"Warning: columns not present in test (skipped there): {missing_in_test}")

# -------------------------------------------------------------
# 3. Drop the columns
# -------------------------------------------------------------
train = train.drop(columns=actual_dropped)
# errors="ignore" skips the columns reported in the warning above.
test = test.drop(columns=actual_dropped, errors="ignore")

# -------------------------------------------------------------
# 4. Save cleaned data
# -------------------------------------------------------------
# Create the output directory first so the cell also works on a fresh checkout.
out_dir = Path("../data/processed_v2")
out_dir.mkdir(parents=True, exist_ok=True)

train.to_csv(out_dir / "train_features.csv", index=False)
test.to_csv(out_dir / "test_features.csv", index=False)

print("\nSaved cleaned datasets to processed_v2/")

Requested to drop: 19
Actually dropped:  19
Columns that will be dropped:
  - Ticket_strong
  - TicketPrefix
  - CabinDeck
  - AgeBand
  - FareBand
  - Sex_Pclass
  - Sex_AgeBand
  - Embarked_Pclass
  - Title_AgeBand
  - Pclass_FareBand
  - AgeMissing
  - FareMissing
  - CabinKnown
  - FamilySize
  - IsAlone
  - LogFare
  - FarePerPerson
  - TicketGroupSize
  - TicketGroupFarePerPerson

Saved cleaned datasets to processed_v2/


In [3]:
import json

# Metadata entries to strip: the engineered features, categorical ones first
# and numeric ones second.
keys_to_drop = [
    # Categorical engineered features
    "Ticket_strong", "TicketPrefix", "CabinDeck", "AgeBand", "FareBand",
    "Sex_Pclass", "Sex_AgeBand", "Embarked_Pclass", "Title_AgeBand",
    "Pclass_FareBand",
    # Numeric engineered features
    "AgeMissing", "FareMissing", "CabinKnown", "FamilySize", "IsAlone",
    "LogFare", "FarePerPerson", "TicketGroupSize", "TicketGroupFarePerPerson",
]

# Read the metadata that accompanies the original processed data.
with open("../data/processed/processed_metadata.json") as f:
    meta = json.load(f)

# Split the requested keys into those present in the metadata and those that
# are not, then delete the present ones from the loaded mapping.
removed = [name for name in keys_to_drop if name in meta]
missing = [name for name in keys_to_drop if name not in meta]
for name in removed:
    meta.pop(name)

# Write the trimmed metadata next to the cleaned datasets.
with open("../data/processed_v2/processed_metadata.json", "w") as f:
    json.dump(meta, f, indent=4)

# Summary report: counts first, then each non-empty group of keys.
print(f"Requested to remove: {len(keys_to_drop)} metadata keys")
print(f"Actually removed:    {len(removed)}")
print(f"Missing (not found): {len(missing)}\n")

report_sections = (
    ("Removed keys:", removed),
    ("\nKeys not found (already removed or never existed):", missing),
)
for heading, names in report_sections:
    if names:
        print(heading)
        print("\n".join(f"  - {name}" for name in names))

print("\nUpdated metadata saved to processed_v2.")

Requested to remove: 19 metadata keys
Actually removed:    10
Missing (not found): 9

Removed keys:
  - Ticket_strong
  - TicketPrefix
  - CabinDeck
  - AgeBand
  - FareBand
  - Sex_Pclass
  - Sex_AgeBand
  - Embarked_Pclass
  - Title_AgeBand
  - Pclass_FareBand

Keys not found (already removed or never existed):
  - AgeMissing
  - FareMissing
  - CabinKnown
  - FamilySize
  - IsAlone
  - LogFare
  - FarePerPerson
  - TicketGroupSize
  - TicketGroupFarePerPerson

Updated metadata saved to processed_v2.


In [4]:
# Read the processed_v2 artefacts back from disk: the metadata JSON and the
# train/test feature tables.
with open("../data/processed_v2/processed_metadata.json") as f:
    meta_v2 = json.load(f)

train_v2 = pd.read_csv("../data/processed_v2/train_features.csv")
test_v2 = pd.read_csv("../data/processed_v2/test_features.csv")

# Show which keys are left in the metadata.
print("Metadata keys:", list(meta_v2))

Metadata keys: ['Sex', 'Embarked', 'Title']


In [5]:
# Cross-check: the object/category columns of train_v2 against the keys of the
# processed_v2 metadata.
categorical_frame = train_v2.select_dtypes(include=["object", "category"])
cat_cols_in_data = categorical_frame.columns.tolist()

print("\nCategorical columns in train_v2:")
print(cat_cols_in_data)

print("\nMetadata keys (from processed_v2/processed_metadata.json):")
print(list(meta_v2.keys()))

# Set differences in both directions expose any mismatch between the two.
data_col_set = set(cat_cols_in_data)
meta_key_set = set(meta_v2.keys())
missing_in_meta = data_col_set.difference(meta_key_set)
extra_in_meta = meta_key_set.difference(data_col_set)

print("\nCategorical columns missing from metadata:", missing_in_meta)
print("Metadata keys not present as columns:", extra_in_meta)


Categorical columns in train_v2:
['Sex', 'Embarked', 'Title']

Metadata keys (from processed_v2/processed_metadata.json):
['Sex', 'Embarked', 'Title']

Categorical columns missing from metadata: set()
Metadata keys not present as columns: set()
