## data loading

In [1]:

import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# set options for display
pd.set_option("display.max_columns", 120)

# possible data directories
POSSIBLE_DIRS = [
    "/kaggle/input/spaceship-titanic",
    "from_kaggle",
    "/kaggle/working",
    "/kaggle/input",
]

def find_raw_paths():
    for d in POSSIBLE_DIRS:
        train_candidate = os.path.join(d, "train.csv")
        test_candidate = os.path.join(d, "test.csv")
        if os.path.exists(train_candidate) and os.path.exists(test_candidate):
            return train_candidate, test_candidate
    raise FileNotFoundError("could not find train.csv and test.csv in known locations")

train_path, test_path = find_raw_paths()
print("using train path:", train_path)
print("using test  path:", test_path)

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
print("train shape:", train.shape)
print("test shape :", test.shape)
train.head()


using train path: from_kaggle/train.csv
using test  path: from_kaggle/test.csv
train shape: (8693, 14)
test shape : (4277, 13)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


## combine train and test

In [2]:

target_col = "Transported"

# ensure transported is 0/1 numeric in train
if train[target_col].dtype == bool:
    train[target_col] = train[target_col].astype(int)
elif train[target_col].dtype == object:
    mapping = {"True": 1, "False": 0, "true": 1, "false": 0}
    train[target_col] = train[target_col].map(mapping).astype(int)
else:
    train[target_col] = train[target_col].astype(int)

train["is_train"] = 1
test["is_train"] = 0

if target_col not in test.columns:
    test[target_col] = np.nan

full = pd.concat([train, test], ignore_index=True)
print("full shape:", full.shape)
full.head()


full shape: (12970, 15)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,is_train
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0.0,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1.0,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0.0,1
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0.0,1
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1.0,1


## basic feature engineering

In [3]:
# split passenger id into group and number
def split_passenger_id(df):
    pid = df["PassengerId"].astype(str).str.split("_", expand=True)
    df["PassengerGroup"] = pid[0]
    df["PassengerNumber"] = pd.to_numeric(pid[1], errors="coerce")

# split cabin into deck / number / side
def split_cabin(df):
    cabin_parts = df["Cabin"].astype(str).str.split("/", expand=True)
    df["CabinDeck"] = cabin_parts[0].replace("nan", np.nan)
    df["CabinNum"] = pd.to_numeric(cabin_parts[1], errors="coerce")
    df["CabinSide"] = cabin_parts[2].replace("nan", np.nan)

# split name into given names and surname
def split_name(df):
    names = df["Name"].fillna("Unknown Unknown")
    name_parts = names.str.rsplit(" ", n=1, expand=True)
    df["GivenNames"] = name_parts[0]
    df["Surname"] = name_parts[1]

split_passenger_id(full)
split_cabin(full)
split_name(full)

# spend features
spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
full["TotalSpend"] = full[spend_cols].sum(axis=1)

for col in spend_cols + ["TotalSpend"]:
    full[f"{col}_log"] = np.log1p(full[col])

full[["PassengerId", "PassengerGroup", "PassengerNumber", "Cabin", "CabinDeck", "CabinNum", "CabinSide", "Name", "GivenNames", "Surname", "TotalSpend"]].head()


Unnamed: 0,PassengerId,PassengerGroup,PassengerNumber,Cabin,CabinDeck,CabinNum,CabinSide,Name,GivenNames,Surname,TotalSpend
0,0001_01,1,1,B/0/P,B,0.0,P,Maham Ofracculy,Maham,Ofracculy,0.0
1,0002_01,2,1,F/0/S,F,0.0,S,Juanna Vines,Juanna,Vines,736.0
2,0003_01,3,1,A/0/S,A,0.0,S,Altark Susent,Altark,Susent,10383.0
3,0003_02,3,2,A/0/S,A,0.0,S,Solam Susent,Solam,Susent,5176.0
4,0004_01,4,1,F/1/S,F,1.0,S,Willy Santantines,Willy,Santantines,1091.0


## Improved feature engineering

In [4]:
# group level features
full["group_size"] = full.groupby("PassengerGroup")["PassengerGroup"].transform("count")
full["group_total_spend"] = full.groupby("PassengerGroup")["TotalSpend"].transform("sum")
full["group_avg_spend"] = full.groupby("PassengerGroup")["TotalSpend"].transform("mean")

# group cryosleep fraction
full["group_cryo_frac"] = full.groupby("PassengerGroup")["CryoSleep"].transform(lambda s: s.astype(float).mean())

# surname features
full["surname_size"] = full.groupby("Surname")["Surname"].transform("count")
full["surname_avg_spend"] = full.groupby("Surname")["TotalSpend"].transform("mean")

# spend pattern indicators
full["spent_any"] = (full["TotalSpend"] > 0).astype(int)
full["spent_room"] = (full["RoomService"] > 0).astype(int)
full["spent_food"] = (full["FoodCourt"] > 0).astype(int)
full["spent_shopping"] = (full["ShoppingMall"] > 0).astype(int)
full["spent_spa"] = (full["Spa"] > 0).astype(int)
full["spent_vr"] = (full["VRDeck"] > 0).astype(int)

# deck / side combination
full["deck_side"] = (full["CabinDeck"].fillna("UNK") + "_" + full["CabinSide"].fillna("UNK")).astype(str)

# age bining
full["age_bin"] = pd.cut(full["Age"], bins=[-1, 12, 18, 30, 50, 100], labels=False)

# cryosleep and zero spend interaction
full["cryo_zero_spend"] = ((full["CryoSleep"] == True) & (full["TotalSpend"] == 0)).astype(int)

full[["PassengerGroup", "group_size", "group_total_spend", "group_cryo_frac", "Surname", "surname_size", "spent_any", "deck_side", "age_bin", "cryo_zero_spend"]].head()


Unnamed: 0,PassengerGroup,group_size,group_total_spend,group_cryo_frac,Surname,surname_size,spent_any,deck_side,age_bin,cryo_zero_spend
0,1,1,0.0,0.0,Ofracculy,3,0,B_P,3.0,0
1,2,1,736.0,0.0,Vines,4,1,F_S,2.0,0
2,3,2,15559.0,0.0,Susent,7,1,A_S,4.0,0
3,3,2,15559.0,0.0,Susent,7,1,A_S,3.0,0
4,4,1,1091.0,0.0,Santantines,9,1,F_S,1.0,0


## missing flags

In [5]:

important_missing_cols = [
    "HomePlanet",
    "CryoSleep",
    "Cabin",
    "Destination",
    "Age",
] + spend_cols

for col in important_missing_cols:
    flag_col = f"{col}_was_missing"
    full[flag_col] = full[col].isna().astype(int)

missing_flag_cols = [c for c in full.columns if c.endswith("_was_missing")]
print("number of missingness flags:", len(missing_flag_cols))
full[missing_flag_cols].head()


number of missingness flags: 10


Unnamed: 0,HomePlanet_was_missing,CryoSleep_was_missing,Cabin_was_missing,Destination_was_missing,Age_was_missing,RoomService_was_missing,FoodCourt_was_missing,ShoppingMall_was_missing,Spa_was_missing,VRDeck_was_missing
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0


## Data Imputation

In [6]:
# identify types
cat_cols = full.select_dtypes(include="object").columns.tolist()
num_cols = full.select_dtypes(include=["int64", "float64", "bool"]).columns.tolist()

# do not treat helper cols as regular numeric for some operations
num_cols = [c for c in num_cols if c not in ["is_train"]]

print("categorical columns:", len(cat_cols))
print("numeric columns    :", len(num_cols))

# categorical mode imputation
for col in cat_cols:
    mode_val = full[col].mode(dropna=True)
    if len(mode_val) == 0:
        continue
    full[col] = full[col].fillna(mode_val[0])

# numeric median imputation
for col in num_cols:
    median_val = full[col].median()
    full[col] = full[col].fillna(median_val)

print("remaining missing values:", full.isna().sum().sum())


categorical columns: 13
numeric columns    : 40
remaining missing values: 0


## encoding

In [7]:

full_encoded = pd.get_dummies(full, columns=cat_cols, drop_first=False)
print("shape after encoding:", full_encoded.shape)
full_encoded.head()


shape after encoding: (12970, 50073)


Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,is_train,PassengerNumber,CabinNum,TotalSpend,RoomService_log,FoodCourt_log,ShoppingMall_log,Spa_log,VRDeck_log,TotalSpend_log,group_size,group_total_spend,group_avg_spend,group_cryo_frac,surname_size,surname_avg_spend,spent_any,spent_room,spent_food,spent_shopping,spent_spa,spent_vr,age_bin,cryo_zero_spend,HomePlanet_was_missing,CryoSleep_was_missing,Cabin_was_missing,Destination_was_missing,Age_was_missing,RoomService_was_missing,FoodCourt_was_missing,ShoppingMall_was_missing,Spa_was_missing,VRDeck_was_missing,PassengerId_0001_01,PassengerId_0002_01,PassengerId_0003_01,PassengerId_0003_02,PassengerId_0004_01,PassengerId_0005_01,PassengerId_0006_01,PassengerId_0006_02,PassengerId_0007_01,PassengerId_0008_01,PassengerId_0008_02,PassengerId_0008_03,PassengerId_0009_01,PassengerId_0010_01,PassengerId_0011_01,PassengerId_0012_01,PassengerId_0013_01,PassengerId_0014_01,PassengerId_0015_01,...,Surname_Wilkinner,Surname_Willangsey,Surname_Williotters,Surname_Willy,Surname_Willynnedy,Surname_Wilsoney,Surname_Win,Surname_Wincer,Surname_Windend,Surname_Wingcoling,Surname_Winie,Surname_Winsley,Surname_Wirdley,Surname_Wirybody,Surname_Wist,Surname_Wiste,Surname_Witalnerod,Surname_Witeronfus,Surname_Witicheal,Surname_Witive,Surname_Wolfaddox,Surname_Wolferguson,Surname_Wolfernan,Surname_Wolferton,Surname_Wolffy,Surname_Wonglasquez,Surname_Woodgezalez,Surname_Woodwardy,Surname_Woodwin,Surname_Woodwinez,Surname_Woodwinton,Surname_Woody,Surname_Wooes,Surname_Wooterston,Surname_Workmans,Surname_Workmanson,Surname_Wrempeedly,Surname_Wriggins,Surname_Wynneyerson,Surname_Yanton,Surname_Yatters,Surname_Yorkland,Surname_Youngrayes,deck_side_A_P,deck_side_A_S,deck_side_B_P,deck_side_B_S,deck_side_C_P,deck_side_C_S,deck_side_D_P,deck_side_D_S,deck_side_E_P,deck_side_E_S,deck_side_F_P,deck_side_F_S,deck_side_G_P,deck_side_G_S,deck_side_T_P,deck_side_T_S,deck_side_UNK_UNK
0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,3,5725.333333,0,0,0,0,0,0,3.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,24.0,109.0,9.0,25.0,549.0,44.0,1.0,1,1,0.0,736.0,4.70048,2.302585,3.258097,6.309918,3.806662,6.602588,1,736.0,736.0,0.0,4,628.25,1,1,1,1,1,1,2.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,58.0,43.0,3576.0,0.0,6715.0,49.0,0.0,1,1,0.0,10383.0,3.78419,8.18228,0.0,8.812248,3.912023,9.248021,2,15559.0,7779.5,0.0,7,3854.571429,1,1,1,0,1,1,4.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,33.0,0.0,1283.0,371.0,3329.0,193.0,0.0,1,2,0.0,5176.0,0.0,7.157735,5.918894,8.110728,5.267858,8.551981,2,15559.0,7779.5,0.0,7,3854.571429,1,0,1,1,1,1,3.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,16.0,303.0,70.0,151.0,565.0,2.0,1.0,1,1,1.0,1091.0,5.717028,4.26268,5.023881,6.338594,1.098612,6.995766,1,1091.0,1091.0,0.0,9,520.222222,1,1,1,1,1,1,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


## Scaling

In [8]:
scaler = StandardScaler()

numeric_for_scaling = []
for c in full_encoded.columns:
    if c in num_cols and c not in ["is_train", target_col]:
        numeric_for_scaling.append(c)
    elif c.endswith("_log"):
        numeric_for_scaling.append(c)

print("numeric features to scale:", len(numeric_for_scaling))

if numeric_for_scaling:
    full_encoded[numeric_for_scaling] = scaler.fit_transform(full_encoded[numeric_for_scaling])

full_encoded.head()


numeric features to scale: 39


Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,is_train,PassengerNumber,CabinNum,TotalSpend,RoomService_log,FoodCourt_log,ShoppingMall_log,Spa_log,VRDeck_log,TotalSpend_log,group_size,group_total_spend,group_avg_spend,group_cryo_frac,surname_size,surname_avg_spend,spent_any,spent_room,spent_food,spent_shopping,spent_spa,spent_vr,age_bin,cryo_zero_spend,HomePlanet_was_missing,CryoSleep_was_missing,Cabin_was_missing,Destination_was_missing,Age_was_missing,RoomService_was_missing,FoodCourt_was_missing,ShoppingMall_was_missing,Spa_was_missing,VRDeck_was_missing,PassengerId_0001_01,PassengerId_0002_01,PassengerId_0003_01,PassengerId_0003_02,PassengerId_0004_01,PassengerId_0005_01,PassengerId_0006_01,PassengerId_0006_02,PassengerId_0007_01,PassengerId_0008_01,PassengerId_0008_02,PassengerId_0008_03,PassengerId_0009_01,PassengerId_0010_01,PassengerId_0011_01,PassengerId_0012_01,PassengerId_0013_01,PassengerId_0014_01,PassengerId_0015_01,...,Surname_Wilkinner,Surname_Willangsey,Surname_Williotters,Surname_Willy,Surname_Willynnedy,Surname_Wilsoney,Surname_Win,Surname_Wincer,Surname_Windend,Surname_Wingcoling,Surname_Winie,Surname_Winsley,Surname_Wirdley,Surname_Wirybody,Surname_Wist,Surname_Wiste,Surname_Witalnerod,Surname_Witeronfus,Surname_Witicheal,Surname_Witive,Surname_Wolfaddox,Surname_Wolferguson,Surname_Wolfernan,Surname_Wolferton,Surname_Wolffy,Surname_Wonglasquez,Surname_Woodgezalez,Surname_Woodwardy,Surname_Woodwin,Surname_Woodwinez,Surname_Woodwinton,Surname_Woody,Surname_Wooes,Surname_Wooterston,Surname_Workmans,Surname_Workmanson,Surname_Wrempeedly,Surname_Wriggins,Surname_Wynneyerson,Surname_Yanton,Surname_Yatters,Surname_Yorkland,Surname_Youngrayes,deck_side_A_P,deck_side_A_S,deck_side_B_P,deck_side_B_S,deck_side_C_P,deck_side_C_S,deck_side_D_P,deck_side_D_S,deck_side_E_P,deck_side_E_S,deck_side_F_P,deck_side_F_S,deck_side_G_P,deck_side_G_S,deck_side_T_P,deck_side_T_S,deck_side_UNK_UNK
0,0.720932,-0.34029,-0.281822,-0.292365,-0.269707,-0.2571,0.0,1,-0.490655,-1.181321,-0.510541,-0.640372,-0.648903,-0.624889,-0.663234,-0.63481,-1.151868,-0.648668,-0.552517,-0.614589,-0.897587,-0.25043,2.503511,-1.173356,-0.717026,-0.73324,-0.708293,-0.760297,-0.722704,0.796006,-0.738967,-0.150696,-0.156482,-0.153614,-0.146907,-0.145808,-0.143865,-0.150964,-0.155444,-0.149622,-0.145255,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,-0.332557,-0.170439,-0.276082,-0.249566,0.22104,-0.219449,1.0,1,-0.490655,-1.181321,-0.248363,1.086538,0.136898,0.641264,1.620304,0.755227,0.638416,-0.648668,-0.413575,-0.29898,-0.897587,-0.227078,-0.469525,0.852256,1.39465,1.36381,1.411845,1.315276,1.383693,-0.162423,-0.738967,-0.150696,-0.156482,-0.153614,-0.146907,-0.145808,-0.143865,-0.150964,-0.155444,-0.149622,-0.145255,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,2.05535,-0.273285,1.998823,-0.292365,5.732776,-0.21517,0.0,1,-0.490655,-1.181321,3.188082,0.749902,2.143455,-0.624889,2.525889,0.7937,1.355722,-0.014569,2.384707,2.721391,-0.897587,-0.157023,1.41233,0.852256,1.39465,1.36381,-0.708293,1.315276,1.383693,1.754436,-0.738967,-0.150696,-0.156482,-0.153614,-0.146907,-0.145808,-0.143865,-0.150964,-0.155444,-0.149622,-0.145255,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0.299536,-0.34029,0.536429,0.342766,2.706059,-0.091947,0.0,1,0.468615,-1.181321,1.333249,-0.640372,1.79381,1.675295,2.272011,1.288796,1.166992,-0.014569,2.384707,2.721391,-0.897587,-0.157023,1.41233,0.852256,-0.717026,1.36381,1.411845,1.315276,1.383693,0.796006,-0.738967,-0.150696,-0.156482,-0.153614,-0.146907,-0.145808,-0.143865,-0.150964,-0.155444,-0.149622,-0.145255,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,-0.894417,0.131863,-0.237179,-0.033861,0.235342,-0.255389,1.0,1,-0.490655,-1.179351,-0.121906,1.460007,0.805817,1.327477,1.630682,-0.233642,0.745026,-0.648668,-0.346558,-0.14675,-0.897587,-0.11032,-0.532536,0.852256,1.39465,1.36381,1.411845,1.315276,1.383693,-1.120853,-0.738967,-0.150696,-0.156482,-0.153614,-0.146907,-0.145808,-0.143865,-0.150964,-0.155444,-0.149622,-0.145255,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


## split and save

In [10]:

train_clean_v2 = full_encoded[full_encoded["is_train"] == 1].copy()
test_clean_v2 = full_encoded[full_encoded["is_train"] == 0].copy()

train_clean_v2.drop(columns=["is_train"], inplace=True)
test_clean_v2.drop(columns=["is_train"], inplace=True)

if target_col in test_clean_v2.columns:
    test_clean_v2.drop(columns=[target_col], inplace=True)

print("train_clean_v2 shape:", train_clean_v2.shape)
print("test_clean_v2 shape :", test_clean_v2.shape)

output_dir = "from_kaggle/cleaned_data"
train_out = os.path.join(output_dir, "train_clean_v2.csv")
test_out = os.path.join(output_dir, "test_clean_v2.csv")

train_clean_v2.to_csv(train_out, index=False)
test_clean_v2.to_csv(test_out, index=False)

print("saved train_clean_v2 to:", train_out)
print("saved test_clean_v2  to:", test_out)


train_clean_v2 shape: (8693, 50072)
test_clean_v2 shape : (4277, 50071)
saved train_clean_v2 to: from_kaggle/cleaned_data/train_clean_v2.csv
saved test_clean_v2  to: from_kaggle/cleaned_data/test_clean_v2.csv
