In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Load the feature-engineered dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    'Data/feature_engineered_data_revisited.csv',
    index_col=0,
)

In [3]:


# Columns retained for modelling, grouped by role. The order here is the
# column order of the resulting frame.
columns_to_keep = (
    # Numeric / continuous
    ['Car_Age', 'Mileage_per_Year', 'Engine', 'Brand_Strength', 'Remaining_Life']
    # Categorical
    + [
        'Fuel type',
        'Body type',
        'Gearbox',
        'Emission Class',
        'Brand',
        'Lifecycle_Stage',
        'Depreciation_Phase',
    ]
    # Target
    + ['Log_Price']
)

# Keep only the selected columns; .copy() detaches the result from the loaded frame.
df = df.loc[:, columns_to_keep].copy()

print("\nRemaining columns:", list(df.columns))



Remaining columns: ['Car_Age', 'Mileage_per_Year', 'Engine', 'Brand_Strength', 'Remaining_Life', 'Fuel type', 'Body type', 'Gearbox', 'Emission Class', 'Brand', 'Lifecycle_Stage', 'Depreciation_Phase', 'Log_Price']


In [4]:


# One-hot encode the categorical columns listed below. drop_first=True drops
# the first level of each feature so its dummy columns are not perfectly collinear.
one_hot_cols = [
    'Fuel type',
    'Body type',
    'Gearbox',
    'Emission Class',
    'Lifecycle_Stage',
    'Depreciation_Phase'
]

df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)

# Target-encode Brand using TRAINING rows only.
# Averaging Log_Price over the whole frame would let the held-out rows' targets
# leak into a model feature. The row positions below are drawn with the same
# test_size / random_state as the train/test split performed later in this
# notebook; for a frame of the same length that yields the same partition, so
# these are the rows that end up in X_train. Keep the two calls in sync.
train_pos, _ = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    random_state=23
)
train_rows = df.iloc[train_pos]

brand_target_mean = train_rows.groupby('Brand')['Log_Price'].mean()

# Brands that never occur in the training rows (or are missing) have no
# per-brand mean; they fall back to the overall training mean of Log_Price.
df['Brand_Target_Enc'] = (
    df['Brand']
    .map(brand_target_mean)
    .fillna(train_rows['Log_Price'].mean())
)

# Reassign instead of inplace=True so the drop is explicit.
df = df.drop(columns=['Brand'])

## Test/Train Split

In [5]:
# Separate the target column (Log_Price) from the feature matrix.
y = df['Log_Price']
X = df.drop('Log_Price', axis=1)

In [6]:

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23, test_size=0.2)


In [7]:
# Persist the four split objects as a single tuple, in the order
# (X_train, X_test, y_train, y_test).
with open("Data/train_test_split_revisited.pkl", mode="wb") as split_file:
    pickle.dump((X_train, X_test, y_train, y_test), split_file)