In [1]:
# CELL 1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# OneHotEncoder and ColumnTransformer will be used later if a mixed-type pipeline is built,
# but for now, pd.get_dummies is simpler to illustrate.
from sklearn.impute import SimpleImputer

import torch

# Load the raw dataset again for a clean start in this notebook
file_path = '../data/raw/used_cars.csv' # Make sure this path is correct
try:
    df = pd.read_csv(file_path)
    print("Raw dataset loaded successfully for feature engineering.")
except FileNotFoundError:
    print(f"Error: Dataset file not found at {file_path}.")
    df = pd.DataFrame() # Create an empty DataFrame

if not df.empty:
    # 1. Re-clean 'price' column (as done successfully in the previous notebook)
    if 'price' in df.columns and df['price'].dtype == 'object':
        df_temp_price = df['price'].astype(str)
        df_temp_price = df_temp_price.str.replace('$', '', regex=False)
        df_temp_price = df_temp_price.str.replace(',', '', regex=False)
        df['price'] = pd.to_numeric(df_temp_price, errors='coerce')
        print(f"'price' column cleaned. Dtype: {df['price'].dtype}, NaNs: {df['price'].isnull().sum()}")

    # 2. Correctly clean 'milage' column
    # Based on your df.head(), format is like '51,000 mi.'
    if 'milage' in df.columns and df['milage'].dtype == 'object':
        df_temp_milage = df['milage'].astype(str)
        df_temp_milage = df_temp_milage.str.replace(' mi.', '', regex=False) # Remove " mi."
        df_temp_milage = df_temp_milage.str.replace(',', '', regex=False)   # Remove commas
        df['milage'] = pd.to_numeric(df_temp_milage, errors='coerce')
        print(f"'milage' column cleaned. Dtype: {df['milage'].dtype}, NaNs: {df['milage'].isnull().sum()}")

    # 3. Re-handle missing 'fuel_type' (as done in previous notebook)
    if 'fuel_type' in df.columns and df['fuel_type'].isnull().any():
        mode_fuel_type = df['fuel_type'].mode()[0]
        df['fuel_type'] = df['fuel_type'].fillna(mode_fuel_type)
        print(f"'fuel_type' NaNs filled with mode: '{mode_fuel_type}'")

    print("\n--- df.info() after initial re-cleaning in new notebook ---")
    df.info()
    print("\n--- Head of df after initial re-cleaning ---")
    print(df.head())

Raw dataset loaded successfully for feature engineering.
'price' column cleaned. Dtype: int64, NaNs: 0
'milage' column cleaned. Dtype: int64, NaNs: 0
'fuel_type' NaNs filled with mode: 'Gasoline'

--- df.info() after initial re-cleaning in new notebook ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   brand         4009 non-null   object
 1   model         4009 non-null   object
 2   model_year    4009 non-null   int64 
 3   milage        4009 non-null   int64 
 4   fuel_type     4009 non-null   object
 5   engine        4009 non-null   object
 6   transmission  4009 non-null   object
 7   ext_col       4009 non-null   object
 8   int_col       4009 non-null   object
 9   accident      3896 non-null   object
 10  clean_title   3413 non-null   object
 11  price         4009 non-null   int64 
dtypes: int64(3), object(9)
memory usage: 376.0+

In [2]:
# CELL 2
if not df.empty:
    # 1. Create 'car_age'
    if 'model_year' in df.columns and pd.api.types.is_numeric_dtype(df['model_year']):
        ref_year = 2025 # Project reference year
        df['car_age'] = ref_year - df['model_year']
        print("\n'car_age' feature created.")
    else:
        print("\nCould not create 'car_age'. 'model_year' column missing or not numeric.")

    # 2. Inspect 'engine' column for further processing
    if 'engine' in df.columns:
        print("\n--- Inspecting 'engine' column ---")
        print("Unique 'engine' values (sample):")
        print(df['engine'].value_counts().head(10))
        # Based on these values, you'll decide how to extract features (e.g., displacement, HP)
        # or how to categorize/encode it. This might involve string splitting, regex, etc.
        # For now, we'll assume it will be treated as a high-cardinality categorical or simplified later.
    
    print("\n--- df.info() after feature engineering ---")
    df.info()
    print("\n--- Head of df after feature engineering ---")
    print(df[['model_year', 'car_age', 'engine']].head())


'car_age' feature created.

--- Inspecting 'engine' column ---
Unique 'engine' values (sample):
engine
2.0L I4 16V GDI DOHC Turbo                               52
355.0HP 5.3L 8 Cylinder Engine Gasoline Fuel             48
420.0HP 6.2L 8 Cylinder Engine Gasoline Fuel             47
–                                                        45
300.0HP 3.0L Straight 6 Cylinder Engine Gasoline Fuel    44
240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel             42
285.0HP 3.6L V6 Cylinder Engine Gasoline Fuel            40
5.7L V8 16V MPFI OHV                                     29
340.0HP 3.0L V6 Cylinder Engine Gasoline Fuel            28
3.6L V6 24V MPFI DOHC                                    28
Name: count, dtype: int64

--- df.info() after feature engineering ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   brand         4009 non-null  

In [3]:
##CELL 3
if not df.empty:
    # Drop rows where 'price' or 'milage' are NaN after cleaning attempts
    # as these are critical.
    cols_to_check_for_nan_drop = ['price', 'milage']
    if all(col in df.columns for col in cols_to_check_for_nan_drop):
        original_rows = len(df)
        df.dropna(subset=cols_to_check_for_nan_drop, inplace=True)
        print(f"\nDropped {original_rows - len(df)} rows due to NaNs in 'price' or 'milage'.")
    
    # Handle missing values for 'accident' and 'clean_title' (e.g., with mode or 'Unknown')
    for col in ['accident', 'clean_title']:
        if col in df.columns and df[col].isnull().any():
            # Option 1: Fill with mode
            mode_val = df[col].mode()[0]
            df[col] = df[col].fillna(mode_val)
            print(f"Missing values in '{col}' imputed with mode: '{mode_val}'")
            # Option 2: Fill with a specific category like 'Unknown'
            # df[col] = df[col].fillna('Unknown')
            # print(f"Missing values in '{col}' imputed with 'Unknown'")

    print("\n--- df.info() after handling remaining missing values ---")
    df.info()
    print("\nMissing values count after all NaN handling:")
    print(df.isnull().sum())


Dropped 0 rows due to NaNs in 'price' or 'milage'.
Missing values in 'accident' imputed with mode: 'None reported'
Missing values in 'clean_title' imputed with mode: 'Yes'

--- df.info() after handling remaining missing values ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   brand         4009 non-null   object
 1   model         4009 non-null   object
 2   model_year    4009 non-null   int64 
 3   milage        4009 non-null   int64 
 4   fuel_type     4009 non-null   object
 5   engine        4009 non-null   object
 6   transmission  4009 non-null   object
 7   ext_col       4009 non-null   object
 8   int_col       4009 non-null   object
 9   accident      4009 non-null   object
 10  clean_title   4009 non-null   object
 11  price         4009 non-null   int64 
 12  car_age       4009 non-null   int64 
dtypes: int64(4), object(9)
mem

In [4]:
# CELL 4
if not df.empty:
    # 1. Process 'accident' column (example: map to binary)
    if 'accident' in df.columns:
        print("\n--- Processing 'accident' column ---")
        print("Original 'accident' values:")
        print(df['accident'].value_counts(dropna=False))
        # Example: If values are like "At least 1 accident..." vs "None reported"
        # Create a binary 'has_accident' column (1 if accident, 0 otherwise)
        df['has_accident'] = df['accident'].apply(lambda x: 0 if str(x).lower() == 'none reported' else 1)
        print("'has_accident' (binary) feature created.")
        # df.drop('accident', axis=1, inplace=True) # Optionally drop original

    # 2. Process 'clean_title' column (example: map to binary)
    if 'clean_title' in df.columns:
        print("\n--- Processing 'clean_title' column ---")
        print("Original 'clean_title' values:")
        print(df['clean_title'].value_counts(dropna=False))
        # Example: If values are 'Yes', 'No'
        df['has_clean_title'] = df['clean_title'].apply(lambda x: 1 if str(x).lower() == 'yes' else 0)
        print("'has_clean_title' (binary) feature created.")
        # df.drop('clean_title', axis=1, inplace=True) # Optionally drop original

    # 3. Process 'engine' column (example: simplify or keep as is for OHE)
    if 'engine' in df.columns:
        print("\n--- Further processing/simplifying 'engine' column (example) ---")
        # If you couldn't extract reliable numerical features, you might:
        # a) Keep it as is if the number of unique values is manageable for OHE.
        # b) Simplify it: e.g., extract first few words, or group less frequent categories.
        # For this example, we'll assume it's kept as 'engine' for now for OHE.
        # If you created numerical engine features, ensure they are numeric and handle their NaNs.
        pass # Add your simplification logic if needed

    print("\n--- df.info() after specific categorical processing ---")
    df.info()
    print(df[['has_accident', 'has_clean_title', 'engine']].head(10)) # Check new columns


--- Processing 'accident' column ---
Original 'accident' values:
accident
None reported                             3023
At least 1 accident or damage reported     986
Name: count, dtype: int64
'has_accident' (binary) feature created.

--- Processing 'clean_title' column ---
Original 'clean_title' values:
clean_title
Yes    4009
Name: count, dtype: int64
'has_clean_title' (binary) feature created.

--- Further processing/simplifying 'engine' column (example) ---

--- df.info() after specific categorical processing ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   brand            4009 non-null   object
 1   model            4009 non-null   object
 2   model_year       4009 non-null   int64 
 3   milage           4009 non-null   int64 
 4   fuel_type        4009 non-null   object
 5   engine           4009 non-null   object
 6   tra

In [5]:
# CELL 5
if not df.empty:
    df_to_encode = df.copy() # Work on a copy

    # Identify categorical columns for one-hot encoding
    # This will include 'brand', 'model', 'fuel_type_cleaned', 'transmission',
    # 'ext_col', 'int_col', and the processed 'engine' if it's still categorical.
    # 'accident' and 'clean_title' might have been converted to binary (0/1) already.
    # If so, exclude them here if they are already numeric.

    categorical_cols = df_to_encode.select_dtypes(include=['object']).columns.tolist()
    
    # If 'accident' and 'clean_title' were original text columns you want to drop now
    # after creating binary versions, do it before get_dummies:
    # cols_to_drop_before_ohe = ['accident', 'clean_title', 'model_year'] # also model_year if car_age is used
    # df_to_encode.drop(columns=cols_to_drop_before_ohe, axis=1, inplace=True, errors='ignore')
    # Re-identify categoricals if columns were dropped
    # categorical_cols = df_to_encode.select_dtypes(include=['object']).columns.tolist()


    if categorical_cols:
        print(f"\nCategorical columns to be one-hot encoded: {categorical_cols}")
        df_processed = pd.get_dummies(df_to_encode, columns=categorical_cols, drop_first=True, dummy_na=False)
        print(f"Shape after one-hot encoding: {df_processed.shape}")
        print("DataFrame head after encoding:")
        print(df_processed.head())
    else:
        print("\nNo categorical columns found for one-hot encoding.")
        df_processed = df_to_encode # If no object columns left

    # At this point, df_processed should ideally contain only numeric types (int, float, bool/uint8 from dummies)
    # plus your target 'price'.
    print("\n--- Data types in df_processed (should be numeric + target) ---")
    print(df_processed.dtypes.value_counts())
    df_processed.info() # Final check before splitting


Categorical columns to be one-hot encoded: ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
Shape after one-hot encoding: (4009, 3645)
DataFrame head after encoding:
   model_year  milage  price  car_age  has_accident  has_clean_title  \
0        2013   51000  10300       12             1                1   
1        2021   34742  38005        4             1                1   
2        2022   22372  54598        3             0                1   
3        2015   88900  15500       10             0                1   
4        2021    9835  34999        4             0                1   

   brand_Alfa  brand_Aston  brand_Audi  brand_BMW  ...  int_col_Tupelo  \
0       False        False       False      False  ...           False   
1       False        False       False      False  ...           False   
2       False        False       False      False  ...           False   
3       False        False       False      Fa

In [6]:
# CELL 6
if 'df_processed' in locals() and not df_processed.empty and 'price' in df_processed.columns:
    y = df_processed['price']
    # Drop original columns if new features were created from them and originals are no longer needed
    # e.g., if 'has_accident' exists, drop original 'accident' string column if it's still there.
    # Ensure 'model_year' is dropped if 'car_age' is used and 'model_year' itself isn't a desired feature.
    columns_to_drop_for_X = ['price', 'model_year', 'accident', 'clean_title'] # Add any other originals
    
    X = df_processed.drop(columns=[col for col in columns_to_drop_for_X if col in df_processed.columns], axis=1)

    print(f"\nTarget variable 'y' (price) created. Shape: {y.shape}")
    print(f"Features 'X' created. Shape: {X.shape}")

    # Final check: Ensure all columns in X are numeric
    non_numeric_cols_in_X = X.select_dtypes(exclude=np.number).columns
    if len(non_numeric_cols_in_X) > 0:
        print(f"WARNING: X still contains non-numeric columns: {non_numeric_cols_in_X.tolist()}")
        print("These MUST be dropped or further processed before scaling/modeling.")
        # X = X.select_dtypes(include=np.number) # Force drop
    else:
        print("All columns in X are numeric and ready for scaling/training.")
    
    print("\nFirst 5 rows of X:")
    print(X.head())
else:
    print("\n'df_processed' not available or 'price' column missing. Cannot define X and y.")


Target variable 'y' (price) created. Shape: (4009,)
Features 'X' created. Shape: (4009, 3643)
These MUST be dropped or further processed before scaling/modeling.

First 5 rows of X:
   milage  car_age  has_accident  has_clean_title  brand_Alfa  brand_Aston  \
0   51000       12             1                1       False        False   
1   34742        4             1                1       False        False   
2   22372        3             0                1       False        False   
3   88900       10             0                1       False        False   
4    9835        4             0                1       False        False   

   brand_Audi  brand_BMW  brand_Bentley  brand_Bugatti  ...  int_col_Tupelo  \
0       False      False          False          False  ...           False   
1       False      False          False          False  ...           False   
2       False      False          False          False  ...           False   
3       False      False        

In [7]:
# CELL 7
if 'X' in locals() and 'y' in locals() and not X.empty:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Identify numerical features to scale from X_train (should be all columns in X by now)
    numerical_features_in_X = X_train.columns.tolist() # Assuming X contains only numeric features now

    if numerical_features_in_X:
        print(f"\nNumerical features to scale (all columns in X): {len(numerical_features_in_X)} features")
        scaler = StandardScaler()
        
        # Create copies to avoid SettingWithCopyWarning
        X_train_scaled = X_train.copy()
        X_test_scaled = X_test.copy()

        X_train_scaled[numerical_features_in_X] = scaler.fit_transform(X_train[numerical_features_in_X])
        X_test_scaled[numerical_features_in_X] = scaler.transform(X_test[numerical_features_in_X])
        
        print("Train and test data split, and numerical features scaled.")
        print("X_train_scaled shape:", X_train_scaled.shape)
        print("X_test_scaled shape:", X_test_scaled.shape)
    else:
        print("\nNo numerical features found in X_train for scaling. Using unscaled data.")
        X_train_scaled = X_train.copy()
        X_test_scaled = X_test.copy()
else:
    print("\nX and/or y not available or X is empty. Cannot split and scale data.")


Numerical features to scale (all columns in X): 3643 features
Train and test data split, and numerical features scaled.
X_train_scaled shape: (3207, 3643)
X_test_scaled shape: (802, 3643)


In [8]:
# CELL 8
if 'X_train_scaled' in locals():
    X_train_np = X_train_scaled.values
    y_train_np = y_train.values.reshape(-1, 1)
    X_test_np = X_test_scaled.values
    y_test_np = y_test.values.reshape(-1, 1)

    print("\nData converted to NumPy arrays:")
    print("X_train_np shape:", X_train_np.shape)
    print("y_train_np shape:", y_train_np.shape)
else:
    print("\nScaled training/testing data not available for NumPy conversion.")


Data converted to NumPy arrays:
X_train_np shape: (3207, 3643)
y_train_np shape: (3207, 1)


In [9]:
# CELL 9
if 'X_train_np' in locals():
    X_train_tensor = torch.tensor(X_train_np, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_np, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test_np, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test_np, dtype=torch.float32)

    print("\nData converted to PyTorch Tensors:")
    print("X_train_tensor shape:", X_train_tensor.shape, "dtype:", X_train_tensor.dtype)
    print("y_train_tensor shape:", y_train_tensor.shape, "dtype:", y_train_tensor.dtype)
else:
    print("\nNumPy arrays not available for PyTorch Tensor conversion.")


Data converted to PyTorch Tensors:
X_train_tensor shape: torch.Size([3207, 3643]) dtype: torch.float32
y_train_tensor shape: torch.Size([3207, 1]) dtype: torch.float32


In [10]:
# CELL 10
if 'X_train_tensor' in locals():
    import os
    processed_data_path = 'data/processed_tensors' # Ensure this path aligns with your project structure
    os.makedirs(processed_data_path, exist_ok=True)

    torch.save(X_train_tensor, f'{processed_data_path}/X_train_tensor.pt')
    torch.save(y_train_tensor, f'{processed_data_path}/y_train_tensor.pt')
    torch.save(X_test_tensor, f'{processed_data_path}/X_test_tensor.pt')
    torch.save(y_test_tensor, f'{processed_data_path}/y_test_tensor.pt')
    print(f"\nPyTorch Tensors saved to '{processed_data_path}' directory.")
else:
    print("\nTensors not available for saving.")


PyTorch Tensors saved to 'data/processed_tensors' directory.
