# Feature Engineering

## Set Up

In [1]:
# Importing libraries 
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
# Load the feature-engineered dataset; the first CSV column becomes the row index.
df = pd.read_csv(
    'Data/feature_engineered_data.csv',
    index_col=0,
)

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,title,Price,Mileage(miles),Registration_Year,Previous Owners,Fuel type,Body type,Engine,Gearbox,Doors,...,Owners_per_Year,Price_per_Seat,Price_per_Year_Age,Door_Category,Is_Family_Car,Brand_Avg_Price,Model_Avg_Price,Engine_per_Seat,Car_Age_Squared,Premium_Age
0,SKODA Fabia,6900,70189,2016,3,Diesel,Hatchback,1.4,Manual,5.0,...,0.428571,1380.0,862.5,Family,1,5569.619048,5444.837838,0.28,49,0
1,Vauxhall Corsa,1495,88585,2008,4,Petrol,Hatchback,1.2,Manual,3.0,...,0.266667,299.0,93.4375,Small,1,4633.532967,4975.198198,0.24,225,0
2,Hyundai i30,949,137000,2011,3,Petrol,Hatchback,1.4,Manual,5.0,...,0.25,189.8,73.0,Family,1,4997.11828,3320.52381,0.28,144,0
3,MINI Hatch,2395,96731,2010,5,Petrol,Hatchback,1.4,Manual,3.0,...,0.384615,598.75,171.071429,Small,0,6480.175258,6124.846154,0.35,169,0
4,Vauxhall Corsa,1000,85000,2013,3,Diesel,Hatchback,1.3,Manual,5.0,...,0.3,200.0,90.909091,Family,1,4633.532967,4975.198198,0.26,100,0


In [4]:
# Full column inventory as a plain Python list.
df.columns.tolist()

['title',
 'Price',
 'Mileage(miles)',
 'Registration_Year',
 'Previous Owners',
 'Fuel type',
 'Body type',
 'Engine',
 'Gearbox',
 'Doors',
 'Seats',
 'Emission Class',
 'Has_Service_History',
 'Mileage',
 'Car_Age',
 'Engine_Bin',
 'Mileage_per_Year',
 'Log_Price',
 'Log_Mileage',
 'Age_Band',
 'title_lower',
 'Is_Premium',
 'Brand',
 'Model',
 'Usage_Level',
 'Expected_Mileage',
 'Mileage_Delta',
 'Owners_per_Year',
 'Price_per_Seat',
 'Price_per_Year_Age',
 'Door_Category',
 'Is_Family_Car',
 'Brand_Avg_Price',
 'Model_Avg_Price',
 'Engine_per_Seat',
 'Car_Age_Squared',
 'Premium_Age']

In [5]:
# Columns removed before building the revisited feature set, grouped by reason.
cols_to_drop = (
    # Target leakage ('Price' is the raw target; Log_Price is used instead)
    ['Price', 'Price_per_Seat', 'Price_per_Year_Age', 'Brand_Avg_Price', 'Model_Avg_Price']
    # Raw / redundant mileage features
    + ['Mileage(miles)', 'Mileage', 'Expected_Mileage', 'Mileage_Delta']
    # Weak / noisy features
    + [
        'title', 'title_lower', 'Doors', 'Seats', 'Previous Owners',
        'Owners_per_Year', 'Door_Category', 'Engine_per_Seat',
    ]
    # Redundant categorical bins (continuous version would lead to greater insight)
    + ['Age_Band', 'Engine_Bin', 'Usage_Level']
)

# errors='ignore' skips any listed column that is absent, so re-running this
# cell after the columns are already gone does not raise.
df = df.drop(columns=cols_to_drop, errors='ignore')




## Mileage Interaction Features 

In [6]:
# Log-mileage multiplied by age, and log-mileage divided by (age + 1).
# The +1 keeps the denominator non-zero when Car_Age is 0.
df['Mileage_Age_Interaction'] = df['Log_Mileage'].mul(df['Car_Age'])
df['Mileage_per_Age'] = df['Log_Mileage'].div(df['Car_Age'].add(1))


## Depreciation Curve Features

In [7]:
# Hinge features: 0 up to the threshold age, then the number of years past it.
for age_threshold in (3, 5, 8):
    df[f'Age_{age_threshold}_plus'] = (df['Car_Age'] - age_threshold).clip(lower=0)


## Engine Interaction Features 

In [8]:
# Engine size multiplied by age, and engine size divided by (age + 1).
# The +1 keeps the denominator non-zero when Car_Age is 0.
df['Engine_Age'] = df['Engine'].mul(df['Car_Age'])
df['Engine_per_Year'] = df['Engine'].div(df['Car_Age'].add(1))

## Mileage Normalisation Features 

In [9]:
# Reference annual mileage used as the "normal usage" baseline.
AVG_MILES_PER_YEAR = 12000

# Usage relative to the baseline (1.0 == baseline) and the signed difference from it.
df['Mileage_Ratio'] = df['Mileage_per_Year'].div(AVG_MILES_PER_YEAR)
df['Mileage_Above_Normal'] = df['Mileage_per_Year'].sub(AVG_MILES_PER_YEAR)

## Lifecycle Features 

In [10]:
# Binary age flags: at most 2 years old, and at least 10 years old.
df['Is_New'] = df['Car_Age'].le(2).astype(int)
df['Is_Old'] = df['Car_Age'].ge(10).astype(int)


## Vehicle Lifespan Features 

In [11]:


car_age = df['Car_Age']

# Assumed economic lifespan, shared by the two lifespan-based features below.
ECONOMIC_LIFESPAN_YEARS = 15

# Lifecycle stage as a categorical. pd.cut uses right-closed intervals by default:
# (-1, 2] New, (2, 5] Early, (5, 10] Mid, (10, 100] Late.
df['Lifecycle_Stage'] = pd.cut(
    car_age,
    bins=[-1, 2, 5, 10, 100],
    labels=['New', 'Early', 'Mid', 'Late'],
)

# Binary flags on the same age boundaries as the bins above.
# Is_New repeats the assignment made in the Lifecycle Features cell; it is kept
# here so the four stage flags are defined together.
df['Is_New'] = car_age.le(2).astype(int)
df['Is_Early_Life'] = (car_age.gt(2) & car_age.le(5)).astype(int)
df['Is_Mid_Life'] = (car_age.gt(5) & car_age.le(10)).astype(int)
df['Is_End_of_Life'] = car_age.gt(10).astype(int)

# Remaining useful life proxy: years left of the assumed lifespan, floored at 0.
df['Remaining_Life'] = (ECONOMIC_LIFESPAN_YEARS - car_age).clip(lower=0)

# Share of the assumed lifespan already used (goes above 1 past 15 years).
df['Depreciation_Phase'] = car_age.div(ECONOMIC_LIFESPAN_YEARS)


In [12]:
# Inspect the frame after the age, mileage, engine and lifecycle features were added.
df.head(n=5)

Unnamed: 0,Registration_Year,Fuel type,Body type,Engine,Gearbox,Emission Class,Has_Service_History,Car_Age,Mileage_per_Year,Log_Price,...,Mileage_Ratio,Mileage_Above_Normal,Is_New,Is_Old,Lifecycle_Stage,Is_Early_Life,Is_Mid_Life,Is_End_of_Life,Remaining_Life,Depreciation_Phase
0,2016,Diesel,Hatchback,1.4,Manual,Euro 6,0,7,10027.0,8.839277,...,0.835583,-1973.0,0,0,Mid,0,1,0,8,0.466667
1,2008,Petrol,Hatchback,1.2,Manual,Euro 4,1,15,5905.666667,7.309881,...,0.492139,-6094.333333,0,1,Late,0,0,1,0,1.0
2,2011,Petrol,Hatchback,1.4,Manual,Euro 5,0,12,11416.666667,6.855409,...,0.951389,-583.333333,0,1,Late,0,0,1,3,0.8
3,2010,Petrol,Hatchback,1.4,Manual,Euro 4,1,13,7440.846154,7.781139,...,0.620071,-4559.153846,0,1,Late,0,0,1,2,0.866667
4,2013,Diesel,Hatchback,1.3,Manual,Euro 5,0,10,8500.0,6.907755,...,0.708333,-3500.0,0,1,Mid,0,1,0,5,0.666667


## Brand Features

In [13]:
# Distinct brand labels present in the data, in order of first appearance.
df.loc[:, 'Brand'].unique()


array(['Skoda', 'Vauxhall', 'Hyundai', 'Mini', 'Ford', 'Peugeot', 'Bmw',
       'Citroen', 'Mercedes', 'Mazda', 'Saab', 'Volkswagen', 'Honda',
       'Mg', 'Toyota', 'Volvo', 'Seat', 'Nissan', 'Alfa', 'Kia', 'Proton',
       'Fiat', 'Renault', 'Audi', 'Mitsubishi', 'Lexus', 'Land Rover',
       'Chevrolet', 'Suzuki', 'Dacia', 'Daihatsu', 'Jeep', 'Jaguar',
       'Chrysler', 'Rover', 'Ds', 'Daewoo', 'Dodge', 'Subaru', 'Porsche',
       'Infiniti', 'Abarth', 'Smart', 'Maserati', 'Ssangyong'],
      dtype=object)

In [14]:
# Normalise brand labels: trim surrounding whitespace and title-case them, then
# restore the acronym brands that title-casing leaves as 'Bmw' / 'Mg' / 'Ds'.
acronym_brands = {'Bmw': 'BMW', 'Mg': 'MG', 'Ds': 'DS'}
df['Brand'] = df['Brand'].str.strip().str.title().replace(acronym_brands)


In [15]:
# Brand tiers. Every brand in the data is expected to appear in exactly one list;
# any brand that appears in none of them falls back to DEFAULT_BRAND_TIER.
# NOTE(review): 'Vauxhall' is present in the data but in none of these lists, so it
# currently receives the default tier — confirm that is intended or add it to a list.
premium = [
    'BMW',
    'Mercedes',
    'Audi',
    'Lexus',
    'Porsche',
    'Jaguar',
    'Land Rover',
    'Volvo',
    'Infiniti',
    'Maserati'
]

upper_mid = [
    'Mini',
    'Volkswagen',
    'Toyota',
    'Honda',
    'Mazda',
    'Skoda',
    'Seat',
    'Alfa',
    'Jeep',
    'Subaru'
]

mid = [
    'Ford',
    'Peugeot',
    'Renault',
    'Citroen',
    'Hyundai',
    'Kia',
    'Nissan',
    'Suzuki',
    'Fiat',
    'MG',
    'Chevrolet',
    'Mitsubishi'
]

budget = [
    'Dacia',
    'Proton',
    'Daewoo',
    'Daihatsu',
    'Ssangyong',
    'Chrysler',
    'Dodge',
    'Rover',
    'Saab',
    'Smart',
    'Abarth',
    'DS'
]

# Tier given to brands that are missing from every list above.
DEFAULT_BRAND_TIER = 'Budget'

# Single brand -> tier lookup. Tiers are inserted lowest first so that a brand
# accidentally listed twice ends up in the higher tier, matching the precedence
# of the previous sequential .loc assignments.
brand_to_tier = {}
for tier_label, tier_brands in (
    ('Budget', budget),
    ('Mid', mid),
    ('Upper_Mid', upper_mid),
    ('Premium', premium),
):
    brand_to_tier.update(dict.fromkeys(tier_brands, tier_label))

# Report brands that are in the data but in none of the lists, rather than
# defaulting them silently.
unmapped_brands = sorted(set(df['Brand'].dropna()) - set(brand_to_tier))
if unmapped_brands:
    print(
        f"Brands not in any tier list, defaulting to {DEFAULT_BRAND_TIER!r}: "
        f"{unmapped_brands}"
    )

# Unmapped (and missing) brands map to NaN and are then filled with the default tier.
df['Brand_Tier'] = df['Brand'].map(brand_to_tier).fillna(DEFAULT_BRAND_TIER)


## Numeric Brand Strength Feature

In [16]:
# Ordinal encoding of the brand tier: 0 = Budget, 1 = Mid, 2 = Upper_Mid, 3 = Premium.
tier_map = {
    tier: rank
    for rank, tier in enumerate(['Budget', 'Mid', 'Upper_Mid', 'Premium'])
}

df['Brand_Strength'] = df['Brand_Tier'].map(tier_map)


## Brand Depreciation Resistance Feature

In [17]:
# Brand strength damped by age: equal to Brand_Strength at Car_Age 0 and scaled by
# 1 / (Car_Age + 1) after that. Rows with Brand_Strength 0 stay at 0 for every age.
age_decay = 1 / (df['Car_Age'] + 1)
df['Brand_Residual_Score'] = df['Brand_Strength'] * age_decay


In [18]:
# Recompute the premium flag from the tier assignment; this overwrites the
# Is_Premium column that was loaded from the CSV.
# NOTE(review): Is_Premium is dropped again in the final drop cell before export,
# and nothing in between reads it — confirm whether this recomputation is still needed.
df['Is_Premium'] = df['Brand_Tier'].eq('Premium').astype(int)


In [19]:
# Spot-check the brand -> tier -> strength mapping on the first 20 rows.
df.loc[:, ['Brand', 'Brand_Tier', 'Brand_Strength']].head(n=20)


Unnamed: 0,Brand,Brand_Tier,Brand_Strength
0,Skoda,Upper_Mid,2
1,Vauxhall,Budget,0
2,Hyundai,Mid,1
3,Mini,Upper_Mid,2
4,Vauxhall,Budget,0
5,Hyundai,Mid,1
6,Ford,Mid,1
7,Vauxhall,Budget,0
8,Peugeot,Mid,1
9,Ford,Mid,1


## Premium Durability

In [20]:
# Log-mileage scaled down by brand strength, so the same mileage counts for less
# on a stronger brand. Brand_Strength starts at 0, so it is shifted by +1 to keep
# the divisor non-zero (the same guard used for the Car_Age denominators earlier).
# The previous `.replace(0, 1)` guard gave tiers 0 and 1 the same divisor, which
# made Budget and Mid brands indistinguishable in this feature.
# NOTE(review): the column name says "Age" but the adjustment is by brand strength;
# the name is kept so downstream consumers of the CSV are unaffected.
df['Age_Adjusted_Mileage'] = df['Log_Mileage'] / (df['Brand_Strength'] + 1)


In [21]:
# Final pruning before export. Using `columns=` with errors='ignore' matches the
# earlier drop cell and lets this cell be re-run without a KeyError once the
# columns have already been removed.
df = df.drop(
    columns=['Has_Service_History', 'Is_Premium', 'Is_Family_Car'],
    errors='ignore',
)


In [22]:
# Write the revisited feature set to disk; the row index is written as the first column.
df.to_csv('Data/feature_engineered_data_revisited.csv', index=True)