In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation notices are
# hidden too — consider narrowing it to a specific category if one is noisy.
warnings.filterwarnings('ignore')

In [2]:
# Read the car listings into the working frame used by every later cell.
source_csv = 'car_data_cleaned.csv'
data = pd.read_csv(source_csv)

In [3]:
# Preview the first five rows to sanity-check the load.
data.head(5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Hyundai Verna 1.6 SX,2012.0,600000.0,100000.0,Deisel,Indvidual,Manual,First Owner
1,Datsun RediGO T Option,2017.0,250000.0,46000.0,Petorl,Indivudal,Manual,First Owner
2,Honda Amaze VX i-DTEC,2014.0,450000.0,141000.0,Diesl,Indivudal,Manual,Second Owner
3,Maruti Alto LX BSIII,2007.0,140000.0,70000.0,Petrl,Indvidual,Manual,First Owner
4,Maruti Swift Dzire VDI,2016.0,550000.0,70000.0,Petrl,Indvidual,Manual,First Owner


In [4]:
# List the column labels available for feature engineering.
data.columns

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner'],
      dtype='object')

## Displaying Categorical Data

In [5]:
# Sub-frame holding only the object-dtype (text) columns.
cat_cols = data.select_dtypes('object')

# For each text column print its name, how many distinct values it has,
# the distinct values themselves, and a separator rule.
for col in cat_cols.columns:
    distinct_values = data[col].unique()
    print(f"{col} ({len(distinct_values)}) :")
    print(distinct_values)
    print("_" * 60)

name (1332) :
['Hyundai Verna 1.6 SX' 'Datsun RediGO T Option' 'Honda Amaze VX i-DTEC'
 ... 'Tata Nano XM' 'Mahindra Verito 1.5 D6 BSIII'
 'Hyundai i20 Magna 1.4 CRDi']
____________________________________________________________
fuel (8) :
['Deisel' 'Petorl' 'Diesl' 'Petrl' 'CNG' 'Petrrol' 'LPG' 'Electric']
____________________________________________________________
seller_type (5) :
['Indvidual' 'Indivudal' 'Deelar' 'Dealerr' 'Trustmark Dealer']
____________________________________________________________
transmission (2) :
['Manual' 'Automatic']
____________________________________________________________
owner (5) :
['First Owner' 'Second Owner' 'Fourth & Above Owner' 'Third Owner'
 'Test Drive Car']
____________________________________________________________


## Correct Categories

In [6]:
# Misspelled fuel labels seen in the listing above -> canonical spelling.
fuel_fixes = {
    'Deisel': 'Diesel',
    'Diesl': 'Diesel',
    'Petorl': 'Petrol',
    'Petrl': 'Petrol',
    'Petrrol': 'Petrol',
}

# Misspelled seller labels -> canonical spelling.
seller_fixes = {
    'Indvidual': 'Individual',
    'Indivudal': 'Individual',
    'Deelar': 'Dealer',
    'Dealerr': 'Dealer',
}

# Values not listed in a mapping (e.g. 'CNG', 'Trustmark Dealer') pass through unchanged.
data['fuel'] = data['fuel'].replace(fuel_fixes)
data['seller_type'] = data['seller_type'].replace(seller_fixes)

## Extracting New Features 

### Extract Brand

In [7]:
# Brand is the first whitespace-separated word of the listing name, so a
# multi-word make is reduced to its first word only.
name_tokens = data["name"].str.split()
data["brand"] = name_tokens.str[0]

### Extract Model

In [8]:
# Model is everything after the first whitespace-separated word of the name,
# re-joined with single spaces (a one-word name yields an empty string).
# The vectorized .str accessor is used, matching the brand extraction, so a
# missing name propagates as NaN instead of raising AttributeError inside a
# Python-level lambda.
data["model"] = data["name"].str.split().str[1:].str.join(" ")

### Is Luxury Brand

In [9]:
# Brand tokens treated as luxury makes. `brand` is only the first word of the
# listing name, which is why "Land Rover" appears here as 'Land'. For the same
# reason a hyphenated make is a single token, so 'Mercedes-Benz' is listed next
# to 'Mercedes'; otherwise those rows could never match.
# NOTE(review): confirm which spelling the data actually uses.
luxury = ['BMW', 'Mercedes', 'Mercedes-Benz', 'Audi', 'Jaguar', 'Lexus', 'Land', 'Volvo']

# 1 when the brand is in the luxury list, 0 otherwise (a missing brand gives 0).
data["is_luxury"] = data["brand"].isin(luxury).astype(int)

### Car Age

In [10]:
# Age in years measured against a fixed reference year, so re-running the
# notebook later reproduces the same values.
reference_year = 2025
data["car_age"] = reference_year - data["year"]

### Is Old / New Car

In [11]:
# Flag cars whose age is strictly greater than 10 years (1 = old, 0 = not).
older_than_ten = data["car_age"] > 10
data["is_old"] = older_than_ten.astype(int)

### km_per_year
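Usage-level indicator: kilometres driven divided by the car's age in years (an age of 0 is treated as 1 to avoid division by zero).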

In [12]:
# Kilometres driven per year of age. An age of exactly 0 is swapped for 1 so
# the division never has a zero denominator.
age_denominator = data["car_age"].replace(0, 1)
data["km_per_year"] = data["km_driven"] / age_denominator

### High / Low Usage Flag

In [13]:
# 1 when the odometer reading is strictly above the dataset median, else 0.
median_km = data["km_driven"].median()
above_median_km = data["km_driven"] > median_km
data["is_high_km"] = above_median_km.astype(int)

### Price per KM

In [14]:
# Selling price divided by kilometres driven; a reading of exactly 0 km is
# swapped for 1 to avoid dividing by zero.
# NOTE(review): this column is computed from selling_price. If selling_price
# is the prediction target, exclude it from model inputs to avoid target
# leakage — confirm intended use.
km_denominator = data["km_driven"].replace(0, 1)
data["price_per_km"] = data["selling_price"] / km_denominator

### Price per year

In [15]:
# Selling price divided by car age; an age of exactly 0 is swapped for 1 to
# avoid dividing by zero.
# NOTE(review): this column is computed from selling_price. If selling_price
# is the prediction target, exclude it from model inputs to avoid target
# leakage — confirm intended use.
age_divisor = data["car_age"].replace(0, 1)
data["price_per_year"] = data["selling_price"] / age_divisor

In [16]:
# Write the engineered frame to disk; index=False keeps the row index out of the file.
output_csv = "cars_processed.csv"
data.to_csv(output_csv, index=False)