In [16]:
# Step 1: Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


In [11]:
# Step 2: Load dataset
df = pd.read_csv("../data/cars_dataset.csv")
df.head()

Unnamed: 0,Company Names,Cars Names,Engines,CC/Battery Capacity,HorsePower,Total Speed,Performance(0 - 100 )KM/H,Cars Prices,Fuel Types,Seats,Torque
0,FERRARI,SF90 STRADALE,V8,3990 cc,963 hp,340 km/h,2.5 sec,"$1,100,000",plug in hyrbrid,2,800 Nm
1,ROLLS ROYCE,PHANTOM,V12,6749 cc,563 hp,250 km/h,5.3 sec,"$460,000",Petrol,5,900 Nm
2,Ford,KA+,1.2L Petrol,"1,200 cc",70-85 hp,165 km/h,10.5 sec,"$12,000-$15,000",Petrol,5,100 - 140 Nm
3,MERCEDES,GT 63 S,V8,"3,982 cc",630 hp,250 km/h,3.2 sec,"$161,000",Petrol,4,900 Nm
4,AUDI,AUDI R8 Gt,V10,"5,204 cc",602 hp,320 km/h,3.6 sec,"$253,290",Petrol,2,560 Nm


In [12]:
# Step 3: Dataset info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Company Names              1218 non-null   object
 1   Cars Names                 1218 non-null   object
 2   Engines                    1218 non-null   object
 3   CC/Battery Capacity        1215 non-null   object
 4   HorsePower                 1218 non-null   object
 5   Total Speed                1218 non-null   object
 6   Performance(0 - 100 )KM/H  1212 non-null   object
 7   Cars Prices                1218 non-null   object
 8   Fuel Types                 1218 non-null   object
 9   Seats                      1218 non-null   object
 10  Torque                     1217 non-null   object
dtypes: object(11)
memory usage: 104.8+ KB


In [13]:
# Step 4: Dataset statistics
df.describe(include="all")

Unnamed: 0,Company Names,Cars Names,Engines,CC/Battery Capacity,HorsePower,Total Speed,Performance(0 - 100 )KM/H,Cars Prices,Fuel Types,Seats,Torque
count,1218,1218,1218,1215,1218,1218,1212,1218,1218,1218,1217
unique,37,1201,356,311,456,114,180,535,23,19,263
top,Nissan,Polo BlueMotion,I4,1984 cc,355 hp,250 km/h,6.5 sec,"$35,000",Petrol,5,400 Nm
freq,149,2,64,31,23,145,45,36,871,692,72


In [14]:
# Step 5: Check missing values
df.isnull().sum()

Company Names                0
Cars Names                   0
Engines                      0
CC/Battery Capacity          3
HorsePower                   0
Total Speed                  0
Performance(0 - 100 )KM/H    6
Cars Prices                  0
Fuel Types                   0
Seats                        0
Torque                       1
dtype: int64

In [None]:
# Step 6: Remove duplicates
df = df.drop_duplicates()

In [None]:
# Step 7: Clean HorsePower (remove 'hp', take average if range)
def clean_hp(value):
    if pd.isna(value): return np.nan
    value = str(value).replace("hp","").replace("HP","").strip()
    if "-" in value:  # if range like "70-85"
        parts = value.split("-")
        return np.mean([float(p) for p in parts if p.strip().isdigit()])
    value = value.replace(",","")
    return pd.to_numeric(value, errors="coerce")

df["HorsePower_clean"] = df["HorsePower"].apply(clean_hp)


In [18]:
df.head()

Unnamed: 0,Company Names,Cars Names,Engines,CC/Battery Capacity,HorsePower,Total Speed,Performance(0 - 100 )KM/H,Cars Prices,Fuel Types,Seats,Torque,HorsePower_clean
0,FERRARI,SF90 STRADALE,V8,3990 cc,963 hp,340 km/h,2.5 sec,"$1,100,000",plug in hyrbrid,2,800 Nm,963.0
1,ROLLS ROYCE,PHANTOM,V12,6749 cc,563 hp,250 km/h,5.3 sec,"$460,000",Petrol,5,900 Nm,563.0
2,Ford,KA+,1.2L Petrol,"1,200 cc",70-85 hp,165 km/h,10.5 sec,"$12,000-$15,000",Petrol,5,100 - 140 Nm,77.5
3,MERCEDES,GT 63 S,V8,"3,982 cc",630 hp,250 km/h,3.2 sec,"$161,000",Petrol,4,900 Nm,630.0
4,AUDI,AUDI R8 Gt,V10,"5,204 cc",602 hp,320 km/h,3.6 sec,"$253,290",Petrol,2,560 Nm,602.0


In [None]:
# Step 8: Clean Total Speed (remove 'km/h')
df["Total_Speed_clean"] = df["Total Speed"].str.replace("km/h","").str.strip()
df["Total_Speed_clean"] = pd.to_numeric(df["Total_Speed_clean"], errors="coerce")

In [20]:
df.head()

Unnamed: 0,Company Names,Cars Names,Engines,CC/Battery Capacity,HorsePower,Total Speed,Performance(0 - 100 )KM/H,Cars Prices,Fuel Types,Seats,Torque,HorsePower_clean,Total_Speed_clean
0,FERRARI,SF90 STRADALE,V8,3990 cc,963 hp,340 km/h,2.5 sec,"$1,100,000",plug in hyrbrid,2,800 Nm,963.0,340.0
1,ROLLS ROYCE,PHANTOM,V12,6749 cc,563 hp,250 km/h,5.3 sec,"$460,000",Petrol,5,900 Nm,563.0,250.0
2,Ford,KA+,1.2L Petrol,"1,200 cc",70-85 hp,165 km/h,10.5 sec,"$12,000-$15,000",Petrol,5,100 - 140 Nm,77.5,165.0
3,MERCEDES,GT 63 S,V8,"3,982 cc",630 hp,250 km/h,3.2 sec,"$161,000",Petrol,4,900 Nm,630.0,250.0
4,AUDI,AUDI R8 Gt,V10,"5,204 cc",602 hp,320 km/h,3.6 sec,"$253,290",Petrol,2,560 Nm,602.0,320.0


In [None]:
# Step 9: Clean Performance (remove 'sec')
df["Performance_clean"] = df["Performance(0 - 100 )KM/H"].str.replace("sec","").str.strip()
df["Performance_clean"] = pd.to_numeric(df["Performance_clean"], errors="coerce")

In [22]:
df.head()

Unnamed: 0,Company Names,Cars Names,Engines,CC/Battery Capacity,HorsePower,Total Speed,Performance(0 - 100 )KM/H,Cars Prices,Fuel Types,Seats,Torque,HorsePower_clean,Total_Speed_clean,Performance_clean
0,FERRARI,SF90 STRADALE,V8,3990 cc,963 hp,340 km/h,2.5 sec,"$1,100,000",plug in hyrbrid,2,800 Nm,963.0,340.0,2.5
1,ROLLS ROYCE,PHANTOM,V12,6749 cc,563 hp,250 km/h,5.3 sec,"$460,000",Petrol,5,900 Nm,563.0,250.0,5.3
2,Ford,KA+,1.2L Petrol,"1,200 cc",70-85 hp,165 km/h,10.5 sec,"$12,000-$15,000",Petrol,5,100 - 140 Nm,77.5,165.0,10.5
3,MERCEDES,GT 63 S,V8,"3,982 cc",630 hp,250 km/h,3.2 sec,"$161,000",Petrol,4,900 Nm,630.0,250.0,3.2
4,AUDI,AUDI R8 Gt,V10,"5,204 cc",602 hp,320 km/h,3.6 sec,"$253,290",Petrol,2,560 Nm,602.0,320.0,3.6


In [None]:
# Step 10: Clean Cars Prices (remove $, commas, handle ranges)
def clean_price(value):
    if pd.isna(value): return np.nan
    value = str(value).replace("$","").replace(",","").strip()
    if "-" in value:
        parts = value.split("-")
        return np.mean([float(p) for p in parts if p.replace(".","").isdigit()])
    return pd.to_numeric(value, errors="coerce")

df["Price_clean"] = df["Cars Prices"].apply(clean_price)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [24]:
df.head()

Unnamed: 0,Company Names,Cars Names,Engines,CC/Battery Capacity,HorsePower,Total Speed,Performance(0 - 100 )KM/H,Cars Prices,Fuel Types,Seats,Torque,HorsePower_clean,Total_Speed_clean,Performance_clean,Price_clean
0,FERRARI,SF90 STRADALE,V8,3990 cc,963 hp,340 km/h,2.5 sec,"$1,100,000",plug in hyrbrid,2,800 Nm,963.0,340.0,2.5,1100000.0
1,ROLLS ROYCE,PHANTOM,V12,6749 cc,563 hp,250 km/h,5.3 sec,"$460,000",Petrol,5,900 Nm,563.0,250.0,5.3,460000.0
2,Ford,KA+,1.2L Petrol,"1,200 cc",70-85 hp,165 km/h,10.5 sec,"$12,000-$15,000",Petrol,5,100 - 140 Nm,77.5,165.0,10.5,13500.0
3,MERCEDES,GT 63 S,V8,"3,982 cc",630 hp,250 km/h,3.2 sec,"$161,000",Petrol,4,900 Nm,630.0,250.0,3.2,161000.0
4,AUDI,AUDI R8 Gt,V10,"5,204 cc",602 hp,320 km/h,3.6 sec,"$253,290",Petrol,2,560 Nm,602.0,320.0,3.6,253290.0


In [None]:
# Step 11: Clean Fuel Types (make consistent)
df["Fuel_Type_clean"] = df["Fuel Types"].str.lower().str.strip()
df["Fuel_Type_clean"] = df["Fuel_Type_clean"].replace({
    "petrol":"Petrol",
    "plug in hyrbrid":"Hybrid",
    "diesel":"Diesel",
    "electric":"Electric",
    "hybrid":"Hybrid"
})

In [26]:
df.head()

Unnamed: 0,Company Names,Cars Names,Engines,CC/Battery Capacity,HorsePower,Total Speed,Performance(0 - 100 )KM/H,Cars Prices,Fuel Types,Seats,Torque,HorsePower_clean,Total_Speed_clean,Performance_clean,Price_clean,Fuel_Type_clean
0,FERRARI,SF90 STRADALE,V8,3990 cc,963 hp,340 km/h,2.5 sec,"$1,100,000",plug in hyrbrid,2,800 Nm,963.0,340.0,2.5,1100000.0,Hybrid
1,ROLLS ROYCE,PHANTOM,V12,6749 cc,563 hp,250 km/h,5.3 sec,"$460,000",Petrol,5,900 Nm,563.0,250.0,5.3,460000.0,Petrol
2,Ford,KA+,1.2L Petrol,"1,200 cc",70-85 hp,165 km/h,10.5 sec,"$12,000-$15,000",Petrol,5,100 - 140 Nm,77.5,165.0,10.5,13500.0,Petrol
3,MERCEDES,GT 63 S,V8,"3,982 cc",630 hp,250 km/h,3.2 sec,"$161,000",Petrol,4,900 Nm,630.0,250.0,3.2,161000.0,Petrol
4,AUDI,AUDI R8 Gt,V10,"5,204 cc",602 hp,320 km/h,3.6 sec,"$253,290",Petrol,2,560 Nm,602.0,320.0,3.6,253290.0,Petrol


In [27]:
# Step 12: Save cleaned dataset
df.to_csv("tables/cleaned_data.csv", index=False)