# Car Dataset Preprocessing and Analysis

This notebook demonstrates a complete data preprocessing workflow for a car statistics dataset. It follows the structure requested: data loading, inspection, handling anomalies, filling missing values, deduplication, feature engineering, scaling, and visualizations. The notebook creates a synthetic `cars.csv` if you don't provide one, and saves a cleaned dataset at the end.

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
sns.set_theme()
%matplotlib inline


In [2]:
# Create a synthetic car dataset and save to 'cars.csv' (if you already have a file named cars.csv, you may overwrite or skip this cell)
np.random.seed(42)
n = 5000
brands = ['toyota','bmw','kia','hyundai','ford','audi','mercedes','nissan','honda','volkswagen']
models = ['model_'+str(i) for i in range(1,21)]
years = np.random.randint(1995, 2025, size=n)
mileages = np.abs((np.random.normal(loc=80000, scale=40000, size=n)).astype(int))
mileages[np.random.choice(n, 10)] = np.random.randint(500000, 2000000, size=10)
mileages[np.random.choice(n, 5)] *= -1
engine_size = np.round(np.random.choice([1.2,1.4,1.6,1.8,2.0,2.5,3.0,4.0], size=n, p=[0.08,0.12,0.2,0.15,0.2,0.12,0.08,0.05]),2)
fuel_type = np.random.choice(['Petrol','Diesel','petrol','Gasoline','electric','hybrid','diesel'], size=n, p=[0.45,0.18,0.1,0.1,0.05,0.08,0.04])
horsepower = np.round(np.random.normal(loc=150, scale=60, size=n)).astype(object)
hp_nan_idx = np.random.choice(n, 200, replace=False)
for i in hp_nan_idx:
    horsepower[i] = None
price = np.round(np.abs(np.random.normal(loc=15000, scale=10000, size=n))).astype(int)
price[np.random.choice(n, 8)] = np.random.randint(200000,1000000,size=8)
transmission = np.random.choice(['Automatic','Manual','automatic','manual'], size=n, p=[0.6,0.35,0.03,0.02])
brand = np.random.choice(brands, size=n, p=[0.15,0.1,0.1,0.1,0.1,0.08,0.07,0.1,0.08,0.12])
model = np.random.choice(models, size=n)
df_synth = pd.DataFrame({
    'brand': brand,
    'model': model,
    'year': years,
    'mileage': mileages,
    'engine_size': engine_size,
    'fuel_type': fuel_type,
    'horsepower': horsepower,
    'price': price,
    'transmission': transmission
})
df_synth.loc[np.random.choice(n,50), 'mileage'] = np.nan
df_synth = pd.concat([df_synth, df_synth.sample(20, random_state=1)], ignore_index=True)
df_synth.to_csv('/mnt/data/cars.csv', index=False)
print('Synthetic dataset saved to /mnt/data/cars.csv, shape:', df_synth.shape)

OSError: Cannot save file into a non-existent directory: '\mnt\data'

In [None]:
# Load dataset (change path if needed). If you have your own 'cars.csv', upload it to the environment and update the path.
path = '/mnt/data/cars.csv'
df = pd.read_csv(path)
print('Loaded:', path)
df.info()
df.head()

In [None]:
# Initial descriptive statistics
display(df.describe(include='all'))

In [3]:
# 1) Year bounds: keep reasonable production years
df = df[(df['year'] >= 1970) & (df['year'] <= 2025)].copy()
# 2) Mileage: remove absurdly high values > 1_000_000, and convert negative mileages to abs (demonstration)
df.loc[df['mileage'] < 0, 'mileage'] = df['mileage'].abs()
df = df[df['mileage'] < 1_000_000]
print('After basic anomaly filters, shape:', df.shape)

NameError: name 'df' is not defined

In [4]:
plt.figure(figsize=(10,4))
sns.boxplot(x=df['mileage'])
plt.title('Mileage - boxplot (after basic fixes)')
plt.show()

NameError: name 'df' is not defined

<Figure size 1000x400 with 0 Axes>

In [5]:
# Normalize categorical text fields
text_cols = ['brand','fuel_type','transmission','model']
for c in text_cols:
    df[c] = df[c].astype(str).str.lower().str.strip()

# Count and drop duplicates
print('Duplicates count before:', df.duplicated().sum())
df = df.drop_duplicates().reset_index(drop=True)
print('Duplicates count after drop:', df.duplicated().sum(), ' shape:', df.shape)

NameError: name 'df' is not defined

In [6]:
# Fill numeric NaNs by group medians where sensible
df['mileage'] = df.groupby('brand')['mileage'].apply(lambda x: x.fillna(x.median()))
df['horsepower'] = df.groupby('brand')['horsepower'].apply(lambda x: x.fillna(x.median()))
df['mileage'] = df['mileage'].fillna(df['mileage'].median())
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())

print('Any NaNs left?\n', df.isna().sum())

NameError: name 'df' is not defined

In [None]:
# Feature engineering
CURRENT_YEAR = 2025
df['car_age'] = CURRENT_YEAR - df['year']
df['price_per_hp'] = df['price'] / (df['horsepower'].replace(0, np.nan))
df['price_per_hp'] = df['price_per_hp'].fillna(df['price_per_hp'].median())

df[['car_age','price_per_hp']].describe()

In [None]:
# Detect price outliers per brand (drop above 99th percentile for each brand)
def drop_brand_price_outliers(df_in):
    to_drop_idx = []
    for b, g in df_in.groupby('brand'):
        up99 = g['price'].quantile(0.99)
        mask = (g['price'] > up99)
        to_drop_idx.extend(g[mask].index.tolist())
    return df_in.drop(index=to_drop_idx)

df = drop_brand_price_outliers(df)
print('After dropping brand-wise price > 99th percentile, shape:', df.shape)

In [None]:
# Scaling numeric columns
scaler = MinMaxScaler()
num_cols = ['mileage','engine_size','horsepower','price','car_age','price_per_hp']
df[num_cols] = scaler.fit_transform(df[num_cols])
df[num_cols].describe().loc[['min','max']]

In [None]:
# Simple visualizations
plt.figure(figsize=(12,5))
sns.countplot(x='brand', data=df, order=df['brand'].value_counts().index[:15])
plt.xticks(rotation=45)
plt.title('Top brands count')
plt.show()

plt.figure(figsize=(8,5))
sns.histplot(df['car_age'], bins=30, kde=True)
plt.title('Car age distribution')
plt.show()

plt.figure(figsize=(8,6))
sns.scatterplot(x='mileage', y='price', data=df, hue='fuel_type', alpha=0.7)
plt.title('Price vs mileage (scaled values)')
plt.show()

In [None]:
# Save cleaned dataset
clean_path = '/mnt/data/cars_cleaned.csv'
df.to_csv(clean_path, index=False)
print('Cleaned dataset saved to', clean_path)
df.shape

## Next steps

- Use this cleaned dataset for clustering (KMeans), regression (price prediction), or classification tasks.
- You can upload your real `cars.csv` and re-run the notebook: replace `/mnt/data/cars.csv` with your uploaded path.

---
Notebook generated automatically.