# In [2]:
# Import
import pandas as pd

# Read the raw listings and keep only rows whose price lies strictly
# between 100 and 100000 (rows with a missing price fail both comparisons
# and are dropped as well).
df = pd.read_csv("vehicles.csv")
price_in_range = df["price"].gt(100) & df["price"].lt(100000)

# Swap the model year for an age relative to the fixed reference year 2025.
df = (
    df.loc[price_in_range]
    .assign(car_age=lambda kept: 2025 - kept["year"])
    .drop(columns=["year"], errors="ignore")
)

# Collapse the model column: the 20 most frequent models keep their name,
# every other value (missing values included) is relabelled "other".
model_counts = df["model"].value_counts()
top_models = model_counts.nlargest(20).index
is_rare_model = ~df["model"].isin(top_models)
df["model"] = df["model"].mask(is_rare_model, "other")

# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column so the indicator columns are not perfectly collinear.
cat_cols = [
    "manufacturer",
    "condition",
    "fuel",
    "title_status",
    "transmission",
    "drive",
    "type",
    "state",
    "model",
]
df = pd.get_dummies(data=df, columns=cat_cols, drop_first=True)

# Remove columns that are not used as features. Names that are absent from
# the frame are skipped silently (errors="ignore").
unused_cols = [
    "url",
    "region",
    "region_url",
    "VIN",
    "description",
    "image_url",
    "posting_date",
    "paint_color",
    "cylinders",
    "size",
]
df = df.drop(columns=unused_cols, errors="ignore")

# Anything still stored with object dtype after the explicit drop goes too.
leftover_object_cols = df.select_dtypes(include="object").columns
df = df.drop(columns=leftover_object_cols, errors="ignore")

# Handle nulls
# Every remaining missing value becomes 0. NOTE: this also means a listing
# with a missing odometer reading is treated as 0 miles, and one whose
# car_age is missing is treated as age 0 in the features below.
df = df.fillna(0)

# Derived features
# Average distance driven per year of age. The divisor is clamped to at
# least one year so that an age of 0 (new car or filled-in missing value)
# does not divide by zero, and a negative age (model year later than the
# reference year) does not produce a negative mileage rate.
years_in_service = df["car_age"].clip(lower=1)
df["mileage_per_year"] = df["odometer"] / years_in_service

# Binary flags: 1 when the car is more than 10 years old / has more than
# 150000 on the odometer, otherwise 0.
df["is_old_car"] = (df["car_age"] > 10).astype(int)
df["is_high_mileage"] = (df["odometer"] > 150000).astype(int)
