In [None]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# -----------------------------
# 1) Read the raw CSV into a DataFrame
# -----------------------------
path = "/mnt/data/Car details v3.csv"
df = pd.read_csv(path)

# -----------------------------
# 2) Restrict the frame to the modelling columns
# -----------------------------
target_col = "selling_price"

# Predictor columns; the order here is the column order of the selected frame.
feature_cols = [
    "year", "km_driven",
    "seller_type", "transmission", "owner",
    "mileage", "engine", "max_power",
    "seats",
]

# Select features followed by the target, and take an explicit copy so the
# later column assignments operate on an independent frame.
df = df.loc[:, [*feature_cols, target_col]].copy()

# -----------------------------
# 3) Clean numeric-with-units columns
#    mileage: "23.4 kmpl" / "18.0 km/kg"
#    engine: "1248 CC"
#    max_power: "74 bhp"
# -----------------------------
# First unsigned integer or decimal literal in a string, e.g. "23.4" in "23.4 kmpl".
# Compiled once at import time instead of being looked up on every call.
_NUMBER_RE = re.compile(r"\d+(?:\.\d+)?")


def extract_number(x: object) -> float:
    """Return the first number found in *x* as a float.

    Args:
        x: A raw cell value: a string such as ``"1248 CC"``, an already
            numeric value, or a missing value (``None`` / ``NaN``).

    Returns:
        The parsed value, or ``np.nan`` when *x* is missing, non-finite, or
        its text contains no digits (for example ``" bhp"``).
    """
    if pd.isna(x):
        return np.nan

    # Already-numeric values are returned directly. Round-tripping them through
    # str() would mis-parse scientific notation ("1e-05" -> 1.0) and drop the
    # sign of negatives. bool is excluded so True/False still yield NaN.
    is_number = isinstance(x, (int, float, np.integer, np.floating))
    if is_number and not isinstance(x, bool):
        value = float(x)
        # Infinities carry no usable measurement; report them as missing.
        return value if np.isfinite(value) else np.nan

    match = _NUMBER_RE.search(str(x))
    return float(match.group()) if match else np.nan

# Replace each unit-suffixed text value with the first number it contains.
for unit_col in ("mileage", "engine", "max_power"):
    df[unit_col] = df[unit_col].apply(extract_number)

# Coerce the remaining numeric columns and the target; values that cannot be
# parsed become NaN instead of raising.
for numeric_col in ("year", "km_driven", "seats", target_col):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

# Drop every row with a gap in any feature or in the target, then renumber
# the surviving rows from 0.
required_cols = [*feature_cols, target_col]
df = df.dropna(subset=required_cols).reset_index(drop=True)

# -----------------------------
# 4) Hold out a random 30% of the rows as the test set
# -----------------------------
train_df, test_df = train_test_split(
    df,
    test_size=0.30,
    shuffle=True,
    random_state=42,  # fixed seed so the same rows land in each split on every run
)

# Features stay as DataFrames (the preprocessing step selects columns by name);
# targets are pulled out as plain arrays.
X_train, y_train = train_df[feature_cols], train_df[target_col].values
X_test, y_test = test_df[feature_cols], test_df[target_col].values

# -----------------------------
# 5) Preprocess:
#    - numeric columns     -> standardized
#    - categorical columns -> one-hot encoded
#    The transformer is fitted on the TRAIN split only and then applied,
#    unchanged, to the TEST split.
# -----------------------------
cat_cols = ["seller_type", "transmission", "owner"]
num_cols = ["year", "km_driven", "mileage", "engine", "max_power", "seats"]

numeric_step = ("num", StandardScaler(), num_cols)
categorical_step = (
    "cat",
    # Categories not seen during fitting are ignored rather than raising;
    # dense output is requested because the arrays go straight into Keras.
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    cat_cols,
)

preprocess = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder="drop",  # any column not listed above is discarded
)

# Fit on the training features, then reuse the fitted transformer for test.
X_train_p = preprocess.fit_transform(X_train)
X_test_p = preprocess.transform(X_test)

# -----------------------------
# 6) Build Neural Network (regression)
# -----------------------------
# Seed Python's `random`, NumPy and TensorFlow in one call.
# `tf.random.set_seed` seeds only TensorFlow's own generator, which is not
# sufficient for repeatable runs on every Keras version.
tf.keras.utils.set_random_seed(42)

# Two hidden ReLU layers and a single linear output unit for the price.
nn_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train_p.shape[1],)),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1),
])

# Optimize squared error; MAE is tracked only as a readable metric.
nn_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="mse",
    metrics=["mae"],
)

# Stop once validation loss has not improved for 30 epochs and roll the
# weights back to the best epoch seen.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=30,
    restore_best_weights=True,
)

# `epochs` is only an upper bound; early stopping normally ends training first.
# 20% of the training rows are held out by Keras as the validation set that
# drives `val_loss`.
history = nn_model.fit(
    X_train_p, y_train,
    validation_split=0.2,
    shuffle=True,
    epochs=2000,
    batch_size=32,
    callbacks=[early_stop],
)

# -----------------------------
# 7) Score the fitted network: MAE on the train and test splits
# -----------------------------
# Flatten the predictions to 1-D so they line up with the target arrays.
yhat_train = np.ravel(nn_model.predict(X_train_p, verbose=0))
yhat_test = np.ravel(nn_model.predict(X_test_p, verbose=0))

mae_train, mae_test = (
    mean_absolute_error(y_train, yhat_train),
    mean_absolute_error(y_test, yhat_test),
)

# Report both errors rounded to whole rupees with thousands separators.
print(f"Train MAE: {mae_train:,.0f} INR")
print(f"Test  MAE: {mae_test:,.0f} INR")

# -----------------------------
# 8) Plot: predicted vs actual on the TEST split
# -----------------------------
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(y_test, yhat_test, alpha=0.6)

# Reference diagonal y = x spanning the full range of both actual and
# predicted values; points on this line are perfect predictions.
min_v = min(y_test.min(), yhat_test.min())
max_v = max(y_test.max(), yhat_test.max())
diagonal = [min_v, max_v]
ax.plot(diagonal, diagonal, linewidth=2)

ax.set_title(f"Test set: Actual vs Predicted Selling Price\nMAE = {mae_test:,.0f} INR")
ax.set_xlabel("Actual selling_price (INR)")
ax.set_ylabel("Predicted selling_price (INR)")
fig.tight_layout()
plt.show()