# --- 1. Load Data --- 

In [2]:
import pandas as pd
import numpy as np

# Ensure all 4 CSV files are in the same directory as your notebook
df1 = pd.read_csv("datasets\car data.csv")
df2 = pd.read_csv("datasets\CAR DETAILS FROM CAR DEKHO.csv")
df3 = pd.read_csv("datasets\Car details v3.csv")
df4 = pd.read_csv("datasets\car details v4.csv")

# --- 2. Data Standardization and Initial Cleaning ---

In [4]:
# 1. df1 (car data.csv) - Rename columns and convert price (Lakhs to Rupees)
df1 = df1.rename(columns={
    'Car_Name': 'name', 'Selling_Price': 'selling_price', 'Kms_Driven': 'km_driven',
    'Fuel_Type': 'fuel', 'Seller_Type': 'seller_type', 'Transmission': 'transmission',
    'Owner': 'owner', 'Year': 'year'
})
df1['selling_price'] = df1['selling_price'] * 100000
owner_map_df1 = {0: 'First Owner', 1: 'Second Owner', 2: 'Third Owner', 3: 'Fourth & Above Owner'}
df1['owner'] = df1['owner'].replace(owner_map_df1)
df1_clean = df1[['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']]

In [5]:
# 2. df2 and 3 (already in good format)
df2_clean = df2[['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']]
df3_clean = df3[['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']]

In [6]:
# 4. df4 (car details v4.csv) - Rename columns and create combined 'name'
df4 = df4.rename(columns={
    'Price': 'selling_price', 'Year': 'year', 'Kilometer': 'km_driven',
    'Fuel Type': 'fuel', 'Seller Type': 'seller_type', 'Owner': 'owner',
    'Transmission': 'transmission'
})
df4['name'] = df4['Make'] + ' ' + df4['Model']
df4_clean = df4[['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']]

# Concatenate all four cleaned DataFrames
master_df = pd.concat([df1_clean, df2_clean, df3_clean, df4_clean], ignore_index=True)

# --- 3. Core Feature Engineering (Age, Brand, Vehicle Type) ---

In [None]:
current_year = 2024
master_df['age'] = current_year - master_df['year']
master_df.drop('year', axis=1, inplace=True)
master_df['brand'] = master_df['name'].apply(lambda x: x.split(' ')[0].strip())

# Separate Car vs. Bike (using common Indian bike brands)
bike_brands = ['Royal Enfield', 'Bajaj', 'TVS', 'Hero', 'Yamaha', 'Honda', 'KTM', 'Suzuki', 'Harley-Davidson', 'Kawasaki', 'Mahindra', 'Vespa', 'Jawa']
master_df['vehicle_type'] = master_df['brand'].apply(lambda x: 'Bike' if x in bike_brands else 'Car')

# Separate the two models
df_cars = master_df[master_df['vehicle_type'] == 'Car'].copy()
df_bikes = master_df[master_df['vehicle_type'] == 'Bike'].copy()

# --- 4. Feature Refinement: Owner Score and KM Bins (New Predictive Features) ---

In [8]:
# A. Owner Score Mapping
owner_mapping = {
    'First Owner': 1, 'Second Owner': 2, 'Third Owner': 3,
    'Fourth & Above Owner': 4, 'Test Drive Car': 0
}
df_cars['owner_score'] = df_cars['owner'].map(owner_mapping)
df_bikes['owner_score'] = df_bikes['owner'].map(owner_mapping)
df_cars.drop('owner', axis=1, inplace=True)
df_bikes.drop('owner', axis=1, inplace=True)


# B. Odometer Bins
# Define bins based on typical depreciation stages
km_bins = [0, 20000, 60000, 120000, np.inf]
km_labels = ['Low_Mileage', 'Medium_Mileage', 'High_Mileage', 'Very_High_Mileage']
df_cars['km_category'] = pd.cut(master_df['km_driven'], bins=km_bins, labels=km_labels, right=False)
df_bikes['km_category'] = pd.cut(master_df['km_driven'], bins=km_bins, labels=km_labels, right=False)

# Drop the original continuous 'km_driven' column (it still exists in master_df, but not df_cars)
# We can't drop 'km_driven' from df_cars because we removed it from master_df and then filtered, so km_driven is not there.
# Re-adding the original km_driven before binning:
df_cars['km_driven'] = master_df.loc[df_cars.index, 'km_driven']
df_bikes['km_driven'] = master_df.loc[df_bikes.index, 'km_driven']

df_cars['km_category'] = pd.cut(df_cars['km_driven'], bins=km_bins, labels=km_labels, right=False)
df_bikes['km_category'] = pd.cut(df_bikes['km_driven'], bins=km_bins, labels=km_labels, right=False)

df_cars.drop('km_driven', axis=1, inplace=True)
df_bikes.drop('km_driven', axis=1, inplace=True)

In [9]:
# --- 5. Final Dataframes (df_cars and df_bikes are ready for Phase 1, Part B) ---
print("Car Data (Ready for Modeling):")
print(df_cars.info())
print("\nBike Data (Ready for Modeling):")
print(df_bikes.info())

Car Data (Ready for Modeling):
<class 'pandas.core.frame.DataFrame'>
Index: 12616 entries, 0 to 14826
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   name           12616 non-null  object  
 1   selling_price  12616 non-null  float64 
 2   fuel           12616 non-null  object  
 3   seller_type    12616 non-null  object  
 4   transmission   12616 non-null  object  
 5   age            12616 non-null  int64   
 6   brand          12616 non-null  object  
 7   vehicle_type   12616 non-null  object  
 8   owner_score    10834 non-null  float64 
 9   km_category    12616 non-null  category
dtypes: category(1), float64(2), int64(1), object(6)
memory usage: 998.1+ KB
None

Bike Data (Ready for Modeling):
<class 'pandas.core.frame.DataFrame'>
Index: 2212 entries, 102 to 14827
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   name      

In [14]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import json
import os
import warnings

warnings.filterwarnings("ignore")

# --- 0. FILE STRUCTURE SETUP ---
MODELS_DIR = "models"
if not os.path.exists(MODELS_DIR):
    os.makedirs(MODELS_DIR)
    print(f"Created directory: {MODELS_DIR}")

# --- 1. FULL DATA PREPARATION ---
print("\n--- 1. Loading and Cleaning Data ---")
# NOTE: Using the file names directly as provided by the user, assuming they are in the current working directory or subfolders as needed.
# For the environment here, I'll use the file names accessible to the environment.
df1 = pd.read_csv("datasets\car data.csv")
df2 = pd.read_csv("datasets\CAR DETAILS FROM CAR DEKHO.csv")
df3 = pd.read_csv("datasets\Car details v3.csv")
df4 = pd.read_csv("datasets\car details v4.csv")

# Standardize df1
df1 = df1.rename(columns={'Car_Name': 'name', 'Selling_Price': 'selling_price', 'Kms_Driven': 'km_driven', 'Fuel_Type': 'fuel', 'Seller_Type': 'seller_type', 'Transmission': 'transmission', 'Owner': 'owner', 'Year': 'year'})
df1['selling_price'] *= 100000
owner_map_df1 = {0: 'First Owner', 1: 'Second Owner', 2: 'Third Owner', 3: 'Fourth & Above Owner'}
df1['owner'] = df1['owner'].replace(owner_map_df1)
df1_clean = df1[['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']]

# Standardize df4
df4 = df4.rename(columns={'Price': 'selling_price', 'Year': 'year', 'Kilometer': 'km_driven', 'Fuel Type': 'fuel', 'Seller Type': 'seller_type', 'Owner': 'owner', 'Transmission': 'transmission'})
df4['name'] = df4['Make'] + ' ' + df4['Model']
df4_clean = df4[['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']]

# Concatenate all four
master_df = pd.concat([df1_clean, df2[['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']], df3[['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']], df4_clean], ignore_index=True)

# Feature Engineering
current_year = 2024
master_df['age'] = current_year - master_df['year']
master_df.drop('year', axis=1, inplace=True)
master_df['brand'] = master_df['name'].apply(lambda x: x.split(' ')[0].strip())
bike_brands = ['Royal Enfield', 'Bajaj', 'TVS', 'Hero', 'Yamaha', 'Honda', 'KTM', 'Suzuki', 'Harley-Davidson', 'Kawasaki', 'Mahindra', 'Vespa', 'Jawa']
master_df['vehicle_type'] = master_df['brand'].apply(lambda x: 'Bike' if x in bike_brands else 'Car')

# Refinement: Owner Score and KM Bins
owner_mapping = {'First Owner': 1, 'Second Owner': 2, 'Third Owner': 3, 'Fourth & Above Owner': 4, 'Test Drive Car': 0}
master_df['owner_score'] = master_df['owner'].map(owner_mapping)
master_df.drop('owner', axis=1, inplace=True)

# CORRECT LINE 1: Capture max km value BEFORE dropping the column
max_km_value = int(master_df['km_driven'].max())

km_bins = [0, 20000, 60000, 120000, np.inf]
km_labels = ['Low_Mileage', 'Medium_Mileage', 'High_Mileage', 'Very_High_Mileage']
master_df['km_category'] = pd.cut(master_df['km_driven'], bins=km_bins, labels=km_labels, right=False)
master_df.drop('km_driven', axis=1, inplace=True)

# Final Separation
df_cars = master_df[master_df['vehicle_type'] == 'Car'].copy()
df_bikes = master_df[master_df['vehicle_type'] == 'Bike'].copy()

# --- 2. CAR MODEL TRAINING & ARTIFACTS ---
print("--- 2. Training Car Model and Saving Artifacts ---")
X_car = df_cars.drop(['selling_price', 'name', 'vehicle_type'], axis=1)
y_car_log = np.log1p(df_cars['selling_price'])
X_car['owner_score'].fillna(X_car['owner_score'].mode()[0], inplace=True)
categorical_features = ['brand', 'fuel', 'seller_type', 'transmission', 'km_category']
X_car_encoded = pd.get_dummies(X_car, columns=categorical_features, drop_first=True)
X_train_car, _, y_train_log_car, _ = train_test_split(X_car_encoded, y_car_log, test_size=0.2, random_state=42)
feature_names_car = X_car_encoded.columns.tolist()

best_params_car = {'n_estimators': 500, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
rf_best_car = RandomForestRegressor(**best_params_car, random_state=42, n_jobs=-1)
rf_best_car.fit(X_train_car, y_train_log_car)
joblib.dump(rf_best_car, f'{MODELS_DIR}/rf_best_car.pkl')

# --- 3. BIKE MODEL TRAINING & ARTIFACTS ---
print("--- 3. Training Bike Model and Saving Artifacts ---")
X_bike = df_bikes.drop(['selling_price', 'name', 'vehicle_type'], axis=1)
y_bike_log = np.log1p(df_bikes['selling_price'])
X_bike['owner_score'].fillna(X_bike['owner_score'].mode()[0], inplace=True)
X_bike_encoded = pd.get_dummies(X_bike, columns=categorical_features, drop_first=True)
X_train_bike, _, y_train_log_bike, _ = train_test_split(X_bike_encoded, y_bike_log, test_size=0.2, random_state=42)
feature_names_bike = X_bike_encoded.columns.tolist()

best_params_bike = {'n_estimators': 300, 'max_depth': 15, 'min_samples_split': 5}
rf_best_bike = RandomForestRegressor(**best_params_bike, random_state=42, n_jobs=-1)
rf_best_bike.fit(X_train_bike, y_train_log_bike)
joblib.dump(rf_best_bike, f'{MODELS_DIR}/rf_best_bike.pkl')

# --- 4. UI METADATA ARTIFACT GENERATION (FIXED) ---

car_brands_list = sorted(df_cars['brand'].unique().tolist())
bike_brands_list = sorted(df_bikes['brand'].unique().tolist())

min_age_py = int(master_df['age'].min())
max_age_py = int(master_df['age'].max())
# CORRECT LINE 2: Use the pre-calculated variable, not the dropped column
max_km_py = max_km_value 

ui_metadata = {
    'car_brands': car_brands_list,
    'bike_brands': bike_brands_list,
    'fuel': sorted(master_df['fuel'].unique().tolist()),
    'transmission': sorted(master_df['transmission'].unique().tolist()),
    'seller_type': sorted(master_df['seller_type'].unique().tolist()),
    'owner_type': sorted(master_df['owner_score'].map({1: 'First Owner', 2: 'Second Owner', 3: 'Third Owner', 4: 'Fourth & Above Owner', 0: 'Test Drive Car'}).dropna().unique().tolist()),
    'min_age': min_age_py,
    'max_age': max_age_py,
    'max_km': max_km_py,
    'car_feature_cols': feature_names_car,
    'bike_feature_cols': feature_names_bike
}

# Save UI Metadata as JSON
with open('ui_metadata.json', 'w') as f:
    json.dump(ui_metadata, f, indent=4)
    
print("\n✅ All necessary files have been created in the correct directories.")
print("   - models/rf_best_car.pkl")
print("   - models/rf_best_bike.pkl")
print("   - ui_metadata.json")


--- 1. Loading and Cleaning Data ---
--- 2. Training Car Model and Saving Artifacts ---
--- 3. Training Bike Model and Saving Artifacts ---

✅ All necessary files have been created in the correct directories.
   - models/rf_best_car.pkl
   - models/rf_best_bike.pkl
   - ui_metadata.json
