In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings("ignore")

# Read the raw listings and discard exact duplicate rows. Rows with missing
# values are kept on purpose so they can be imputed further down.
df = pd.read_csv(r"C:\Users\Tejas Gaikwad\Downloads\cardekho.csv").drop_duplicates()

# Reduce "name" to the brand, i.e. its first whitespace-separated token.
df["name"] = [full_name.split()[0] for full_name in df["name"]]

# Clean numeric columns (remove text, convert to int64)
def extract_number(value):
    """Return the leading numeric token of ``value`` as a truncated integer.

    ``value`` is converted to ``str`` and split on whitespace; the first token
    is parsed with ``float`` and truncated toward zero with ``int`` (for
    example ``"23.4 kmpl"`` becomes ``23``).

    Parameters
    ----------
    value : object
        Anything with a string form, e.g. ``"1248 CC"``, ``103.52`` or NaN.

    Returns
    -------
    int or float
        The truncated integer, or ``np.nan`` when no finite number can be
        parsed (blank string, non-numeric text, NaN, infinity).
    """
    try:
        return int(float(str(value).split()[0]))
    except (ValueError, IndexError, OverflowError):
        # ValueError: token is not numeric, or int() was given NaN.
        # IndexError: the string form is empty/blank, so there is no token.
        # OverflowError: int() was given +/-infinity.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return np.nan

# Strip the unit text from each spec column via extract_number, leaving
# truncated integers, or NaN where no number could be parsed.
for spec_col in ("mileage(km/ltr/kg)", "engine", "max_power"):
    df[spec_col] = df[spec_col].apply(extract_number)

# Encode categorical features as integer codes.
# The result of replace() is assigned back to the column. Calling
# `df[col].replace(..., inplace=True)` mutates a temporary Series; under
# pandas Copy-on-Write (the default from pandas 3.0) that leaves `df`
# unchanged, and the warning about it is hidden by the global warnings filter.
df["fuel"] = df["fuel"].replace(['Diesel', 'Petrol', 'LPG', 'CNG'], [1, 2, 3, 4])
df["seller_type"] = df["seller_type"].replace(
    ['Individual', 'Dealer', 'Trustmark Dealer'], [1, 2, 3]
)
df["transmission"] = df["transmission"].replace(['Manual', 'Automatic'], [1, 2])
df["owner"] = df["owner"].replace(
    ['First Owner', 'Second Owner', 'Third Owner',
     'Fourth & Above Owner', 'Test Drive Car'],
    [1, 2, 3, 4, 5],
)

# Encode brand names as 1-based codes in order of first appearance, so the
# code assigned to a brand depends on the row order of the loaded data.
brands = df["name"].unique()
brand_mapping = {brand: idx + 1 for idx, brand in enumerate(brands)}
df["name"] = df["name"].replace(brand_mapping)

# 🔹 Handle missing values
# Numeric spec columns are imputed with the column median. The filled Series
# is assigned back instead of calling `df[col].fillna(..., inplace=True)`:
# the in-place form operates on a temporary Series and, under pandas
# Copy-on-Write (the default from pandas 3.0), would leave the NaNs in `df`,
# making the later int64 casts fail.
df["mileage(km/ltr/kg)"] = df["mileage(km/ltr/kg)"].fillna(df["mileage(km/ltr/kg)"].median())
df["engine"] = df["engine"].fillna(df["engine"].median())
df["max_power"] = df["max_power"].fillna(df["max_power"].median())

# Handle seats column (fill with mode, then store as whole numbers)
if "seats" in df.columns:
    df["seats"] = df["seats"].fillna(df["seats"].mode()[0]).astype("int64")

# Cast the cleaned spec columns to 64-bit integers.
for int_col in ("mileage(km/ltr/kg)", "engine", "max_power"):
    df[int_col] = df[int_col].astype("int64")

# Report how many nulls remain in each column.
remaining_nulls = df.isnull().sum()
print("Missing values after cleaning:\n", remaining_nulls)

# Separate the target (y) from the predictor columns (X).
y = df["selling_price"]
X = df.drop("selling_price", axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit a linear regression on the training portion.
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test)

# Report the model's score on both portions, plus the first few predictions.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("Training Score:", train_score)
print("Testing Score:", test_score)
print("Sample Predictions:", y_pred[:10])
# Score one hand-written car. The values are given in the same column order
# as the listed feature names; "name" and the other categoricals are passed
# as already-encoded integer codes.
sample_columns = [
    'name', 'year', 'km_driven', 'fuel', 'seller_type', 'transmission',
    'owner', 'mileage(km/ltr/kg)', 'engine', 'max_power', 'seats',
]
sample_rows = [[2, 2014, 120000, 1, 1, 1, 1, 12.99, 2490.0, 100.6, 8]]
x_test = pd.DataFrame(sample_rows, columns=sample_columns)
print(x_test)

# Note: this rebinds y_pred, replacing the test-set predictions computed above.
y_pred = model.predict(x_test)
print("Predicted Price :", y_pred)
import pickle as pk

# Persist the fitted model to the current working directory. The context
# manager flushes and closes the file even if pickling raises, so no open
# handle is left behind.
# NOTE(review): only the estimator is saved; the brand-to-code mapping built
# during preprocessing is not, so whoever loads model.pkl presumably needs
# that mapping from elsewhere. Confirm.
with open('model.pkl', 'wb') as model_file:
    pk.dump(model, model_file)

Missing values after cleaning:
 name                  0
year                  0
selling_price         0
km_driven             0
fuel                  0
seller_type           0
transmission          0
owner                 0
mileage(km/ltr/kg)    0
engine                0
max_power             0
seats                 0
dtype: int64
Training Score: 0.627043020947607
Testing Score: 0.5676735595639015
Sample Predictions: [ 355954.93748186 1199440.11492546  763064.89488523   -2851.46695456
  815692.9598647   830219.04107445  344071.9398727   118524.89612292
  228401.91084791  579600.69372968]
   name  year  km_driven  fuel  seller_type  transmission  owner  \
0     2  2014     120000     1            1             1      1   

   mileage(km/ltr/kg)  engine  max_power  seats  
0               12.99  2490.0      100.6      8  
Predicted Price : [625521.72512539]
