## Random forest Regression Implementation

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [3]:
df=pd.read_csv('cardekho_imputated.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,min_cost_price,max_cost_price,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,357003.9,465401.5,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,711000.0,748000.0,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,854082.9,1307926.0,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,357003.9,465401.5,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,1014000.0,1379000.0,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


## Data Cleaning

### Handling Missing Values

In [5]:
df.isnull().sum()

Unnamed: 0           0
car_name             0
brand                0
model                0
min_cost_price       0
max_cost_price       0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [6]:
df.drop('car_name',axis=1,inplace=True)
df.drop('brand',axis=1,inplace=True)

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,model,min_cost_price,max_cost_price,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Alto,357003.9,465401.5,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Grand,711000.0,748000.0,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,i20,854082.9,1307926.0,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Alto,357003.9,465401.5,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ecosport,1014000.0,1379000.0,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [8]:
df['model'].unique()

array(['Alto', 'Grand', 'i20', 'Ecosport', 'Wagon R', 'i10', 'Venue',
       'Swift', 'Verna', 'Duster', 'Cooper', 'Ciaz', 'C-Class', 'Innova',
       'Baleno', 'Swift Dzire', 'Vento', 'Creta', 'City', 'Bolero',
       'Fortuner', 'KWID', 'Amaze', 'Santro', 'XUV500', 'KUV100', 'Ignis',
       'RediGO', 'Scorpio', 'Marazzo', 'Aspire', 'Figo', 'Vitara',
       'Tiago', 'Polo', 'Seltos', 'Celerio', 'GO', '5', 'CR-V',
       'Endeavour', 'KUV', 'Jazz', '3', 'A4', 'Tigor', 'Ertiga', 'Safari',
       'Thar', 'Hexa', 'Rover', 'Eeco', 'A6', 'E-Class', 'Q7', 'Z4', '6',
       'XF', 'X5', 'Hector', 'Civic', 'D-Max', 'Cayenne', 'X1', 'Rapid',
       'Freestyle', 'Superb', 'Nexon', 'XUV300', 'Dzire VXI', 'S90',
       'WR-V', 'XL6', 'Triber', 'ES', 'Wrangler', 'Camry', 'Elantra',
       'Yaris', 'GL-Class', '7', 'S-Presso', 'Dzire LXI', 'Aura', 'XC',
       'Ghibli', 'Continental', 'CR', 'Kicks', 'S-Class', 'Tucson',
       'Harrier', 'X3', 'Octavia', 'Compass', 'CLS', 'redi-GO', 'Glanza',
       

In [9]:
num_feature=[feature for feature in df.columns if df[feature].dtype!='O']
print('Numerical Featuere:',len(num_feature))
cat_feature=[feature for feature in df.columns if df[feature].dtype=='O']
print('Categorical Featuere:',len(cat_feature))
disc_feature=[feature for feature in df.columns if len(df[feature].unique())<=25]
print('Discrete  Featuere:',len(disc_feature))
con_feature=[feature for feature in df.columns if feature not in disc_feature]
print('Continous  Featuere:',len(con_feature))

Numerical Featuere: 10
Categorical Featuere: 4
Discrete  Featuere: 5
Continous  Featuere: 9


In [10]:
# Independent and dependent features
from sklearn.model_selection import train_test_split
X=df.drop(['selling_price'],axis=1)
y=df['selling_price']

In [11]:
X.head()

Unnamed: 0.1,Unnamed: 0,model,min_cost_price,max_cost_price,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,0,Alto,357003.9,465401.5,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,1,Grand,711000.0,748000.0,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,2,i20,854082.9,1307926.0,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,3,Alto,357003.9,465401.5,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,4,Ecosport,1014000.0,1379000.0,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [12]:
len(df['model'].unique())

120

In [13]:
## FEATURE Encoding And Scaling
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
X['model']=le.fit_transform(X['model'])

In [14]:
df['model'].value_counts()

model
i20             906
Swift Dzire     890
Swift           781
Alto            778
City            757
               ... 
Altroz            1
C                 1
Ghost             1
Quattroporte      1
Gurkha            1
Name: count, Length: 120, dtype: int64

In [15]:
X.head()

Unnamed: 0.1,Unnamed: 0,model,min_cost_price,max_cost_price,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,0,7,357003.9,465401.5,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,1,54,711000.0,748000.0,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,2,118,854082.9,1307926.0,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,3,7,357003.9,465401.5,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,4,38,1014000.0,1379000.0,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [16]:
len(df['seller_type'].unique()),len(df['transmission_type'].unique()),len(df['seller_type'].unique()),len(df['fuel_type'].unique())

(3, 2, 3, 5)

In [17]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Get column names safely
num_feature = X.select_dtypes(exclude="object").columns.tolist()
onehot_columns = ['seller_type', 'fuel_type', 'transission_type']

# Ensure onehot_columns exist in X
onehot_columns = [col for col in onehot_columns if col in X.columns]

# Define transformers
numeric_transformation = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

# Create column transformer
preprocessor = ColumnTransformer(
    [
        ("OnehotEncoder", oh_transformer, onehot_columns),
        ("StandardScaler", numeric_transformation, num_feature)
    ], 
    remainder='passthrough'
)

# Apply transformation
X_transformed = preprocessor.fit_transform(X)


In [18]:
X=preprocessor.fit_transform(X)

In [19]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape,X_test.shape

((12328, 17), (3083, 17))

In [20]:
X_train

array([[0.0, 0.0, 1.0, ..., 2.6624977092652693, -0.4030224142720608,
        'Automatic'],
       [1.0, 0.0, 0.0, ..., -0.38602843979097634, -0.4030224142720608,
        'Manual'],
       [0.0, 0.0, 1.0, ..., 3.274530058274271, -0.4030224142720608,
        'Automatic'],
       ...,
       [1.0, 0.0, 0.0, ..., -0.7807078557298611, -0.4030224142720608,
        'Manual'],
       [0.0, 0.0, 0.0, ..., -0.43582879062288754, -0.4030224142720608,
        'Manual'],
       [1.0, 0.0, 0.0, ..., 0.061942005776449066, -0.4030224142720608,
        'Automatic']], shape=(12328, 17), dtype=object)

## MOdel Trainig And Model Selection

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression,Lasso,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score,classification_report,precision_score,recall_score,f1_score,confusion_matrix,roc_auc_score,roc_curve,mean_absolute_error,mean_squared_error,r2_score

In [22]:
## Create a Function To Evulate Model
def evualaet_model(true,predicted):
    mae=mean_squared_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2_score=r2_score(true,predicted)
    return mae,rmse,r2_score

In [25]:

import pandas as pd

# Get feature names after transformation
feature_names = (
    preprocessor.get_feature_names_out()
    if hasattr(preprocessor, "get_feature_names_out")
    else None
)

# Apply transformation
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Convert NumPy array to DataFrame with column names
X_train = pd.DataFrame(X_train_transformed, columns=feature_names)
X_test = pd.DataFrame(X_test_transformed, columns=feature_names)

# Check the data types
print(X_train.dtypes)

ValueError: input_features is not equal to feature_names_in_

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, roc_auc_score,
    mean_absolute_error, mean_squared_error, r2_score
)

# Define models (separate classification and regression)
models = {
    "Linear Regression": LinearRegression(),
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Nearest Regressor": KNeighborsRegressor(),
}

# Loop through models
for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Train model
    model.fit(X_train, y_train)
    
    # Make Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print(f"\n{name} Performance:")

    # Classification models
    if name in ["Logistic Regression", "Random Forest", "Decision Tree"]:
        train_accuracy = accuracy_score(y_train, y_train_pred)
        train_f1 = f1_score(y_train, y_train_pred, average='weighted')
        train_precision = precision_score(y_train, y_train_pred, average='weighted')
        train_recall = recall_score(y_train, y_train_pred, average='weighted')
        train_auc = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1]) if hasattr(model, "predict_proba") else None

        test_accuracy = accuracy_score(y_test, y_test_pred)
        test_f1 = f1_score(y_test, y_test_pred, average='weighted')
        test_precision = precision_score(y_test, y_test_pred, average='weighted')
        test_recall = recall_score(y_test, y_test_pred, average='weighted')
        test_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) if hasattr(model, "predict_proba") else None

        print(f"Training Accuracy: {train_accuracy:.4f}")
        print(f"Training F1 Score: {train_f1:.4f}")
        print(f"Training Precision: {train_precision:.4f}")
        print(f"Training Recall: {train_recall:.4f}")
        print(f"Training ROC AUC Score: {train_auc:.4f}" if train_auc else "ROC AUC not available")

        print("\nTest Performance:")
        print(f"Test Accuracy: {test_accuracy:.4f}")
        print(f"Test F1 Score: {test_f1:.4f}")
        print(f"Test Precision: {test_precision:.4f}")
        print(f"Test Recall: {test_recall:.4f}")
        print(f"Test ROC AUC Score: {test_auc:.4f}" if test_auc else "ROC AUC not available")

    # Regression models
    else:
        train_mae = mean_absolute_error(y_train, y_train_pred)
        train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
        train_r2 = r2_score(y_train, y_train_pred)

        test_mae = mean_absolute_error(y_test, y_test_pred)
        test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)
        test_r2 = r2_score(y_test, y_test_pred)

        print(f"Training MAE: {train_mae:.4f}")
        print(f"Training RMSE: {train_rmse:.4f}")
        print(f"Training R2 Score: {train_r2:.4f}")

        print("\nTest Performance:")
        print(f"Test MAE: {test_mae:.4f}")
        print(f"Test RMSE: {test_rmse:.4f}")
        print(f"Test R2 Score: {test_r2:.4f}")

    print("-" * 50)



Training Linear Regression...


ValueError: could not convert string to float: 'Automatic'