## Assignment Day 27

In [1]:
# Import dependencies
import json
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Show every column when a DataFrame is displayed (None removes the column limit)
pd.options.display.max_columns = None

In [2]:
# Read the data
def load_data(path:str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    Args:
        path (str): location of the CSV file to read

    Returns:
        pd.DataFrame: the parsed contents of the file
    """
    return pd.read_csv(path)

In [3]:
# Load the raw car listings and display them
df = load_data(path='../artifacts/car.csv')
df

Unnamed: 0.1,Unnamed: 0,name,year,Price,kms_driven,fuel_type,company
0,0,Hyundai Santro Xing,2007,80000,45000,Petrol,Hyundai
1,1,Mahindra Jeep CL550,2006,425000,40,Diesel,Mahindra
2,2,Hyundai Grand i10,2014,325000,28000,Petrol,Hyundai
3,3,Ford EcoSport Titanium,2014,575000,36000,Diesel,Ford
4,4,Ford Figo,2012,175000,41000,Diesel,Ford
...,...,...,...,...,...,...,...
811,811,Maruti Suzuki Ritz,2011,270000,50000,Petrol,Maruti
812,812,Tata Indica V2,2009,110000,30000,Diesel,Tata
813,813,Toyota Corolla Altis,2009,300000,132000,Petrol,Toyota
814,814,Tata Zest XM,2018,260000,27000,Diesel,Tata


In [4]:
def load_data(path:str) -> pd.DataFrame:
    """Read the CSV file at ``path`` and return it as a DataFrame.

    Args:
        path (str): location of the CSV file to read

    Returns:
        pd.DataFrame: the parsed contents of the file
    """
    # NOTE(review): this repeats the load_data defined in an earlier cell;
    # consider keeping a single definition.
    frame = pd.read_csv(filepath_or_buffer=path)
    return frame

def dataPreparation(df: pd.DataFrame, target: str = 'Price', drop: str = 'Unnamed: 0',
                    reference_year: int = 2025):
    """Clean the raw listings and split them into features and target.

    Steps, in order: drop the ``drop`` column, remove rows whose target lies
    outside the 1.5 * IQR fences, replace the target with ``np.log1p(target)``,
    derive ``car_age`` from ``year`` and drop ``year``.

    Args:
        df (pd.DataFrame): raw data containing the ``target``, ``drop`` and
            ``year`` columns. The caller's frame is not modified.
        target (str): name of the column to predict.
        drop (str): name of the column to discard before processing.
        reference_year (int): year from which ``car_age`` is measured
            (``car_age = reference_year - year``).

    Returns:
        tuple: ``(X, y)`` where ``X`` is the feature DataFrame (every remaining
        column except ``target``) and ``y`` is the log1p-transformed target
        Series.
    """
    # Drop unnecessary column
    df = df.drop(columns=[drop])

    # Remove outliers using IQR
    Q1 = df[target].quantile(0.25)
    Q3 = df[target].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # .copy() makes the filtered rows an independent frame, so the column
    # assignments below do not raise SettingWithCopyWarning.
    df = df[(df[target] >= lower_bound) & (df[target] <= upper_bound)].copy()

    # Log transform target to handle skewness (inverse is np.expm1)
    df[target] = np.log1p(df[target])

    # Create new features
    df['car_age'] = reference_year - df['year']

    # Drop original 'year' column (redundant once car_age is derived from it)
    df = df.drop(columns=['year'])

    # Split our data into X and y
    X = df.drop(columns=[target])
    y = df[target]
    print(X.columns)
    return X, y

def processingPipeline(X: pd.DataFrame, y: pd.Series, test_size: float = 0.2, random_state: int = 42):
    """Build the preprocessing + regression pipeline and split the data.

    The pipeline one-hot encodes the categorical columns, standardises the
    numeric columns and feeds the result to a ``LinearRegression``. The
    pipeline is returned unfitted.

    Args:
        X (pd.DataFrame): features; must contain the 'company', 'fuel_type',
            'name', 'kms_driven' and 'car_age' columns.
        y (pd.Series): target values aligned with ``X``.
        test_size (float): value forwarded to ``train_test_split`` as
            ``test_size``.
        random_state (int): seed forwarded to ``train_test_split`` so the
            split is reproducible.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, pipe)``.
    """
    categorical_cols = ['company', 'fuel_type', 'name']
    numeric_cols = ['kms_driven', 'car_age']

    # handle_unknown='ignore' lets the encoder accept categories at predict
    # time that were not present when it was fitted.
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    scaler = StandardScaler()

    column_trans = ColumnTransformer(
        transformers=[
            ('ohe', ohe, categorical_cols),    # Apply OneHotEncoder to categorical columns
            ('scaler', scaler, numeric_cols),  # Apply StandardScaler to numeric columns
        ]
    )

    # Create the pipeline
    pipe = Pipeline(steps=[
        ('preprocessor', column_trans),  # Preprocessing step
        ('model', LinearRegression())    # Model step
    ])

    # Split our data into X_train, X_test, y_train, y_test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test, pipe

def predict(X_train, X_test, y_train, y_test, pipe):
    """Fit ``pipe`` on the training split and print test-set R2 and MAE.

    The targets are expected on the log1p scale; both the true and predicted
    test values are mapped back with ``np.expm1`` before scoring.

    Args:
        X_train: training features.
        X_test: test features.
        y_train: training target.
        y_test: test target.
        pipe: estimator exposing ``fit`` and ``predict``; it is fitted in place.
    """
    # Train on the training split, then score on the held-out split
    pipe.fit(X_train, y_train)
    log_predictions = pipe.predict(X_test)

    # Undo the log transformation so the metrics are on the original scale
    actual = np.expm1(y_test)
    predicted = np.expm1(log_predictions)

    r2 = r2_score(actual, predicted)
    print(f"R2 Test -> {r2}")
    mae = mean_absolute_error(actual, predicted)
    print(f"MAE Test -> {mae}")

In [5]:
# Run the whole workflow: load -> prepare -> build pipeline and split -> fit and evaluate
df = load_data(path='../artifacts/car.csv')
X, y = dataPreparation(df=df)
X_train, X_test, y_train, y_test, pipe = processingPipeline(X=X, y=y)
predict(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, pipe=pipe)


Index(['name', 'kms_driven', 'fuel_type', 'company', 'car_age'], dtype='object')
R2 Test -> 0.7882185331665715
MAE Test -> 53232.360831496895


In [17]:
# Build the single-row input from a dict so each feature keeps its own dtype.
# (Packing mixed str/int values into one np.array coerces every value to a string.)
# NOTE(review): the name is a Mahindra model but company is 'Maruti' - confirm this is intended.
sample_row = {
    'name': 'Mahindra Quanto C8',
    'kms_driven': 100000,
    'fuel_type': 'Petrol',
    'company': 'Maruti',
    'car_age': 10,
}
sample = pd.DataFrame([sample_row], columns=X_test.columns)

result = pipe.predict(sample)
# The model predicts log1p(price); expm1 maps it back to the price scale
np.expm1(result)

array([239316.9927879])

In [20]:
model_path = os.path.join('../artifacts', 'best_model.joblib')

# Use a context manager so the file handle is closed once the dump finishes
with open(model_path, 'wb') as model_file:
    joblib.dump(pipe, model_file)

print(f"Model saved to {model_path}")

Model saved to ../artifacts\best_model.joblib


In [7]:
pred_train = pipe.predict(X_train)
pred_test = pipe.predict(X_test)

# The target was transformed with np.log1p, so np.expm1 is its exact inverse
y_train_orig, pred_train_orig = np.expm1(y_train), np.expm1(pred_train)
y_test_orig, pred_test_orig = np.expm1(y_test), np.expm1(pred_test)

# Calculate metrics
# R2 is measured on the log1p scale (the scale the model predicts on);
# RMSE and MAE are measured on the original price scale.
metrics = {
    'train_r2': float(r2_score(y_train, pred_train)),
    'test_r2': float(r2_score(y_test, pred_test)),
    'train_rmse': float(np.sqrt(mean_squared_error(y_train_orig, pred_train_orig))),
    'test_rmse': float(np.sqrt(mean_squared_error(y_test_orig, pred_test_orig))),
    'train_mae': float(mean_absolute_error(y_train_orig, pred_train_orig)),
    'test_mae': float(mean_absolute_error(y_test_orig, pred_test_orig))
}

with open('../artifacts/metrics.json', 'w') as f:
    json.dump(metrics, f, indent=4)

In [12]:
# Re-attach the target (mapped back from log1p with expm1) to the features and save the cleaned data
result = pd.concat(objs=[X, np.expm1(y)], axis='columns')
result.to_csv('../artifacts/new_car.csv', index=False)