In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
# Read the raw training data, then discard the columns this notebook treats
# as irrelevant and every row that has no 'price' target. Written as a single
# chain so `data` is produced in one step instead of being mutated in place.
data = (
    pd.read_csv("train1.csv")
    .drop(columns=['id', 'brand', 'model', 'engine'])
    .dropna(subset=['price'])
)

In [5]:
# Feature engineering: log-transform the target and derive the vehicle age.
#
# 'model_year' doubles as the "not yet processed" marker: it only exists
# before this cell has run. Guarding on it makes the cell safe to re-run.
# Without the guard, a second run would apply log1p to 'price' again and then
# fail with a KeyError on the already-dropped 'model_year'.
REFERENCE_YEAR = 2024  # year against which vehicle age is measured

if 'model_year' in data.columns:
    data = data.assign(
        # log1p is applied to 'price' to reduce its skewness
        price=np.log1p(data['price']),
        # Age in years relative to REFERENCE_YEAR; replaces 'model_year'
        vehicle_age=REFERENCE_YEAR - data['model_year'],
    ).drop(columns=['model_year'])
elif 'vehicle_age' not in data.columns:
    # Neither the input column nor the derived column is present, so the
    # frame is in a state this cell does not understand: fail loudly rather
    # than silently skipping the transformation.
    raise KeyError(
        "'model_year' column not found and 'vehicle_age' has not been created"
    )

In [6]:
# Trim extreme 'price' values with the 1.5 * IQR rule.
# NOTE(review): the bounds are derived from, and the filter applied to, the
# whole frame, including rows that the later train_test_split cell assigns
# to the test set. Confirm that filtering test rows by their target is intended.
Q1 = data['price'].quantile(0.25)
Q3 = data['price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only rows whose price lies inside [lower_bound, upper_bound], inclusive.
data = data.loc[data['price'].between(lower_bound, upper_bound)]

In [7]:
# Separate the target from the predictors.
y = data['price']
X = data.drop(columns=['price'])

# An object-dtype column counts as categorical only when it has at most 3
# distinct values; object columns with more distinct values end up in
# neither of the two lists below.
object_nunique = X.select_dtypes(include=['object']).nunique()
categorical_cols = object_nunique[object_nunique <= 3].index.tolist()

# Numerical features: every int64 / float64 column.
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

In [8]:
# Numerical branch: fill missing values with the column mean, then scale with
# MinMaxScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
]
num_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode (handle_unknown='ignore').
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
cat_transformer = Pipeline(steps=categorical_steps)

In [9]:
# Route each column group through its own preprocessing branch:
# numerical_cols -> num_transformer, categorical_cols -> cat_transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_cols),
        ('cat', cat_transformer, categorical_cols),
    ],
)

In [10]:
# Hold out a test_size=0.2 share of the rows for evaluation; random_state is
# fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# XGBoost regressor with a squared-error objective and a fixed seed.
# (The variable is named `xgboost`, like the package it comes from.)
xgboost = XGBRegressor(objective='reg:squarederror', random_state=42)

# End-to-end estimator: preprocessing followed by the regressor.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgboost),
])

# Candidate values per XGBoost hyperparameter (3 values each, 5 parameters).
regressor_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

# Prefix every key with the pipeline step name so the search addresses the
# 'regressor' step of `pipeline`.
param_grid = {
    f'regressor__{name}': values for name, values in regressor_grid.items()
}


In [12]:
# Search every combination in param_grid with 3-fold cross-validation, scored
# by negative mean squared error.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the best hyperparameter combination found by the search.
print("Best Parameters:", grid_search.best_params_)


Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best Parameters: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 5, 'regressor__n_estimators': 300, 'regressor__subsample': 1.0}


In [13]:
# Score the best pipeline from the grid search on the held-out test set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Undo the log1p transform so errors can also be read on the original scale.
y_test_actual, y_pred_actual = np.expm1(y_test), np.expm1(y_pred)

# Metrics on the log-transformed scale (the scale the model was trained on).
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Metrics on log-transformed scale:\n  RMSE: {rmse:.2f}\n  R2: {r2:.2f}")

# The same metrics after mapping both series back with expm1.
final_mse, final_r2 = (
    mean_squared_error(y_test_actual, y_pred_actual),
    r2_score(y_test_actual, y_pred_actual),
)
final_rmse = np.sqrt(final_mse)

print(f"Metrics on original scale:\n  RMSE: {final_rmse:.2f}\n  R2: {final_r2:.2f}")


Metrics on log-transformed scale:
  RMSE: 0.48
  R2: 0.62
Metrics on original scale:
  RMSE: 26076.45
  R2: 0.38
