In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the training CSV from the relative data directory.
loaded_data = pd.read_csv(filepath_or_buffer='../data/topic21_v23_train.csv')

# Print column dtypes, non-null counts and memory usage of the raw frame.
loaded_data.info()

In [None]:
def remove_outliers(df, threshold=2.5, keep_na=False):
    """Drop rows whose numeric values fall outside IQR-based fences.

    Every ``int64``/``float64`` column is processed in column order. For each
    one, rows are kept only when the value lies inside
    ``[Q1 - threshold * IQR, Q3 + threshold * IQR]``. The quartiles of a column
    are computed on the rows that survived the filtering of the previous
    columns, so the result depends on column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a filtered copy is returned.
    threshold : float, default 2.5
        Multiplier applied to the inter-quartile range to build the fences.
    keep_na : bool, default False
        A missing value compares ``False`` against both fences, so by default
        any row with NaN in a numeric column is removed together with the
        outliers. Pass ``True`` to treat missing values as "not an outlier"
        and keep those rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that passed the filter for every numeric column,
        with their original index labels.
    """
    df_clean = df.copy()
    initial_rows = len(df)
    numeric_cols = df_clean.select_dtypes(include=['int64', 'float64']).columns

    for col in numeric_cols:
        values = df_clean[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - threshold * iqr
        upper_bound = q3 + threshold * iqr

        in_range = (values >= lower_bound) & (values <= upper_bound)
        if keep_na:
            in_range = in_range | values.isna()
        df_clean = df_clean[in_range]

    removed_rows = initial_rows - len(df_clean)
    # Guard the percentage: an empty input would otherwise divide by zero.
    removed_pct = (removed_rows / initial_rows) * 100 if initial_rows else 0.0
    print(f"Removed {removed_rows} rows ({removed_pct:.2f}% of data)")

    return df_clean


In [None]:
from sklearn.model_selection import train_test_split

# Require columns '0'..'4' to be present, then strip IQR outliers from the
# remaining rows. (Disabled alternative: loaded_data.dropna() would discard
# every row with any missing value.)
loaded_data = remove_outliers(
    loaded_data.dropna(subset=['0', '1', '2', '3', '4'])
)

# Re-inspect the frame after cleaning.
loaded_data.info()

In [None]:
# Work on a copy so the column edits that follow do not touch loaded_data.
df = loaded_data.copy()

def extract_range_mean(val):
    """Return the mean of the integers in a dash-separated range string.

    The substrings ``'cc'`` and ``'HP'`` are removed, the remainder is split on
    ``'-'`` and every piece is parsed with ``int``; e.g. ``'1000-2000cc'``
    gives ``1500.0`` and ``'150HP'`` gives ``150.0``.

    Parameters
    ----------
    val : str
        Text to parse. Non-string values (such as NaN or None) are accepted
        and yield NaN.

    Returns
    -------
    float
        Mean of the parsed integers, or ``np.nan`` when ``val`` is not a
        string or any piece is not an integer literal.
    """
    try:
        cleaned = val.replace('cc', '').replace('HP', '')
        nums = [int(piece) for piece in cleaned.split('-')]
    except (AttributeError, TypeError, ValueError):
        # Only parsing failures map to NaN; anything else (for example a
        # KeyboardInterrupt during a long .apply) must propagate.
        return np.nan
    return np.mean(nums)

# Replace the textual range columns with their numeric midpoints.
df['engine_capacity'] = df['engine_capacity_cc'].apply(extract_range_mean)
df['horsepower_val'] = df['horsepower'].apply(extract_range_mean)
df = df.drop(columns=['engine_capacity_cc', 'horsepower'])

# Mean-target encoding: each category value is replaced by the average price
# of the rows sharing it.
# NOTE(review): the means are computed over every row of df, i.e. before the
# train/validation split performed later in the notebook, so validation
# targets leak into these features. Consider fitting the encoding on the
# training rows only.
for col in ['brand', 'model', 'trim']:
    means = df['price'].groupby(df[col]).mean()
    df[f'{col}_enc'] = df[col].map(means)

df = df.drop(columns=['brand', 'model', 'trim'])

df.info()

In [None]:
# Separate the target column from the predictors.
y = df['price']
X = df.drop(columns=['price'])

# Disabled alternative: a three-way train/valid/test split.
# X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train, X_valid, y_train, y_valid = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Hold out 20% of the rows for validation; the seed is fixed so the split is
# repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f'X_train shape = {X_train.shape}')

In [None]:
# y_train = np.log1p(y_train)

In [None]:
# Partition the training columns by dtype; the two lists are used to route
# columns to the numeric and categorical preprocessing branches.
numerical_features = list(
    X_train.select_dtypes(include=['int64', 'float64']).columns
)
categorical_features = list(
    X_train.select_dtypes(include=['object']).columns
)

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric branch: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own list of columns.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

pipeline = Pipeline([('preprocessor', preprocessor)])

# Fit on the training rows only; the validation rows are transformed with the
# statistics learned from training.
X_train_transformed = pipeline.fit_transform(X_train)
X_valid_transformed = pipeline.transform(X_valid)


In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Baseline candidates, all with default hyper-parameters. Every estimator that
# uses randomness is seeded with 42 so the comparison table is reproducible
# from run to run.
models = {
    "SVR": SVR(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "ExtraTrees": ExtraTreesRegressor(random_state=42),
    "XGBoost": XGBRegressor(verbosity=0, random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),

    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "DecisionTree": DecisionTreeRegressor(random_state=42),

    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "AdaBoost": AdaBoostRegressor(random_state=42),
    "KNeighbors": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=1000, random_state=42),
    "LightGBM": LGBMRegressor(random_state=42)
}

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np


def train_and_evaluate(models, X_train, y_train, X_valid, y_valid):
    """Fit every model and score it on the validation set.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_valid, y_valid
        Validation features passed to ``predict`` and the targets the
        predictions are scored against.

    Returns
    -------
    dict
        ``{name: {"MSE": ..., "RMSE": ..., "MAE": ..., "R2": ...}}`` with one
        entry per model, in the iteration order of ``models``.
    """
    results = {}

    for name in models:
        estimator = models[name]
        print(f"Training {name}...")
        estimator.fit(X_train, y_train)
        y_hat = estimator.predict(X_valid)

        # If the targets were log1p-transformed before fitting, y_hat would
        # have to be mapped back with np.expm1 here (currently disabled).

        mse = mean_squared_error(y_valid, y_hat)
        results[name] = dict(
            MSE=mse,
            RMSE=np.sqrt(mse),
            MAE=mean_absolute_error(y_valid, y_hat),
            R2=r2_score(y_valid, y_hat),
        )

    return results


In [None]:
# Score every baseline model on the validation split.
results = train_and_evaluate(
    models,
    X_train_transformed,
    y_train,
    X_valid_transformed,
    y_valid,
)

# One row per model, one column per metric, best (lowest) RMSE first.
results_df = pd.DataFrame(results).transpose()
results_df = results_df.sort_values(by="RMSE")
print(results_df)

                       # MSE           RMSE           MAE        R2
# CatBoost          3.326862e+09   57678.957622  36007.046475  0.758468
# ExtraTrees        3.819171e+09   61799.445123  35492.232677  0.722726
# XGBoost           3.909158e+09   62523.258776  38381.046875  0.716193
# RandomForest      4.318880e+09   65718.189933  40911.319177  0.686447
# LightGBM          4.398064e+09   66317.904008  41307.707782  0.680698
# Ridge             4.408645e+09   66397.624660  42431.725407  0.679930
# KNeighbors        4.446898e+09   66685.066571  39663.771807  0.677153
# Lasso             5.004806e+09   70744.652976  43656.288867  0.636648
# LinearRegression  5.177776e+09   71956.763602  44429.458294  0.624090
# GradientBoosting  5.320518e+09   72941.880798  49072.349349  0.613727
# MLP               7.663219e+09   87539.812380  60137.373434  0.443646
# DecisionTree      8.649014e+09   93000.074119  52714.455061  0.372077
# ElasticNet        9.665076e+09   98311.118872  70419.732800  0.298310
# AdaBoost          1.062364e+10  103071.064253  77921.810713  0.228717
# SVR               1.559393e+10  124875.649262  84389.552462 -0.132128


In [None]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# Estimators carried forward to hyper-parameter tuning, all seeded with 42.
# This rebinds `models`, replacing the larger baseline dictionary.
models = dict(
    RandomForest=RandomForestRegressor(random_state=42),
    ExtraTrees=ExtraTreesRegressor(random_state=42),
    CatBoost=CatBoostRegressor(verbose=0, random_state=42),
    XGBoost=XGBRegressor(verbosity=0, random_state=42),
)

# Hyper-parameter grids, one per tuned model name. The `pca__` / `model__`
# prefixes address the 'pca' and 'model' steps of the pipeline that the grid
# search builds.
param_grids = {
    # The two forest ensembles search an identical grid; each gets its own
    # freshly built dictionary.
    **{
        forest_name: {
            'pca__n_components': [5, 10, 15, 19],
            'model__n_estimators': [100, 300, 500, 800],
            'model__max_depth': [None, 10, 20, 30, 50],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4],
            'model__max_features': ['sqrt', 'log2', None],
        }
        for forest_name in ("RandomForest", "ExtraTrees")
    },
    "CatBoost": {
        'pca__n_components': [5, 10, 15, 19],
        'model__iterations': [500, 1000],
        'model__depth': [4, 6, 8, 10],
        'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'model__l2_leaf_reg': [1, 3, 5, 7, 9],
        'model__bagging_temperature': [0, 0.5, 1],
    },
    "XGBoost": {
        'pca__n_components': [5, 10, 15, 19],
        'model__n_estimators': [100, 300, 500, 800],
        'model__max_depth': [3, 5, 7, 10],
        'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'model__subsample': [0.6, 0.8, 1.0],
        'model__colsample_bytree': [0.6, 0.8, 1.0],
        'model__gamma': [0, 0.1, 0.3, 0.5],
        'model__reg_alpha': [0, 0.1, 1],
        'model__reg_lambda': [1, 1.5, 2],
    },
}


In [None]:
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GridSearchCV

# Tune each model in `models` with an exhaustive grid search over a
# PCA -> model pipeline, then score the refitted best pipeline on the
# validation split. Validation predictions are collected in `preds`.
#
# NOTE(review): the grids in `param_grids` are full Cartesian products
# (thousands of combinations for some models, each fitted 5 times);
# a randomized search may be needed to finish in reasonable time.
preds = {}
for name, model in models.items():
    print(f"\n GridSearch for {name}...")

    # Named `search_pipeline` so the fitted preprocessing `pipeline` defined
    # earlier in the notebook is not overwritten by this loop.
    search_pipeline = Pipeline([
        ('pca', PCA()),
        ('model', model)
    ])

    grid_search = GridSearchCV(
        estimator=search_pipeline,
        param_grid=param_grids[name],
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=2,
        n_jobs=-1
    )

    grid_search.fit(X_train_transformed, y_train)

    print(f" Best params for {name}:")
    print(grid_search.best_params_)
    # best_score_ is the negated MSE; flip the sign to report a plain MSE.
    print(f" Best CV score (MSE): {-grid_search.best_score_:.4f}")

    best_pipeline = grid_search.best_estimator_

    cv_scores = cross_val_score(
        best_pipeline,
        X_train_transformed,
        y_train,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    # Convert each fold's negated MSE to an RMSE first, so that the mean and
    # the spread are both expressed in RMSE units.
    fold_rmse = np.sqrt(-cv_scores)
    print(f" Final 5-fold CV RMSE: {fold_rmse.mean():.4f} ± {fold_rmse.std():.4f}")

    # The log1p transform of y_train is disabled in this notebook, so the
    # model already predicts in the original target scale; applying np.expm1
    # here would distort the predictions.
    y_pred = best_pipeline.predict(X_valid_transformed)

    # mean_squared_error returns the MSE; take the root to report an RMSE.
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    r2 = r2_score(y_valid, y_pred)

    print(f"{name} on Validation:")
    print(f"RMSE: {rmse:.4f}")
    print(f"R²:   {r2:.4f}")

    preds[name] = y_pred