In [40]:
import numpy as np
import pandas as pd

In [41]:
loaded_data = pd.read_csv('../data/topic21_v23_train.csv')

loaded_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7962 entries, 0 to 7961
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   0                   7501 non-null   float64
 1   1                   7472 non-null   float64
 2   2                   7359 non-null   float64
 3   3                   7556 non-null   float64
 4   4                   7495 non-null   float64
 5   brand               7962 non-null   object 
 6   model               7962 non-null   object 
 7   trim                7951 non-null   object 
 8   body_type           7962 non-null   object 
 9   fuel_type           7962 non-null   object 
 10  transmission_type   7962 non-null   object 
 11  engine_capacity_cc  6362 non-null   object 
 12  horsepower          7584 non-null   object 
 13  exterior_color      7962 non-null   object 
 14  interior_color      7962 non-null   object 
 15  warranty            7962 non-null   object 
 16  city  

In [42]:
def remove_outliers(df, threshold=3):
    df_copy = df.copy()
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    z_scores = np.abs((df[numeric_cols] - df[numeric_cols].mean()) / df[numeric_cols].std())
    outliers = (z_scores > threshold).any(axis=1)

    print(f"Number of outliers removed: {outliers.sum()}")

    return df_copy[~outliers]


In [43]:
df = loaded_data.copy()


def parse_mid(r):
        if pd.isna(r) or r=='Unknown': return np.nan
        s = ''.join(c for c in r if c.isdigit() or c=='-')
        if '-' in s:
            lo, hi = map(int, s.split('-'))
            return (lo+hi)/2
        return float(s)
for col in ['engine_capacity_cc','horsepower']:
    df[col+'_num'] = df[col].apply(parse_mid)
    df[col+'_miss'] = df[col+'_num'].isna().astype(int)
    df[col+'_num'] = df[col+'_num'].fillna(df[col+'_num'].median())
    df.drop(columns=[col], inplace=True)

df['is_automatic'] = (df['transmission_type'] == 'Automatic Transmission').astype(int)
df = df.drop('transmission_type', axis=1)

# df['seller_is_owner'] = (df['seller_type'] == 'Owner').astype(int)
# df['seller_is_dealer'] = (df['seller_type'] == 'Dealer').astype(int)
# df['seller_is_cpo']   = (df['seller_type'] == 'Dealership/Certified Pre-Owned').astype(int)
# df = df.drop('seller_type', axis=1)

df['hp_per_cc'] = df['horsepower_num'] / df['engine_capacity_cc_num']


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7962 entries, 0 to 7961
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   0                        7501 non-null   float64
 1   1                        7472 non-null   float64
 2   2                        7359 non-null   float64
 3   3                        7556 non-null   float64
 4   4                        7495 non-null   float64
 5   brand                    7962 non-null   object 
 6   model                    7962 non-null   object 
 7   trim                     7951 non-null   object 
 8   body_type                7962 non-null   object 
 9   fuel_type                7962 non-null   object 
 10  exterior_color           7962 non-null   object 
 11  interior_color           7962 non-null   object 
 12  warranty                 7962 non-null   object 
 13  city                     7962 non-null   object 
 14  seller_type             

In [44]:
from sklearn.model_selection import train_test_split


train_set , valid_set = train_test_split(df, test_size=0.2, random_state=42)

# train_set = remove_outliers(train_set)


for col in ['model']:
    means = train_set.groupby(col)['price'].mean()
    train_set[col + '_enc'] = train_set[col].map(means)
    valid_set[col + '_enc'] = valid_set[col].map(means)

train_set.drop(columns=['model'], inplace=True)
valid_set.drop(columns=['model'], inplace=True)


In [45]:
X_train = train_set.drop(columns=['price'])
y_train = train_set['price']

# y_train = np.log1p(y_train)

X_valid = valid_set.drop(columns=['price'])
y_valid = valid_set['price']

In [46]:
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

Numerical features: ['0', '1', '2', '3', '4', 'engine_capacity_cc_num', 'engine_capacity_cc_miss', 'horsepower_num', 'horsepower_miss', 'is_automatic', 'hp_per_cc', 'model_enc']
Categorical features: ['brand', 'trim', 'body_type', 'fuel_type', 'exterior_color', 'interior_color', 'warranty', 'city', 'seller_type']


In [47]:
correlation = train_set[numerical_features + ['price']].corr()

print(correlation['price'].sort_values(ascending=False))

price                      1.000000
model_enc                  0.695101
horsepower_num             0.409238
0                          0.273631
engine_capacity_cc_num     0.257073
engine_capacity_cc_miss    0.175946
hp_per_cc                  0.134919
1                          0.019409
is_automatic               0.003505
horsepower_miss           -0.046558
4                         -0.131088
3                         -0.242796
2                         -0.311485
Name: price, dtype: float64


In [48]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

pipeline = Pipeline([
    ('preprocessor', preprocessor)
])


X_train_transformed = pipeline.fit_transform(X_train)
X_valid_transformed = pipeline.transform(X_valid)


In [49]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

models = {
    "ExtraTrees": ExtraTreesRegressor(random_state=42),

    # "SVR": SVR(),
    # "RandomForest": RandomForestRegressor(random_state=42),
    # "XGBoost": XGBRegressor(verbosity=0, random_state=42),
    # "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    #
    # "LinearRegression": LinearRegression(),
    # "Ridge": Ridge(),
    # "Lasso": Lasso(),
    # "ElasticNet": ElasticNet(),
    # "DecisionTree": DecisionTreeRegressor(random_state=42),
    #
    # "GradientBoosting": GradientBoostingRegressor(),
    # "AdaBoost": AdaBoostRegressor(),
    # "KNeighbors": KNeighborsRegressor(),
    # "MLP": MLPRegressor(max_iter=1000),
    # "LightGBM": LGBMRegressor()
}


In [50]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np


def train_and_evaluate(models, X_train, y_train, X_valid, y_valid):
    results = {}

    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        predictions = model.predict(X_valid)

        # predictions = np.expm1(predictions)

        mse = mean_squared_error(y_valid, predictions)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_valid, predictions)
        r2 = r2_score(y_valid, predictions)

        results[name] = {
            "MSE": mse,
            "RMSE": rmse,
            "MAE": mae,
            "R2": r2
        }
    return results


In [51]:
results = train_and_evaluate(models, X_train_transformed, y_train, X_valid_transformed, y_valid)
results_df = pd.DataFrame(results).T.sort_values(by="RMSE")
print(results_df)

#                      MSE           RMSE          MAE        R2
# ExtraTrees  1.252300e+10  111906.224356  49226.051519  0.642769



Training ExtraTrees...
                     MSE           RMSE           MAE        R2
ExtraTrees  1.252300e+10  111906.224356  49226.051519  0.642769


In [52]:
models = {
    # "RandomForest": RandomForestRegressor(random_state=42),
    "ExtraTrees": ExtraTreesRegressor(random_state=42),
    # "CatBoost": CatBoostRegressor(verbose=0, random_state=42)
}

param_grids = {
    # "RandomForest": {
    #     'pca__n_components': [5, 10, 15, 19],
    #     'model__n_estimators': [100, 300, 500, 800],
    #     'model__max_depth': [None, 10, 20, 30, 50],
    #     'model__min_samples_split': [2, 5, 10],
    #     'model__min_samples_leaf': [1, 2, 4],
    #     'model__max_features': ['sqrt', 'log2', None]
    # },
    "ExtraTrees": {
        'pca__n_components': [5, 10, 15, 19],
        'model__n_estimators': [100, 300, 500, 800],
        'model__max_depth': [None, 10, 20, 30, 50],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__max_features': ['sqrt', 'log2', None]
    },
    # "CatBoost": {
    #     'pca__n_components': [5, 10, 15, 19],
    #     'model__iterations': [500, 1000],
    #     'model__depth': [4, 6, 8, 10],
    #     'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    #     'model__l2_leaf_reg': [1, 3, 5, 7, 9],
    #     'model__bagging_temperature': [0, 0.5, 1]
    # }
}


In [None]:
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GridSearchCV

preds = {}

pplines = {}
for name, model in models.items():
    print(f"\n GridSearch for {name}...")

    pipeline = Pipeline([
        ('pca', PCA()),
        ('model', model)
    ])

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[name],
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=2,
        n_jobs=-1
    )

    grid_search.fit(X_train_transformed, y_train)

    print(f" Best params for {name}:")
    print(grid_search.best_params_)
    print(f" Best CV score (neg MSE): {-grid_search.best_score_:.4f}")

    best_pipeline = grid_search.best_estimator_

    cv_scores = cross_val_score(
        best_pipeline,
        X_train_transformed,
        y_train,
        cv=5,
        scoring='r2',
        n_jobs=-1
    )
    print(f" Final 5-fold CV R2: {(-cv_scores.mean()) ** 0.5:.4f} ± {cv_scores.std() ** 0.5:.4f}")

    y_pred = best_pipeline.predict(X_valid_transformed)

    rmse = mean_squared_error(y_valid, y_pred)
    r2 = r2_score(y_valid, y_pred)

    preds[name] = y_pred
    pplines[name] = best_pipeline


 GridSearch for ExtraTrees...
Fitting 5 folds for each of 2160 candidates, totalling 10800 fits


In [20]:

for name, y_pred in preds.items():
    rmse = mean_squared_error(y_valid, y_pred)
    rmse = np.sqrt(rmse)
    mae = mean_absolute_error(y_valid, y_pred)
    r2 = r2_score(y_valid, y_pred)

    print(f"{name} on Validation:")
    print(f"RMSE: {rmse:.4f}")
    print(f"RMSE: {mae:.4f}")
    print(f"R²:   {r2:.4f}")
    print()


ExtraTrees on Validation:
RMSE: 118259.3135
RMSE: 57880.8490
R²:   0.6011

