In [14]:
import numpy as np
import pandas as pd

In [15]:
loaded_data = pd.read_csv('../../data/topic21_v23_train.csv')

loaded_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7962 entries, 0 to 7961
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   0                   7501 non-null   float64
 1   1                   7472 non-null   float64
 2   2                   7359 non-null   float64
 3   3                   7556 non-null   float64
 4   4                   7495 non-null   float64
 5   brand               7962 non-null   object 
 6   model               7962 non-null   object 
 7   trim                7951 non-null   object 
 8   body_type           7962 non-null   object 
 9   fuel_type           7962 non-null   object 
 10  transmission_type   7962 non-null   object 
 11  engine_capacity_cc  6362 non-null   object 
 12  horsepower          7584 non-null   object 
 13  exterior_color      7962 non-null   object 
 14  interior_color      7962 non-null   object 
 15  warranty            7962 non-null   object 
 16  city  

In [16]:
def remove_outliers(df, threshold=2.5):
    df_clean = df.copy()
    initial_rows = len(df)
    numeric_cols = df_clean.select_dtypes(include=['int64', 'float64']).columns

    for col in numeric_cols:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR
        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]

    removed_rows = initial_rows - len(df_clean)
    print(f"Removed {removed_rows} rows ({(removed_rows / initial_rows) * 100:.2f}% of data)")

    return df_clean


In [17]:
from sklearn.model_selection import train_test_split

loaded_data = loaded_data.dropna(subset=['0', '1', '2', '3', '4', 'engine_capacity_cc'])
# loaded_data = loaded_data.dropna()
loaded_data = remove_outliers(loaded_data)

loaded_data.info()

Removed 382 rows (8.21% of data)
<class 'pandas.core.frame.DataFrame'>
Index: 4272 entries, 0 to 7960
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   0                   4272 non-null   float64
 1   1                   4272 non-null   float64
 2   2                   4272 non-null   float64
 3   3                   4272 non-null   float64
 4   4                   4272 non-null   float64
 5   brand               4272 non-null   object 
 6   model               4272 non-null   object 
 7   trim                4268 non-null   object 
 8   body_type           4272 non-null   object 
 9   fuel_type           4272 non-null   object 
 10  transmission_type   4272 non-null   object 
 11  engine_capacity_cc  4272 non-null   object 
 12  horsepower          4225 non-null   object 
 13  exterior_color      4272 non-null   object 
 14  interior_color      4272 non-null   object 
 15  warranty            4272 no

In [18]:
df = loaded_data.copy()

def extract_range_mean(val):
    try:
        nums = [int(s) for s in val.replace('cc','').replace('HP','').split('-')]
        return np.mean(nums)
    except:
        return np.nan

df['engine_capacity'] = df['engine_capacity_cc'].apply(extract_range_mean)
df['horsepower_val'] = df['horsepower'].apply(extract_range_mean)
df.drop(columns=['engine_capacity_cc', 'horsepower'], inplace=True)

# Target mean encoding manually
for col in ['brand', 'model', 'trim']:
    means = df.groupby(col)['price'].mean()
    df[col + '_enc'] = df[col].map(means)

# Drop original high-cardinality categorical columns
df.drop(columns=['brand', 'model', 'trim'], inplace=True)

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4272 entries, 0 to 7960
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   0                  4272 non-null   float64
 1   1                  4272 non-null   float64
 2   2                  4272 non-null   float64
 3   3                  4272 non-null   float64
 4   4                  4272 non-null   float64
 5   body_type          4272 non-null   object 
 6   fuel_type          4272 non-null   object 
 7   transmission_type  4272 non-null   object 
 8   exterior_color     4272 non-null   object 
 9   interior_color     4272 non-null   object 
 10  warranty           4272 non-null   object 
 11  city               4272 non-null   object 
 12  seller_type        4272 non-null   object 
 13  price              4272 non-null   int64  
 14  engine_capacity    3169 non-null   float64
 15  horsepower_val     3910 non-null   float64
 16  brand_enc          4272 non-n

In [20]:
X = df.drop(columns=['price'])
y = df['price']

# X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train, X_valid, y_train, y_valid = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'X_train shape = {X_train.shape}')

X_train shape = (3417, 18)


In [21]:
# y_train = np.log1p(y_train)

In [22]:
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

Numerical features: ['0', '1', '2', '3', '4', 'engine_capacity', 'horsepower_val', 'brand_enc', 'model_enc', 'trim_enc']
Categorical features: ['body_type', 'fuel_type', 'transmission_type', 'exterior_color', 'interior_color', 'warranty', 'city', 'seller_type']


In [23]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

X_train_transformed = pipeline.fit_transform(X_train)
X_valid_transformed = pipeline.transform(X_valid)


In [24]:
from sklearn.ensemble import  ExtraTreesRegressor
from catboost import CatBoostRegressor

models = {
    # "RandomForest": RandomForestRegressor(random_state=42),
    "ExtraTrees": ExtraTreesRegressor(random_state=42),
    # "XGBoost": XGBRegressor(verbosity=0, random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
}

In [25]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np


def train_and_evaluate(models, X_train, y_train, X_valid, y_valid):
    results = {}

    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        predictions = model.predict(X_valid)

        # predictions = np.expm1(predictions)

        mse = mean_squared_error(y_valid, predictions)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_valid, predictions)
        r2 = r2_score(y_valid, predictions)

        results[name] = {
            "MSE": mse,
            "RMSE": rmse,
            "MAE": mae,
            "R2": r2
        }
    return results


In [26]:
results = train_and_evaluate(models, X_train_transformed, y_train, X_valid_transformed, y_valid)
results_df = pd.DataFrame(results).T.sort_values(by="RMSE")
print(results_df)

#                      MSE          RMSE           MAE        R2
# CatBoost    1.956715e+09  44234.776978  26361.880504  0.855848
# ExtraTrees  2.240080e+09  47329.480620  25994.048573  0.834973


Training ExtraTrees...
Training CatBoost...
                     MSE          RMSE           MAE        R2
CatBoost    1.865906e+09  43196.132857  25924.038388  0.862538
ExtraTrees  2.256342e+09  47500.965423  26035.063345  0.833775
