In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the raw training table; every later cell works on copies of this frame.
loaded_data = pd.read_csv(
    filepath_or_buffer='../../data/topic21_v23_train.csv',
)

# Column names, non-null counts and dtypes of the raw data.
loaded_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7962 entries, 0 to 7961
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   0                   7501 non-null   float64
 1   1                   7472 non-null   float64
 2   2                   7359 non-null   float64
 3   3                   7556 non-null   float64
 4   4                   7495 non-null   float64
 5   brand               7962 non-null   object 
 6   model               7962 non-null   object 
 7   trim                7951 non-null   object 
 8   body_type           7962 non-null   object 
 9   fuel_type           7962 non-null   object 
 10  transmission_type   7962 non-null   object 
 11  engine_capacity_cc  6362 non-null   object 
 12  horsepower          7584 non-null   object 
 13  exterior_color      7962 non-null   object 
 14  interior_color      7962 non-null   object 
 15  warranty            7962 non-null   object 
 16  city  

In [3]:
def add_outlier_column(df, threshold=2.5, reference=None, target='price'):
    """Return a copy of ``df`` with a 0/1 ``is_outlier`` flag column.

    A row is flagged (1) when, in at least one ``float64``/``int64`` feature
    column, its absolute z-score exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to flag. It is not modified.
    threshold : float, default 2.5
        Absolute z-score above which a value counts as an outlier.
    reference : pandas.DataFrame, optional
        Frame whose column means and standard deviations define the z-scores.
        Defaults to ``df`` itself (the original behaviour). Pass the training
        split when flagging a validation/test split so both splits are scored
        against the same statistics. It must contain every numeric feature
        column of ``df``.
    target : str, default 'price'
        Column excluded from the outlier check. It may be absent from ``df``
        (for example in an unlabeled test set).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an added integer ``is_outlier`` column.
    """
    stats_source = df if reference is None else reference
    flagged = df.copy()

    numeric_cols = (
        df.select_dtypes(include=['float64', 'int64'])
        # errors='ignore' tolerates frames that carry no target column.
        .drop(columns=[target], errors='ignore')
        .columns
    )

    is_outlier = pd.Series(False, index=df.index, dtype=bool)
    for col in numeric_cols:
        center = stats_source[col].mean()
        spread = stats_source[col].std()
        z_scores = ((df[col] - center) / spread).abs()
        # A NaN z-score (e.g. from a missing value) compares False, so
        # missing cells never flag a row.
        is_outlier |= z_scores > threshold

    flagged['is_outlier'] = is_outlier.astype(int)
    return flagged


In [4]:
# Work on an independent copy so the raw frame in `loaded_data` stays intact.
df = loaded_data.copy(deep=True)


def parse_mid(r):
    """Convert a raw spec cell into a single number.

    Only digits and dashes of the text are considered. A dash-separated range
    such as ``'2000 - 2499 cc'`` becomes its midpoint; a single number such as
    ``'3000+ cc'`` becomes that number.

    Parameters
    ----------
    r : str, number or missing value
        Raw cell content.

    Returns
    -------
    float
        The parsed value, or ``np.nan`` when ``r`` is missing, equals
        ``'Unknown'`` or contains no digits at all.
    """
    if pd.isna(r) or r == 'Unknown':
        return np.nan
    if not isinstance(r, str):
        # Already numeric (e.g. the column was parsed as numbers).
        return float(r)

    s = ''.join(c for c in r if c.isdigit() or c == '-')
    # Empty pieces come from leading/trailing/repeated dashes; skipping them
    # avoids int('') errors.
    bounds = [int(part) for part in s.split('-') if part]

    if not bounds:
        # No digits in the text: treat as missing instead of failing on float('').
        return np.nan
    if len(bounds) == 1:
        return float(bounds[0])
    # Midpoint of the lowest and highest bound; identical to (lo + hi) / 2
    # for an ordinary two-sided range.
    return (min(bounds) + max(bounds)) / 2
# Turn the two textual spec columns into a numeric value plus a missing flag.
# NOTE(review): the median used for filling is computed on the full frame,
# i.e. before the train/validation split performed in a later cell.
for raw_col in ('engine_capacity_cc', 'horsepower'):
    parsed = df[raw_col].apply(parse_mid)
    df[f'{raw_col}_num'] = parsed.fillna(parsed.median())
    # Flag is derived from the parsed values before they were filled.
    df[f'{raw_col}_miss'] = parsed.isna().astype(int)
    df = df.drop(columns=raw_col)

# Binary indicator: 1 for 'Automatic Transmission', 0 for anything else.
df['is_automatic'] = df['transmission_type'].eq('Automatic Transmission').astype(int)
df = df.drop(columns='transmission_type')

# Ratio feature: horsepower per cubic centimetre of engine capacity.
df['hp_per_cc'] = df['horsepower_num'].div(df['engine_capacity_cc_num'])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7962 entries, 0 to 7961
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   0                        7501 non-null   float64
 1   1                        7472 non-null   float64
 2   2                        7359 non-null   float64
 3   3                        7556 non-null   float64
 4   4                        7495 non-null   float64
 5   brand                    7962 non-null   object 
 6   model                    7962 non-null   object 
 7   trim                     7951 non-null   object 
 8   body_type                7962 non-null   object 
 9   fuel_type                7962 non-null   object 
 10  exterior_color           7962 non-null   object 
 11  interior_color           7962 non-null   object 
 12  warranty                 7962 non-null   object 
 13  city                     7962 non-null   object 
 14  seller_type             

In [5]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed, then flag outliers in each part.
# NOTE(review): each part is flagged using its own column means and standard
# deviations, because add_outlier_column is called on the parts separately.
train_set, valid_set = (
    add_outlier_column(part)
    for part in train_test_split(df, test_size=0.2, random_state=42)
)

print("Number of True values train:", train_set['is_outlier'].sum())
print("Number of True values valid:", valid_set['is_outlier'].sum())

# Mean-target encoding: replace each category by the mean training price of
# that category. The mapping is learned on the training part only.
# NOTE(review): training rows are encoded with a mean that includes their own
# price; categories absent from the training part map to NaN in valid_set.
for cat_col in ('model',):
    price_by_level = train_set.groupby(cat_col)['price'].mean()
    encoded_col = cat_col + '_enc'
    train_set[encoded_col] = train_set[cat_col].map(price_by_level)
    valid_set[encoded_col] = valid_set[cat_col].map(price_by_level)

# The raw category is no longer needed once its encoding exists.
train_set = train_set.drop(columns='model')
valid_set = valid_set.drop(columns='model')


Number of True values train: 1465
Number of True values valid: 406


In [6]:
# Inspect the training part after outlier flagging and target encoding.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6369 entries, 2864 to 7270
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   0                        5995 non-null   float64
 1   1                        5997 non-null   float64
 2   2                        5876 non-null   float64
 3   3                        6038 non-null   float64
 4   4                        5997 non-null   float64
 5   brand                    6369 non-null   object 
 6   trim                     6359 non-null   object 
 7   body_type                6369 non-null   object 
 8   fuel_type                6369 non-null   object 
 9   exterior_color           6369 non-null   object 
 10  interior_color           6369 non-null   object 
 11  warranty                 6369 non-null   object 
 12  city                     6369 non-null   object 
 13  seller_type              6369 non-null   object 
 14  price                    6

In [7]:
# Separate the target ('price') from the feature columns for both parts.
y_train = train_set['price']
X_train = train_set.drop(columns='price')

y_valid = valid_set['price']
X_valid = valid_set.drop(columns='price')

In [8]:
# Partition the feature columns by dtype: int64/float64 vs. object.
numerical_features = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X_train.select_dtypes(include=['object']).columns)

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

Numerical features: ['0', '1', '2', '3', '4', 'engine_capacity_cc_num', 'engine_capacity_cc_miss', 'horsepower_num', 'horsepower_miss', 'is_automatic', 'hp_per_cc', 'is_outlier', 'model_enc']
Categorical features: ['brand', 'trim', 'body_type', 'fuel_type', 'exterior_color', 'interior_color', 'warranty', 'city', 'seller_type']


In [9]:
# Pairwise correlation of every numeric feature and the target on the training part.
correlation = train_set[[*numerical_features, 'price']].corr()

# Correlation of each column with the price, strongest positive first.
print(correlation.loc[:, 'price'].sort_values(ascending=False))

price                      1.000000
model_enc                  0.695101
horsepower_num             0.409238
0                          0.273631
engine_capacity_cc_num     0.257073
engine_capacity_cc_miss    0.175946
hp_per_cc                  0.134919
1                          0.019409
is_outlier                 0.014194
is_automatic               0.003505
horsepower_miss           -0.046558
4                         -0.131088
3                         -0.242796
2                         -0.311485
Name: price, dtype: float64


In [10]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric columns: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent level, then one-hot
# encode; levels not seen during fitting are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on the training features only; the validation features are transformed
# with the fitted parameters.
X_train_transformed = pipeline.fit_transform(X_train)
X_valid_transformed = pipeline.transform(X_valid)


In [11]:

from sklearn.ensemble import ExtraTreesRegressor


# Display name -> unfitted estimator; the seed is fixed for repeatable runs.
models = dict(
    ExtraTrees=ExtraTreesRegressor(random_state=42),
)


In [12]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np


def train_and_evaluate(models, X_train, y_train, X_valid, y_valid):
    """Fit every model on the training data and score it on the validation data.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place.
    X_train, y_train
        Training features and target.
    X_valid, y_valid
        Validation features and target.

    Returns
    -------
    dict
        ``{name: {"MSE": ..., "RMSE": ..., "MAE": ..., "R2": ...}}`` computed
        from the validation predictions.
    """
    results = {}

    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_valid)

        # Predictions are scored as returned by the model.
        # NOTE(review): a disabled np.expm1 step used to sit here, presumably
        # for a log1p-transformed target; restore it if the target is logged.
        squared_error = mean_squared_error(y_valid, y_pred)

        results[name] = dict(
            MSE=squared_error,
            RMSE=np.sqrt(squared_error),
            MAE=mean_absolute_error(y_valid, y_pred),
            R2=r2_score(y_valid, y_pred),
        )

    return results


In [13]:
results = train_and_evaluate(
    models,
    X_train_transformed,
    y_train,
    X_valid_transformed,
    y_valid,
)

# One row per model, one column per metric; lowest RMSE first.
results_df = pd.DataFrame(results).transpose().sort_values(by="RMSE")
print(results_df)

# Recorded result of a previous run, kept for comparison:
#                      MSE           RMSE          MAE        R2
# ExtraTrees  1.249172e+10  111766.347187  49114.416529  0.643662



Training ExtraTrees...
                     MSE           RMSE           MAE        R2
ExtraTrees  1.249172e+10  111766.347187  49114.416529  0.643662
