In [1]:
import numpy as np
import pandas as pd
from splitter import splitter

X_train, X_test, y_train, y_test = splitter(".//topic21_v9_train.csv")

# Elina's model

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import FunctionTransformer
from sklearn.experimental import enable_iterative_imputer  # needed to enable
from sklearn.impute import IterativeImputer # for the actual model
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import IsolationForest 
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV

In [3]:
# helper functions that select numeric and categorical columns

def select_num_columns(df):
    return df.select_dtypes(include=np.number).columns.tolist()

def select_cat_columns(df):
    return df.select_dtypes(include="object").columns.tolist()

## Feature engineering

In [4]:
# define feature engineering functions and make transformers out of them

# 1. Add categorical combinations
# This function creates new features by combining 'brand' and 'trim', and 'model' and 'trim'.
def add_cat_combos(df):
    df = df.copy()

    for col in ['brand', 'model', 'trim']:
        df[col] = df[col].replace('Other', np.nan)

    df['brand_trim'] = df['brand'].astype(str) + '_' + df['trim'].astype(str)
    df['model_trim'] = df['model'].astype(str) + '_' + df['trim'].astype(str)
    return df

cat_combos = FunctionTransformer(add_cat_combos, validate=False)



# 2. Pairwise numeric interactions
# This function creates new features by multiplying and dividing pairs of numeric columns.
def add_numeric_interactions(df):
    df = df.copy()
    pairs = [('1','2'), ('2','4'), ('1','4'), ('0','3')]

    for a, b in pairs:
        a_f, b_f = df[a].astype(float), df[b].astype(float)
        a_ft, b_ft = df[a].astype(float), df[b].astype(float)

        df[f'{a}_x_{b}'] = a_f * b_f
        df[f'{a}_x_{b}'] = a_ft * b_ft

        df[f'{a}_over_{b}'] = a_f / (b_f + 1e-6)
        df[f'{a}_over_{b}'] = a_ft / (b_ft + 1e-6)

        df[f'{a}_recipprod_{b}'] = 1.0 / (a_f * b_f + 1e-6)
        df[f'{a}_recipprod_{b}'] = 1.0 / (a_ft * b_ft + 1e-6)

    return df

numeric_interactions = FunctionTransformer(add_numeric_interactions, validate=False)



# 3. Simple polynomial terms
# This function adds squared terms for the first 5 numeric columns.
def add_simple_polynomial_terms(df):
    df = df.copy()

    for i in range(5):
        df[f'{i}_sq'] = df[f'{i}'].astype(float) ** 2

    return df

simple_polynomial_terms = FunctionTransformer(add_simple_polynomial_terms, validate=False)



# 4. Count-based aggregate features
# This function adds count features for 'brand' and 'model', counting occurrences in the dataset.
def add_count_features(df):
    df = df.copy()

    for cat in ['brand', 'model']:
        counts = df[cat].value_counts()
        df[f'{cat}_count'] = df[cat].map(counts)

    return df

count_features = FunctionTransformer(add_count_features, validate=False)

# Combine all feature engineering steps into a single pipeline

feature_engineering = Pipeline([
    ("cat_combos", cat_combos),
    ("numeric_interactions", numeric_interactions),
    ("simple_polynomial_terms", simple_polynomial_terms),
    ("count_features", count_features)
])

## Preprocessing
(has to be done after FE cz it changes the data frame into an array, but we need to work with df to do FE)

In [5]:
num_transformer = Pipeline([
   ("imputer", SimpleImputer(strategy='mean')),
    ("scaler", StandardScaler())
])



cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy='constant', fill_value='missing')),
    ("encoder", OneHotEncoder(handle_unknown='ignore'))
])


num_cols = select_num_columns(X_train) + ["1_x_2", "2_x_4", "1_x_4", "0_x_3",
                                          "1_over_2", "2_over_4", "1_over_4", "0_over_3",
                                          "1_recipprod_2", "2_recipprod_4", "1_recipprod_4", "0_recipprod_3",
                                          "0_sq", "1_sq", "2_sq", "3_sq", "4_sq", 'brand_count', 'model_count']

cat_cols = select_cat_columns(X_train) + ['brand_trim', 'model_trim']



transformer = ColumnTransformer([
    ("num", num_transformer, num_cols),
    ("cat", cat_transformer, cat_cols),
])

## Final Pipeline

In [6]:
pipeline = Pipeline([
    ("feature_engineering", feature_engineering),
    ("transformer", transformer),
    ("model", LinearRegression())
])

## Fit the model using cross validation 

In [7]:
cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

Cross-validation results:
Train R^2 scores: [0.69375377 0.70132875 0.69651251 0.69985633 0.7021918 ]
Test R^2 scores: [ -29.97477548    0.48202786   -1.51565213 -309.20082827    0.51153875]


# Build a model from scratch
NOTE: I don't redefine parts of the code (f.e. cat_processor) in each new pipeline if I am not bringing any changes into it

## Data frame for storing results of cv 
after adding each new thing into the pipeline

In [5]:
# initialize the dataframe 
results_df = pd.DataFrame({
    'r2_mean_train': [],
    'r2_mean_test': [],
    'r2_std_train': [],
    'r2_std_test': []
})

# define a function that will add a row to the results_df with new results
def update_result(cv):
    global results_df

    results_df.loc[len(results_df)] = [
        cv['train_score'].mean(),
        cv['test_score'].mean(),
        cv['train_score'].std(),
        cv['test_score'].std()
    ]

    return results_df

## Baseline model

In [6]:
num_processor = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

cat_processor = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

Cross-validation results:
Train R^2 scores: [0.6528719  0.6597142  0.65714568 0.66187927 0.66109304]
Test R^2 scores: [0.49300481 0.49130671 0.48719121 0.48555077 0.50481921]


In [7]:
update_result(cv)

Unnamed: 0,r2_mean_train,r2_mean_test,r2_std_train,r2_std_test
0,0.658541,0.492375,0.003259,0.00678


## Detect outliers with Isolation Forest

### Buggy version
The issue is that this version violates the imoportant rule of sklearn: 
    all transformers in a Pipeline and ColumnTransformer must preserve the number of rows

In [9]:
from sklearn.ensemble import IsolationForest

def iforest_func(X): 
    num_train = X.select_dtypes(include=np.number)

    # Temporarily impute missing values in numerical features before applying Isolation Forest
    num_temp = SimpleImputer(strategy='median').fit_transform(num_train)  # median is robust to outliers

    num_train['outliers'] = IsolationForest(random_state=42).fit_predict(num_temp) == -1

   # drop outliers from the training set
    num_train = num_train[~num_train['outliers']]

    # drop the outliers column
    return num_train.drop(columns=['outliers'])


iforest = FunctionTransformer(
    iforest_func,
    validate=False
)

num_processor = Pipeline([
    ("iforest", iforest),  # detect outliers
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 408, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 1303, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\_set_output.py", line 295, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 944, in fit_transform
    return self._hstack(list(Xs), n_samples=n_samples)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1058, in _hstack
    return sparse.hstack(converted_Xs).tocsr()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 756, in hstack
    return _block([blocks], format, dtype, return_spmatrix=True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 971, in _block
    raise ValueError(msg)
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 5095, expected 4626.

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 408, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 1303, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\_set_output.py", line 295, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 944, in fit_transform
    return self._hstack(list(Xs), n_samples=n_samples)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1058, in _hstack
    return sparse.hstack(converted_Xs).tocsr()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 756, in hstack
    return _block([blocks], format, dtype, return_spmatrix=True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 971, in _block
    raise ValueError(msg)
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 5095, expected 4635.

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 408, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 1303, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\_set_output.py", line 295, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 944, in fit_transform
    return self._hstack(list(Xs), n_samples=n_samples)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1058, in _hstack
    return sparse.hstack(converted_Xs).tocsr()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 756, in hstack
    return _block([blocks], format, dtype, return_spmatrix=True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 971, in _block
    raise ValueError(msg)
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 5095, expected 4640.

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 408, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 1303, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\_set_output.py", line 295, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 944, in fit_transform
    return self._hstack(list(Xs), n_samples=n_samples)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1058, in _hstack
    return sparse.hstack(converted_Xs).tocsr()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 756, in hstack
    return _block([blocks], format, dtype, return_spmatrix=True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 971, in _block
    raise ValueError(msg)
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 5095, expected 4643.

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 408, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 1303, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\_set_output.py", line 295, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 944, in fit_transform
    return self._hstack(list(Xs), n_samples=n_samples)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1058, in _hstack
    return sparse.hstack(converted_Xs).tocsr()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 756, in hstack
    return _block([blocks], format, dtype, return_spmatrix=True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 971, in _block
    raise ValueError(msg)
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 5096, expected 4576.


### Clean version
#### Set outliers to NaN

In [10]:
from sklearn.ensemble import IsolationForest

def iforest_func(X): 
    num_train = X.select_dtypes(include=np.number)

    # Temporarily impute missing values in numerical features before applying Isolation Forest
    num_temp = SimpleImputer(strategy='median').fit_transform(num_train)  # median is robust to outliers

    num_train['outliers'] = IsolationForest(random_state=42).fit_predict(num_temp) == -1

   # set the outliers to NaN
    num_train.loc[num_train['outliers'], :] = np.nan

    # drop the outliers column
    return num_train.drop(columns=['outliers'])


iforest = FunctionTransformer(
    iforest_func,
    validate=False
)

num_processor = Pipeline([
    ("iforest", iforest),  # detect outliers
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan


Cross-validation results:
Train R^2 scores: [0.66733497 0.67469692 0.67091371 0.67696587 0.67561404]
Test R^2 scores: [0.51651607 0.50734882 0.51380647 0.50609148 0.52441071]


  num_train.loc[num_train['outliers'], :] = np.nan


In [11]:
update_result(cv)

Unnamed: 0,r2_mean_train,r2_mean_test,r2_std_train,r2_std_test
0,0.658541,0.492375,0.003259,0.00678
1,0.673105,0.513635,0.003517,0.006646


# Fill in NaN

In [10]:


num_processor = Pipeline([
    ("iforest", iforest),  # detect outliers
    ("imputer", IterativeImputer(estimator=RandomForestRegressor(n_estimators=10), max_iter=10, random_state=0)),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

Cross-validation results:
Train R^2 scores: [0.68168793 0.68882165 0.68446263 0.68664783 0.68911034]
Test R^2 scores: [0.49449894 0.49135105 0.51597223 0.5148079  0.51407056]


In [14]:
update_result(cv)

Unnamed: 0,r2_mean_train,r2_mean_test,r2_std_train,r2_std_test
0,0.658541,0.492375,0.003259,0.00678
1,0.673105,0.513635,0.003517,0.006646
2,0.673944,0.516754,0.003623,0.00934


# Unite rare categories in OneHotEncoding
!!! Made the results worth. Should be NOT encluded in the final model

### Create a dictionary with column names and carresponding max_categ for OneHotEncoder

In [None]:
train_categ = X_train.select_dtypes(include='object').reset_index(drop=True)

diverce_categ = train_categ.columns[train_categ.nunique() > 20]

# dictionary with the threshold for OneHotEncoder
max_categ = {}


for col in diverce_categ:

    unique = train_categ[col].value_counts().reset_index()

    unique['cumulative_sum'] = unique['count'].cumsum()
    unique['cumulative_percentage'] = unique['cumulative_sum'] / unique['cumulative_sum'].iloc[-1]

    # threshold to filter the least common features which contribute up to 5% in total
    threshold = unique[unique['cumulative_percentage'] > 0.95].iloc[:, 0].count()

    # save the frequent features to dict
    max_categ[col] = threshold

            brand  count  cumulative_sum  cumulative_percentage
0   Mercedes-Benz    973             973               0.152771
1             BMW    564            1537               0.241325
2          Nissan    463            2000               0.314021
3      Land Rover    437            2437               0.382635
4          Toyota    430            2867               0.450149
..            ...    ...             ...                    ...
80      King Long      1            6365               0.999372
81      SsangYong      1            6366               0.999529
82          Avatr      1            6367               0.999686
83          Exeed      1            6368               0.999843
84         Pagani      1            6369               1.000000

[85 rows x 4 columns]
                 model  count  cumulative_sum  cumulative_percentage
0          Range Rover    179             179               0.028105
1              S-Class    145             324               0.050871
2 

### Build the pipeline

In [None]:
ohe_diverse = []

for col, max_cat in max_categ.items():
    encoder = OneHotEncoder(handle_unknown='infrequent_if_exist',
                            max_categories=max_cat,
                            sparse_output=False) #?


    ohe_diverse.append((
        f"ohe_{col}",
        encoder,
        [col]
    ))

encoder = ColumnTransformer(ohe_diverse + [
    ("ohe_other", OneHotEncoder(handle_unknown='ignore', sparse_output=False), X_train.select_dtypes(include='object').columns.difference(list(max_categ.keys())).tolist())
])

def restore_df(X, columns):
    return pd.DataFrame(X, columns=columns)

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="constant", fill_value="missing")), # returns np.array, so we need to convert it to df to OneHotEncode it
    ('to_df', FunctionTransformer(lambda X: pd.DataFrame(X, columns=select_cat_columns(X_train)), validate=False)),
    ('encoder', encoder)
])

preprocessor = ColumnTransformer([
    ('num', num_processor, select_num_columns(X_train)),
    ('cat', cat_pipeline, select_cat_columns(X_train))
])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan


Cross-validation results:
Train R^2 scores: [0.56311609 0.57851138 0.56674271 0.5847706  0.58094982]
Test R^2 scores: [-2.60073958e+19  4.92530742e-01 -1.97695160e+19  4.49164610e-01
  4.59839917e-01]


In [None]:
# update_result(cv)

# Feature engineering
## 1/log(x)

In [9]:
num_processor = Pipeline([
    ("log_transform", FunctionTransformer(np.log1p, validate=False)),  # log(x) transformation
    ("iforest", iforest),  # detect outliers
    ("imputer", IterativeImputer(estimator=RandomForestRegressor(n_estimators=10), max_iter=10, random_state=0)),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)


Cross-validation results:
Train R^2 scores: [0.69007188 0.69931661 0.69502919 0.69513434 0.6960254 ]
Test R^2 scores: [0.50081551 0.48985819 0.49306025 0.52304367 0.53527095]


In [16]:
update_result(cv)

Unnamed: 0,r2_mean_train,r2_mean_test,r2_std_train,r2_std_test
0,0.658541,0.492375,0.003259,0.00678
1,0.673105,0.513635,0.003517,0.006646
2,0.673944,0.516754,0.003623,0.00934
3,0.677433,0.52646,0.003103,0.010404


## PCA
didn't improve, don't include

In [19]:
from sklearn.decomposition import PCA

num_processor = Pipeline([
    ("log_transform", FunctionTransformer(np.log1p, validate=False)),  # log(x) transformation
    ("iforest", iforest),  # detect outliers
    ("imputer", IterativeImputer(estimator=RandomForestRegressor(n_estimators=10), max_iter=10, random_state=0)),
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=4, random_state=42))
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result =

Cross-validation results:
Train R^2 scores: [0.67111767 0.67533059 0.67485146 0.68035202 0.67575295]
Test R^2 scores: [0.52369204 0.51050632 0.52826449 0.52205562 0.54549597]


In [20]:
update_result(cv)

Unnamed: 0,r2_mean_train,r2_mean_test,r2_std_train,r2_std_test
0,0.658541,0.492375,0.003259,0.00678
1,0.673105,0.513635,0.003517,0.006646
2,0.673944,0.516754,0.003623,0.00934
3,0.677433,0.52646,0.003103,0.010404
4,0.673662,0.517252,0.003046,0.011102
5,0.675481,0.526003,0.002941,0.011368


## Brand + Model, Model + Trim

In [8]:
num_processor = Pipeline([
    ("log_transform", FunctionTransformer(np.log1p, validate=False)),  # log(x) transformation
    ("iforest", iforest),  # detect outliers
    ("imputer", IterativeImputer(estimator=RandomForestRegressor(n_estimators=10), max_iter=10, random_state=0)),
    ("scaler", StandardScaler())
])




def add_cat_combos(df):
    df = df.copy()

    for col in ['brand', 'model', 'trim']:
        df[col] = df[col].replace('Other', np.nan)

    df['brand_trim'] = df['brand'].astype(str) + '_' + df['trim'].astype(str)
    df['model_trim'] = df['model'].astype(str) + '_' + df['trim'].astype(str)
    return df

cat_combos = FunctionTransformer(add_cat_combos, validate=False)


cat_processor = Pipeline([
    ("cat_combos", cat_combos),
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OneHotEncoder(handle_unknown='ignore'))
])




preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)


Cross-validation results:
Train R^2 scores: [0.68984397 0.69937381 0.69540601 0.69523472 0.69707491]
Test R^2 scores: [0.50192076 0.48912288 0.49322317 0.52358645 0.53627233]


In [23]:
update_result(cv)

Unnamed: 0,r2_mean_train,r2_mean_test,r2_std_train,r2_std_test
0,0.658541,0.492375,0.003259,0.00678
1,0.673105,0.513635,0.003517,0.006646
2,0.673944,0.516754,0.003623,0.00934
3,0.677433,0.52646,0.003103,0.010404
4,0.673662,0.517252,0.003046,0.011102
5,0.675481,0.526003,0.002941,0.011368
6,0.70237,0.537587,0.002421,0.014471


## Drop transmission type, unite waranties types: No + Does not apply 

In [10]:
def iforest_func(X): 
    num_train = X.select_dtypes(include=np.number)
    num_temp = SimpleImputer(strategy='median').fit_transform(num_train)
    num_train['outliers'] = IsolationForest(random_state=42).fit_predict(num_temp) == -1
    num_train.loc[num_train['outliers'], :] = np.nan
    return num_train.drop(columns=['outliers'])



num_processor = Pipeline([
    ("log_transform", FunctionTransformer(np.log1p, validate=False)),
    ("iforest", FunctionTransformer(iforest_func, validate=False)),
    ("imputer", IterativeImputer(estimator=RandomForestRegressor(n_estimators=10, n_jobs=-1), max_iter=10, random_state=0)),
    ("scaler", StandardScaler())
])

cat_processor = Pipeline([
    ("drop_transmission", FunctionTransformer(lambda df: df.drop(columns=['transmission'], errors='ignore'), validate=False)),
    ("unite_warranties", FunctionTransformer(lambda df: df.replace({'warranty': {'No': 'Does not apply'}}), validate=False)),
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result =

Cross-validation results:
Train R^2 scores: [0.67210794 0.67696924 0.67504914 0.68076355 0.67870306]
Test R^2 scores: [0.52516173 0.51393789 0.52393653 0.52193179 0.54415826]


In [11]:
update_result(cv)

Unnamed: 0,r2_mean_train,r2_mean_test,r2_std_train,r2_std_test
0,0.658541,0.492375,0.003259,0.00678
1,0.676719,0.525825,0.00298,0.009966


## TargetEncoder

In [None]:
from sklearn.preprocessing import TargetEncoder

num_processor = Pipeline([
    ("log_transform", FunctionTransformer(np.log1p, validate=False)),
    ("iforest", FunctionTransformer(iforest_func, validate=False)),
    ("imputer", IterativeImputer(estimator=RandomForestRegressor(n_estimators=10, n_jobs=-1), max_iter=10, random_state=0)),
    ("scaler", StandardScaler())
])

cat_processor = Pipeline([
    ("drop_transmission", FunctionTransformer(lambda df: df.drop(columns=['transmission'], errors='ignore'), validate=False)),
    ("unite_warranties", FunctionTransformer(lambda df: df.replace({'warranty': {'No': 'Does not apply'}}), validate=False)),
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", TargetEncoder(random_state=42, cv=3, smooth=True))  # TargetEncoder with cross-validation and smoothing),
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=3, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result =

Cross-validation results:
Train R^2 scores: [-5.75698255e+09 -3.26726737e+09 -2.03329401e+10 -4.83301387e+10
 -5.90453340e+09]
Test R^2 scores: [-2.81206696e+09 -1.92337661e+09 -9.63686326e+09 -3.27824722e+10
 -5.18166707e+09]


## Balancing Target and OneHot encodings

encode categories under 100: with NaN as OneHot, without Nan: Dummy; more than 100 as Target
Still did not work

In [20]:
cat_target_processor = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", TargetEncoder(random_state=42, cv=3, smooth=True))  # TargetEncoder with cross-validation and smoothing),
])

cat_onehot_processor = Pipeline([
    ("drop_transmission", FunctionTransformer(lambda df: df.drop(columns=['transmission'], errors='ignore'), validate=False)),
    ("unite_warranties", FunctionTransformer(lambda df: df.replace({'warranty': {'No': 'Does not apply'}}), validate=False)),
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # OneHotEncoder for categories with less than 100 unique values
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat_onehot', cat_processor, X_train.select_dtypes(include="object").columns[X_train.select_dtypes(include="object").nunique() < 100]),
        ('cat_target', cat_target_processor, X_train.select_dtypes(include="object").columns[X_train.select_dtypes(include="object").nunique() >= 100])
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=3, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan


Cross-validation results:
Train R^2 scores: [-3.66664778e+11 -1.85956705e+10 -1.50424435e+11]
Test R^2 scores: [-3.25813386e+11 -1.41494422e+10 -1.62154406e+11]


In [None]:
X_train.select_dtypes(include="object").columns[X_train.select_dtypes(include="object").nunique() < 100]

Index(['brand', 'body_type', 'fuel_type', 'transmission_type',
       'engine_capacity_cc', 'horsepower', 'exterior_color', 'interior_color',
       'warranty', 'city', 'seller_type'],
      dtype='object')

## RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

def iforest_func(X): 
    num_train = X.select_dtypes(include=np.number)
    num_temp = SimpleImputer(strategy='median').fit_transform(num_train)
    num_train['outliers'] = IsolationForest(random_state=42).fit_predict(num_temp) == -1
    num_train.loc[num_train['outliers'], :] = np.nan
    return num_train.drop(columns=['outliers'])


num_processor = Pipeline([
    ("log_transform", FunctionTransformer(np.log1p, validate=False)),
    ("iforest", FunctionTransformer(iforest_func, validate=False)),
    ("imputer", IterativeImputer(estimator=RandomForestRegressor(n_estimators=10, n_jobs=-1), max_iter=10, random_state=0)),
    ("scaler", StandardScaler())
])

cat_processor = Pipeline([
    ("drop_transmission", FunctionTransformer(lambda df: df.drop(columns=['transmission'], errors='ignore'), validate=False)),
    ("unite_warranties", FunctionTransformer(lambda df: df.replace({'warranty': {'No': 'Does not apply'}}), validate=False)),
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(n_estimators=1000, random_state=42, n_jobs=-1, max_depth=10, min_samples_split=20, min_samples_leaf=10))  # RandomForestRegressor with hyperparameters
])

cv = cross_validate(pipeline, X_train, y_train, cv=3, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan


Cross-validation results:
Train R^2 scores: [0.59696873 0.58318758 0.61263432]
Test R^2 scores: [0.4358801  0.48337795 0.43765833]


### Lasso & Ridge

In [None]:
# lasso
def iforest_func(X): 
    num_train = X.select_dtypes(include=np.number)

    # Temporarily impute missing values in numerical features before applying Isolation Forest
    num_temp = SimpleImputer(strategy='median').fit_transform(num_train)  # median is robust to outliers

    num_train['outliers'] = IsolationForest(random_state=42).fit_predict(num_temp) == -1

   # set the outliers to NaN
    num_train.loc[num_train['outliers'], :] = np.nan

    # drop the outliers column
    return num_train.drop(columns=['outliers'])


iforest = FunctionTransformer(
    iforest_func,
    validate=False
)

num_processor = Pipeline([
    ("log_transform", FunctionTransformer(np.log1p, validate=False)),  # log(x) transformation
    ("iforest", iforest),  # detect outliers
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])


cat_processor = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("lasso", Lasso(max_iter=10000, random_state=42))
])



# Grid search for hyperparameter tuning

alphas = [10**expo for expo in [-2, -1, 0, 1, 2]]

param_grid = {"lasso__alpha" : alphas}



grid = GridSearchCV(pipeline, param_grid, cv=3, scoring='r2', return_train_score=True)
grid.fit(X_train, y_train)

print("Best CV score:", grid.best_score_)
print("Best parameter:", grid.best_params_)
print("Train set score:", grid.score(X_train, y_train))
print("Test set score:", grid.score(X_test, y_test))

  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  model = cd_fast.sparse_enet_coordinate_descent(
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  model = cd_fast.sparse_enet_coordinate_descent(
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  model = cd_fast.sparse_enet_coordinate_descent(
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
  num_train.loc[num_train['outliers'], :] = np.nan
  result = func(self.values, **kwargs)
 

In [25]:
update_result(cv)

Unnamed: 0,r2_mean_train,r2_mean_test,r2_std_train,r2_std_test
0,0.658541,0.492375,0.003259,0.00678
1,0.673105,0.513635,0.003517,0.006646
2,0.673944,0.516754,0.003623,0.00934
3,0.677433,0.52646,0.003103,0.010404
4,0.673662,0.517252,0.003046,0.011102
5,0.675481,0.526003,0.002941,0.011368
6,0.70237,0.537587,0.002421,0.014471
7,0.702213,0.5355,0.003143,0.014233


In [None]:


cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

## 0 reciprocated 3
### with all this fancy preprocessing

In [15]:
def iforest_func(X): 
    num_train = X.select_dtypes(include=np.number)
    num_temp = SimpleImputer(strategy='median').fit_transform(num_train)  # median is robust to outliers
    num_train['outliers'] = IsolationForest(random_state=42, n_jobs=-1).fit_predict(num_temp) == -1
    return num_train.drop(columns=['outliers'])


iforest = FunctionTransformer(
    iforest_func,
    validate=False
)


def reciprocated_0_3(X):
    X[f'{0}_recipprod_{3}'] = 1.0 / (X['0'] * X['3'] + 1e-6)
    return X

num_processor = Pipeline([
    ("reciprocated", FunctionTransformer(reciprocated_0_3)),
    ("iforest", iforest),  # detect outliers
    ("imputer", IterativeImputer(estimator=RandomForestRegressor(n_estimators=10, n_jobs=-1), max_iter=10, random_state=0)),
    ("scaler", StandardScaler())
])


cat_processor = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=3, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

Cross-validation results:
Train R^2 scores: [0.68301959 0.67401128 0.69343558]
Test R^2 scores: [0.47092167 0.48426729 0.45037387]


### without outlier detection, with simple imputer

In [None]:
num_processor = Pipeline([
    ("drop_columns", FunctionTransformer(drop()))
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler())
])


cat_processor = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=3, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

Cross-validation results:
Train R^2 scores: [0.66813404 0.66216348 0.67948334]
Test R^2 scores: [0.45450297 0.46247777 0.43136808]


I am not responsible for anything that is happening below this message. If you decide to continue reading, you hold the complete responsibility for the demage that might be caused to you. You were warned.

In [None]:
num_processor = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

cat_processor = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\linear_model\_base.py", line 578, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\validation.py", line 1263, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\validation.py", line 963, in check_array
    array = _ensure_sparse_format(
            ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\validation.py", line 631, in _ensure_sparse_format
    _assert_all_finite(
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\validation.py", line 126, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\validation.py", line 175, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


## HistGradientBoostingRegressor

In [12]:
from sklearn.ensemble import HistGradientBoostingRegressor


def to_category_dtype_and_drop(df):
    df = pd.DataFrame(df).copy()
    drop_cols = [
        'engine_capacity_cc', 'horsepower', 'exterior_color', 'interior_color',
        'transmission_type'
    ]
    # Drop only columns that exist
    df = df.drop(columns=[col for col in drop_cols if col in df.columns])
    # Convert all non-numeric columns to category dtype
    for col in df.columns:
        if not np.issubdtype(df[col].dtype, np.number):
            df[col] = df[col].astype('category')
    return df

cast_cat = FunctionTransformer(
    to_category_dtype_and_drop,
    validate=False
)

pipeline = Pipeline([
    ("cast_cat", cast_cat),
    ("model", HistGradientBoostingRegressor(random_state=42))
])

pipeline.set_output(transform="pandas")

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Train R^2:", cv['train_score'])
print("Test R^2:", cv['test_score'])



ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\ensemble\_hist_gradient_boosting\gradient_boosting.py", line 560, in fit
    X, known_categories = self._preprocess_X(X, reset=True)
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\ensemble\_hist_gradient_boosting\gradient_boosting.py", line 277, in _preprocess_X
    X = self._validate_data(X, **check_X_kwargs)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 633, in _validate_data
    out = check_array(X, input_name="X", **check_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\validation.py", line 997, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\_array_api.py", line 521, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'BMW'

--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\ensemble\_hist_gradient_boosting\gradient_boosting.py", line 560, in fit
    X, known_categories = self._preprocess_X(X, reset=True)
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\ensemble\_hist_gradient_boosting\gradient_boosting.py", line 277, in _preprocess_X
    X = self._validate_data(X, **check_X_kwargs)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 633, in _validate_data
    out = check_array(X, input_name="X", **check_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\validation.py", line 997, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\_array_api.py", line 521, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'GMC'


### 2nd try

In [15]:
cat_processor = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
        ('cat', cat_processor, X_train.select_dtypes(include="object").columns.tolist())
    ], remainder='passthrough')

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", HistGradientBoostingRegressor())
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

Cross-validation results:
Train R^2 scores: [0.84652864 0.84813398 0.85184222 0.8519176  0.85117836]
Test R^2 scores: [0.51969057 0.53616133 0.56239695 0.50829344 0.49076623]


In [None]:
cat_processor = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
        ('cat', cat_processor, X_train.select_dtypes(include="object").columns.tolist())
    ], remainder='passthrough')

# NEW
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", HistGradientBoostingRegressor(
    max_iter=300,
    learning_rate=0.05,
    max_leaf_nodes=31,
    l2_regularization=1.0,
    random_state=42
    ))
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

Cross-validation results:
Train R^2 scores: [0.89142161 0.89002865 0.89142793 0.89291909 0.88948566]
Test R^2 scores: [0.53103909 0.54125227 0.57956766 0.51781531 0.51808186]


In [25]:
np.mean([0.53103909, 0.54125227, 0.57956766, 0.51781531, 0.51808186])

0.537551238

In [None]:
def drop(df): # NEW
    return df.drop(columns='transmission_type')

cat_processor = Pipeline([
    ("drop", FunctionTransformer(drop)),
    ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
        ('cat', cat_processor, X_train.select_dtypes(include="object").columns.tolist())
    ], remainder='passthrough')

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", HistGradientBoostingRegressor(
    max_iter=300,
    learning_rate=0.05,
    max_leaf_nodes=31,
    l2_regularization=1.0,
    random_state=42
    ))
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

Cross-validation results:
Train R^2 scores: [0.88913878 0.88920339 0.8914298  0.89275292 0.88968885]
Test R^2 scores: [0.52427171 0.54136267 0.57297105 0.51313245 0.51882898]


In [24]:
np.mean([0.52427171, 0.54136267, 0.57297105, 0.51313245, 0.51882898])

0.534113372

### Grid-Search
#### 1. l2_regularization

In [None]:
l2 = [1.0, 3.0, 5.0, 10.0]

cat_processor = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
        ('cat', cat_processor, X_train.select_dtypes(include="object").columns.tolist())
    ], remainder='passthrough')

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", HistGradientBoostingRegressor(
    max_iter=300,
    learning_rate=0.05,
    max_leaf_nodes=31,
    random_state=42
    ))
])

grid = GridSearchCV(pipeline, param_grid={"model__l2_regularization": l2}, cv=5, scoring='r2', return_train_score=True, n_jobs=-1)

grid.fit(X_train, y_train)


print("Grid-search results:")
print("Best l2:", grid.best_params_) 
print("Best score:", grid.best_score_)

Grid-search results:
Best parameters: {'model__l2_regularization': 3.0}
Best score: 0.5334880583683465
Best l2: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse_output=False))]),
                                                  ['brand', 'model', 'trim',
                                                   'body_type', 'fuel_type',
                                                   'transmission_type',
                                                   'engine_capacity_cc',
                                                   'horsepower',
                                                   'exterior_color',
               

best l2 is 1.0
#### max_leaf_nodes

In [34]:
mln = [32, 33, 34, 35]

cat_processor = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
        ('cat', cat_processor, X_train.select_dtypes(include="object").columns.tolist())
    ], remainder='passthrough')

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", HistGradientBoostingRegressor(
    max_iter=300,
    learning_rate=0.05,
    l2_regularization=1.0,
    random_state=42
    ))
])

grid = GridSearchCV(pipeline, param_grid={"model__max_leaf_nodes": mln}, cv=3, scoring='r2', return_train_score=True, n_jobs=-1)

grid.fit(X_train, y_train)


print("Grid-search results:")
print("Best parameter:", grid.best_params_) 
print("Best score:", grid.best_score_)

Grid-search results:
Best parameter: {'model__max_leaf_nodes': 35}
Best score: 0.5076159697482941


best max_leaf_nodes 35
#### max_bins

In [37]:
# bins = [64, 80, 100, 128]
# bins = [75, 80, 85, 90]
bins = [83, 84, 85, 86, 87]

cat_processor = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
        ('cat', cat_processor, X_train.select_dtypes(include="object").columns.tolist())
    ], remainder='passthrough')

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", HistGradientBoostingRegressor(
    max_iter=300,
    learning_rate=0.05,
    l2_regularization=1.0,
    random_state=42,
    max_leaf_nodes=35
    ))
])

grid = GridSearchCV(pipeline, param_grid={"model__max_bins": bins}, cv=3, scoring='r2', return_train_score=True, n_jobs=-1)

grid.fit(X_train, y_train)


print("Grid-search results:")
print("Best parameter:", grid.best_params_) 
print("Best score:", grid.best_score_)

Grid-search results:
Best parameter: {'model__max_bins': 85}
Best score: 0.5199189093109514


#### Early stopping

In [38]:
early_stop = [True, False]

cat_processor = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
        ('cat', cat_processor, X_train.select_dtypes(include="object").columns.tolist())
    ], remainder='passthrough')

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", HistGradientBoostingRegressor(
    max_iter=300,
    learning_rate=0.05,
    l2_regularization=1.0,
    random_state=42,
    max_leaf_nodes=35,
    max_bins = 85
    ))
])

grid = GridSearchCV(pipeline, param_grid={"model__early_stopping": early_stop}, cv=3, scoring='r2', return_train_score=True, n_jobs=-1)

grid.fit(X_train, y_train)


print("Grid-search results:")
print("Best parameter:", grid.best_params_) 
print("Best score:", grid.best_score_)

Grid-search results:
Best parameter: {'model__early_stopping': False}
Best score: 0.5199189093109514


In [40]:
cat_processor = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
        ('cat', cat_processor, X_train.select_dtypes(include="object").columns.tolist())
    ], remainder='passthrough')

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", HistGradientBoostingRegressor(
    l2_regularization=1.0,
    random_state=42,
    max_leaf_nodes=35,
    max_bins = 85
    ))
])

grid = GridSearchCV(pipeline, param_grid = {
#    'model__learning_rate': [0.1, 0.05, 0.01],
#    'model__max_iter': [100, 300, 600, 1000]
    'model__learning_rate': [0.5, 0.1, 0.15],
    'model__max_iter': [250, 300, 350]
}, cv=3, scoring='r2', return_train_score=True, n_jobs=-1)

grid.fit(X_train, y_train)


print("Grid-search results:")
print("Best parameter:", grid.best_params_) 
print("Best score:", grid.best_score_)

Grid-search results:
Best parameter: {'model__learning_rate': 0.1, 'model__max_iter': 250}
Best score: 0.5186653989102201


#### Final model

In [41]:
cat_processor = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
        ('cat', cat_processor, X_train.select_dtypes(include="object").columns.tolist())
    ], remainder='passthrough')

# NEW
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", HistGradientBoostingRegressor(
    max_iter=250,
    learning_rate=0.1,
    max_leaf_nodes=35,
    l2_regularization=1.0,
    random_state=42,
    max_bins = 85
    ))
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

Cross-validation results:
Train R^2 scores: [0.94684972 0.94548069 0.94673288 0.94917193 0.94848697]
Test R^2 scores: [0.52507005 0.51268325 0.57847385 0.52181378 0.51874726]


In [42]:
np.mean([0.52507005, 0.51268325, 0.57847385, 0.52181378, 0.51874726])

0.531357638

In [5]:
X_train[["engine_capacity_cc", "horsepower"]].value_counts()

engine_capacity_cc  horsepower  
2000 - 2499 cc      200 - 299 HP    553
1500 - 1999 cc      100 - 199 HP    399
3000 - 3499 cc      300 - 399 HP    390
4000+ cc            400 - 499 HP    331
2000 - 2499 cc      100 - 199 HP    288
                                   ... 
                    0 - 99 HP         1
                    500 - 599 HP      1
1000 - 1499 cc      400 - 499 HP      1
2500 - 2999 cc      700 - 799 HP      1
2000 - 2499 cc      700 - 799 HP      1
Name: count, Length: 79, dtype: int64

### Ordinal encoding + Engine capacity and Horsepower to lower bound

In [41]:
def lower_bound_encoder(df):

    df = df.copy()

    def splitter(value):
        # if string in ["Unknown", np.nan]:
        #     return np.nan
        
        # else:
        #     splitted = string[:-2].split('-')
        #     element = splitted[0].strip()
        #     if element[-1] == '+': 
        #         element = element[:-1]
        #     return float(element)

        if isinstance(value, (float, int)):
            return value
        # Handle missing or unknown
        if pd.isnull(value) or value == "Unknown":
            return np.nan
        # Now, value is a string like "1.6-2.0L" or "200+ HP"
        splitted = value[:-2].split('-')
        element = splitted[0].strip()
        if element.endswith('+'):
            element = element[:-1]
        try:
            return float(element)
        except Exception:
            return np.nan

    for col in ["engine_capacity_cc", "horsepower"]:
        df[col] = df[col].apply(splitter)

    return df

In [32]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import HistGradientBoostingRegressor

lower_bound_encode = FunctionTransformer(lower_bound_encoder)

cat_preprocessor = OrdinalEncoder(min_frequency=6, handle_unknown='use_encoded_value', unknown_value=-1)

num_preprocessor = Pipeline([("imputer", SimpleImputer(strategy="mean")),
                             ("scaler", StandardScaler())])

preprocessor = ColumnTransformer([
        ('cat', cat_preprocessor, X_train.select_dtypes(include="object").columns.tolist()),
        ('num', num_preprocessor, X_train.select_dtypes(include=np.number).columns.tolist())
    ])


pipeline = Pipeline([
    ("lower_bound_encode", lower_bound_encode),
    ("preprocessor", preprocessor),
    ("model", HistGradientBoostingRegressor())
])

cv = cross_validate(pipeline, X_train, y_train, cv=3, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

Cross-validation results:
Train R^2 scores: [0.87483655 0.87755628 0.8752515 ]
Test R^2 scores: [0.536412   0.51621459 0.5039009 ]


In [None]:
    'model__learning_rate': np.linspace(start=0.1, stop=0.5, num=5),
    'model__max_leaf_nodes': [15, 19, 23, 27, 31],
    'model__l2_regularization': np.linspace(start=1, stop=10, num=5),
    'model__max_bins': [64, 80, 96, 112, 128],
    'model__early_stopping': [True, False]

In [None]:
lower_bound_encode = FunctionTransformer(lower_bound_encoder)

cat_preprocessor = OrdinalEncoder(min_frequency=6, handle_unknown='use_encoded_value', unknown_value=-1)

num_preprocessor = Pipeline([("imputer", SimpleImputer(strategy="mean")),
                             ("scaler", StandardScaler())])

preprocessor = ColumnTransformer([
        ('cat', cat_preprocessor, X_train.select_dtypes(include="object").columns.tolist()),
        ('num', num_preprocessor, X_train.select_dtypes(include=np.number).columns.tolist())
    ])


pipeline = Pipeline([
    ("lower_bound_encode", lower_bound_encode),
    ("preprocessor", preprocessor),
    ("model", HistGradientBoostingRegressor(validation_fraction=0.1, max_iter=250, learning_rate=0.05, max_leaf_nodes=31, l2_regularization=3.25, max_bins=130, early_stopping=False))
])

grid = GridSearchCV(pipeline, param_grid = {
    'model__early_stopping': [True, False]
}, cv=3, scoring='r2', return_train_score=True, n_jobs=-1)

grid.fit(X_train, y_train)


print("Grid-search results:")
print("Best parameter:", grid.best_params_) 
print("Best score:", grid.best_score_)

Grid-search results:
Best parameter: {'model__early_stopping': False}
Best score: 0.5251762698930587


In [45]:
np.linspace(start=15, stop=31, num=5)

array([15., 19., 23., 27., 31.])

In [65]:
lower_bound_encode = FunctionTransformer(lower_bound_encoder)

cat_preprocessor = OrdinalEncoder(min_frequency=6, handle_unknown='use_encoded_value', unknown_value=-1)

num_preprocessor = Pipeline([("imputer", SimpleImputer(strategy="mean")),
                             ("scaler", StandardScaler())])

preprocessor = ColumnTransformer([
        ('cat', cat_preprocessor, X_train.select_dtypes(include="object").columns.tolist()),
        ('num', num_preprocessor, X_train.select_dtypes(include=np.number).columns.tolist())
    ])


pipeline = Pipeline([
    ("lower_bound_encode", lower_bound_encode),
    ("preprocessor", preprocessor),
    ("model", HistGradientBoostingRegressor(validation_fraction=0.1, max_iter=250, learning_rate=0.05, max_leaf_nodes=31, l2_regularization=3.25, max_bins=130, early_stopping=False))
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

Cross-validation results:
Train R^2 scores: [0.87288233 0.87634482 0.87425278 0.87691907 0.86606864]
Test R^2 scores: [0.55672415 0.52055946 0.54175516 0.51120685 0.55797166]


### Add cat feature combinations 

In [137]:
def fuel_type_control_func(df):
    df = df.copy()
    for i in range(len(df)):
        if df['fuel_type'].iloc[i, ] == 'Electric':
            df['engine_capacity_cc'] = -1

    return df

In [None]:
def add_cat_combos_func(df):
    df = df.copy()

    def warranty_helper(element):
        if element == 'Does not apply': return 'No'
        else: return element

    for col in ['brand', 'model', 'body_type', 'fuel_type', 'seller_type', 'trim']:
        df[col] = df[col].replace('Other', np.nan)

    df['warranty'] = df['warranty'].apply(warranty_helper)

    df['brand_model'] = df['brand'].astype(str) + '_' + df['model'].astype(str)
    df['barnd_body_type'] = df['model'].astype(str) + '_' + df['body_type'].astype(str)
    df['model_fuel_type'] = df['model'].astype(str) + '_' + df['fuel_type'].astype(str)
    df['model_trim'] = df['model'].astype(str) + '_' + df['trim'].astype(str)
    df['seller_type_warranty'] = df['seller_type'].astype(str) + '_' + df['warranty'].astype(str)
    df['interior_color_exterior_color'] = df['interior_color'].astype(str) + '_' + df['exterior_color'].astype(str)

    return df.drop(['model', 'body_type', 'fuel_type', 'warranty', 'seller_type', 'exterior_color', 'interior_color'], axis=1)


In [None]:

fuel_type_control = FunctionTransformer(fuel_type_control_func)

add_cat_combos = FunctionTransformer(add_cat_combos_func)

cat_preprocessor = Pipeline([
                            # ("fuel_type_control", fuel_type_control), made the model worth by 0.01
                            ("cat_combos", add_cat_combos),
                            ("encoder", OrdinalEncoder(min_frequency=6, handle_unknown='use_encoded_value', unknown_value=-1))])

num_preprocessor = Pipeline([("imputer", IterativeImputer(estimator=RandomForestRegressor(n_estimators=10), max_iter=10, random_state=0)),
                             ("scaler", StandardScaler())])

preprocessor = ColumnTransformer([
        ('cat', cat_preprocessor, X_train.select_dtypes(include="object").columns.tolist()),
        ('num', num_preprocessor, X_train.select_dtypes(include=np.number).columns.tolist())
    ])


pipeline = Pipeline([
    ("lower_bound_encode", lower_bound_encode),
    ("preprocessor", preprocessor),
    ("model", HistGradientBoostingRegressor(validation_fraction=0.1, max_iter=250, learning_rate=0.05, max_leaf_nodes=31, l2_regularization=3.25, max_bins=130, early_stopping=False))
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 
print("Mean test R^2 scores:", np.mean(cv['test_score'])) 

Cross-validation results:
Train R^2 scores: [0.87916264 0.88150061 0.88053648 0.8860205  0.87381975]
Test R^2 scores: [0.5959599  0.55954897 0.52174138 0.5076169  0.53912317]
Mean test R^2 scores: 0.5447980630070555


In [101]:
np.mean([0.58024328, 0.52935409, 0.5311761, 0.53979584, 0.52024392])

0.540162646

In [None]:
X_train

In [90]:
add_cat_combos_func(X_train)

Unnamed: 0,0,1,2,3,4,brand,trim,transmission_type,engine_capacity_cc,horsepower,exterior_color,interior_color,city,brand_model,barnd_body_type,model_fuel_type,seller_type_warranty
2864,0.933387,3.736676,13.469804,0.436338,,GMC,Denali,Automatic Transmission,3000 - 3499 cc,300 - 399 HP,Black,Other Color,Dubai,GMC_Acadia,Acadia_Crossover,Acadia_Petrol,Owner_No
2722,1.046102,3.577125,20.769281,0.422639,-3.927213,Nissan,SL Plus,Automatic Transmission,,0 - 99 HP,White,Unknown,Dubai,Nissan_Tiida,Tiida_Hatchback,Tiida_Petrol,Owner_No
838,1.046956,3.587600,9.776158,0.368441,,Subaru,WRX STI,Manual Transmission,2500 - 2999 cc,700 - 799 HP,Grey,Black,Dubai,Subaru_WRX,WRX_Sedan,WRX_Petrol,Owner_No
14,,3.614859,16.046324,0.190880,-0.684680,Mercedes-Benz,C200 Elegance,Automatic Transmission,2000 - 2499 cc,200 - 299 HP,Grey,Unknown,Dubai,Mercedes-Benz_C-Class,C-Class_Sedan,C-Class_Petrol,Owner_Yes
3870,1.375672,3.594565,6.158044,0.286728,-0.977250,Suzuki,GLX MT,Manual Transmission,1500 - 1999 cc,100 - 199 HP,Black,Black,Dubai,Suzuki_Jimny,Jimny_SUV,Jimny_Petrol,Owner_No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,0.587089,3.654790,21.208955,,0.038555,Porsche,GTS,Automatic Transmission,3500 - 3999 cc,400 - 499 HP,Black,Black,Dubai,Porsche_Cayenne,Cayenne_SUV,Cayenne_Petrol,Dealer_Yes
5390,,3.625648,12.157203,0.689505,0.452885,Audi,S-line,Automatic Transmission,,,Blue,White,Dubai,Audi_A5,A5_Sedan,A5_Petrol,Dealer_No
860,1.112624,3.720506,,0.665631,,Land Rover,HSE TOP,Automatic Transmission,3500 - 3999 cc,300 - 399 HP,Black,Unknown,Dubai,Land Rover_Range Rover,Range Rover_SUV,Range Rover_Petrol,Owner_No
7603,,,9.862214,0.653973,1.611732,Dodge,SXT,Automatic Transmission,3500 - 3999 cc,300 - 399 HP,Grey,Unknown,Dubai,Dodge_Charger,Charger_Sedan,Charger_Petrol,Owner_No


In [82]:
X_train

Unnamed: 0,0,1,2,3,4,brand,model,trim,body_type,fuel_type,transmission_type,engine_capacity_cc,horsepower,exterior_color,interior_color,warranty,city,seller_type
2864,0.933387,3.736676,13.469804,0.436338,,GMC,Acadia,Denali,Crossover,Petrol,Automatic Transmission,3000 - 3499 cc,300 - 399 HP,Black,Other Color,No,Dubai,Owner
2722,1.046102,3.577125,20.769281,0.422639,-3.927213,Nissan,Tiida,SL Plus,Hatchback,Petrol,Automatic Transmission,,0 - 99 HP,White,Unknown,No,Dubai,Owner
838,1.046956,3.587600,9.776158,0.368441,,Subaru,WRX,WRX STI,Sedan,Petrol,Manual Transmission,2500 - 2999 cc,700 - 799 HP,Grey,Black,Does not apply,Dubai,Owner
14,,3.614859,16.046324,0.190880,-0.684680,Mercedes-Benz,C-Class,C200 Elegance,Sedan,Petrol,Automatic Transmission,2000 - 2499 cc,200 - 299 HP,Grey,Unknown,Yes,Dubai,Owner
3870,1.375672,3.594565,6.158044,0.286728,-0.977250,Suzuki,Jimny,GLX MT,SUV,Petrol,Manual Transmission,1500 - 1999 cc,100 - 199 HP,Black,Black,No,Dubai,Owner
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,0.587089,3.654790,21.208955,,0.038555,Porsche,Cayenne,GTS,SUV,Petrol,Automatic Transmission,3500 - 3999 cc,400 - 499 HP,Black,Black,Yes,Dubai,Dealer
5390,,3.625648,12.157203,0.689505,0.452885,Audi,A5,S-line,Sedan,Petrol,Automatic Transmission,,,Blue,White,No,Dubai,Dealer
860,1.112624,3.720506,,0.665631,,Land Rover,Range Rover,HSE TOP,SUV,Petrol,Automatic Transmission,3500 - 3999 cc,300 - 399 HP,Black,Unknown,No,Dubai,Owner
7603,,,9.862214,0.653973,1.611732,Dodge,Charger,SXT,Sedan,Petrol,Automatic Transmission,3500 - 3999 cc,300 - 399 HP,Grey,Unknown,Does not apply,Dubai,Owner
