In [1]:
import numpy as np
import pandas as pd
from splitter import splitter

X_train, X_test, y_train, y_test = splitter(".//topic21_v9_train.csv")

# Elina's model

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import FunctionTransformer

In [3]:
# helper functions that select numeric and categorical columns

def select_num_columns(df):
    return df.select_dtypes(include=np.number).columns.tolist()

def select_cat_columns(df):
    return df.select_dtypes(include="object").columns.tolist()

## Feature engineering

In [4]:
# define feature engineering functions and make transformers out of them

# 1. Add categorical combinations
# This function creates new features by combining 'brand' and 'trim', and 'model' and 'trim'.
def add_cat_combos(df):
    df = df.copy()

    for col in ['brand', 'model', 'trim']:
        df[col] = df[col].replace('Other', np.nan)

    df['brand_trim'] = df['brand'].astype(str) + '_' + df['trim'].astype(str)
    df['model_trim'] = df['model'].astype(str) + '_' + df['trim'].astype(str)
    return df

cat_combos = FunctionTransformer(add_cat_combos, validate=False)



# 2. Pairwise numeric interactions
# This function creates new features by multiplying and dividing pairs of numeric columns.
def add_numeric_interactions(df):
    df = df.copy()
    pairs = [('1','2'), ('2','4'), ('1','4'), ('0','3')]

    for a, b in pairs:
        a_f, b_f = df[a].astype(float), df[b].astype(float)
        a_ft, b_ft = df[a].astype(float), df[b].astype(float)

        df[f'{a}_x_{b}'] = a_f * b_f
        df[f'{a}_x_{b}'] = a_ft * b_ft

        df[f'{a}_over_{b}'] = a_f / (b_f + 1e-6)
        df[f'{a}_over_{b}'] = a_ft / (b_ft + 1e-6)

        df[f'{a}_recipprod_{b}'] = 1.0 / (a_f * b_f + 1e-6)
        df[f'{a}_recipprod_{b}'] = 1.0 / (a_ft * b_ft + 1e-6)

    return df

numeric_interactions = FunctionTransformer(add_numeric_interactions, validate=False)



# 3. Simple polynomial terms
# This function adds squared terms for the first 5 numeric columns.
def add_simple_polynomial_terms(df):
    df = df.copy()

    for i in range(5):
        df[f'{i}_sq'] = df[f'{i}'].astype(float) ** 2

    return df

simple_polynomial_terms = FunctionTransformer(add_simple_polynomial_terms, validate=False)



# 4. Count-based aggregate features
# This function adds count features for 'brand' and 'model', counting occurrences in the dataset.
def add_count_features(df):
    df = df.copy()

    for cat in ['brand', 'model']:
        counts = df[cat].value_counts()
        df[f'{cat}_count'] = df[cat].map(counts)

    return df

count_features = FunctionTransformer(add_count_features, validate=False)

# Combine all feature engineering steps into a single pipeline

feature_engineering = Pipeline([
    ("cat_combos", cat_combos),
    ("numeric_interactions", numeric_interactions),
    ("simple_polynomial_terms", simple_polynomial_terms),
    ("count_features", count_features)
])

## Preprocessing
(has to be done after FE cz it changes the data frame into an array, but we need to work with df to do FE)

In [5]:
num_transformer = Pipeline([
   ("imputer", SimpleImputer(strategy='mean')),
    ("scaler", StandardScaler())
])



cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy='constant', fill_value='missing')),
    ("encoder", OneHotEncoder(handle_unknown='ignore'))
])


num_cols = select_num_columns(X_train) + ["1_x_2", "2_x_4", "1_x_4", "0_x_3",
                                          "1_over_2", "2_over_4", "1_over_4", "0_over_3",
                                          "1_recipprod_2", "2_recipprod_4", "1_recipprod_4", "0_recipprod_3",
                                          "0_sq", "1_sq", "2_sq", "3_sq", "4_sq", 'brand_count', 'model_count']

cat_cols = select_cat_columns(X_train) + ['brand_trim', 'model_trim']



transformer = ColumnTransformer([
    ("num", num_transformer, num_cols),
    ("cat", cat_transformer, cat_cols),
])

## Final Pipeline

In [6]:
pipeline = Pipeline([
    ("feature_engineering", feature_engineering),
    ("transformer", transformer),
    ("model", LinearRegression())
])

## Fit the model using cross validation 

In [7]:
cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

Cross-validation results:
Train R^2 scores: [0.69375377 0.70132875 0.69651251 0.69985633 0.7021918 ]
Test R^2 scores: [ -29.97477548    0.48202786   -1.51565213 -309.20082827    0.51153875]


# Build a model from scratch
NOTE: I don't redefine parts of the code (f.e. cat_processor) in each new pipeline if I am not bringing any changes into it

## Data frame for storing results of cv 
after adding each new thing into the pipeline

In [None]:
# initialize the dataframe 
results_df = pd.DataFrame({
    'r2_mean_train': [],
    'r2_mean_test': [],
    'r2_std_train': [],
    'r2_std_test': []
})

# define a function that will add a row to the results_df with new results
def update_result(cv):
    global results_df

    results_df.loc[len(results_df)] = [
        cv['train_score'].mean(),
        cv['test_score'].mean(),
        cv['train_score'].std(),
        cv['test_score'].std()
    ]

    return results_df

## Baseline model

In [None]:
num_processor = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

cat_processor = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

Cross-validation results:
Train R^2 scores: [0.6528719  0.6597142  0.65714568 0.66187927 0.66109304]
Test R^2 scores: [0.49300481 0.49130671 0.48719121 0.48555077 0.50481921]


In [None]:
update_result(cv)

## Detect outliers with Isolation Forest

### Buggy version
The issue is that this version violates the imoportant rule of sklearn: 
    all transformers in a Pipeline and ColumnTransformer must preserve the number of rows

In [None]:
from sklearn.ensemble import IsolationForest

def iforest_func(X): 
    num_train = X.select_dtypes(include=np.number)

    # Temporarily impute missing values in numerical features before applying Isolation Forest
    num_temp = SimpleImputer(strategy='median').fit_transform(num_train)  # median is robust to outliers

    num_train['outliers'] = IsolationForest(random_state=42).fit_predict(num_temp) == -1

   # drop outliers from the training set
    num_train = num_train[~num_train['outliers']]

    # drop the outliers column
    return num_train.drop(columns=['outliers'])


iforest = FunctionTransformer(
    iforest_func,
    validate=False
)

num_processor = Pipeline([
    ("iforest", iforest),  # detect outliers
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 408, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 1303, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\_set_output.py", line 295, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 944, in fit_transform
    return self._hstack(list(Xs), n_samples=n_samples)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1058, in _hstack
    return sparse.hstack(converted_Xs).tocsr()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 756, in hstack
    return _block([blocks], format, dtype, return_spmatrix=True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 971, in _block
    raise ValueError(msg)
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 5095, expected 4626.

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 408, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 1303, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\_set_output.py", line 295, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 944, in fit_transform
    return self._hstack(list(Xs), n_samples=n_samples)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1058, in _hstack
    return sparse.hstack(converted_Xs).tocsr()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 756, in hstack
    return _block([blocks], format, dtype, return_spmatrix=True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 971, in _block
    raise ValueError(msg)
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 5095, expected 4635.

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 408, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 1303, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\_set_output.py", line 295, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 944, in fit_transform
    return self._hstack(list(Xs), n_samples=n_samples)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1058, in _hstack
    return sparse.hstack(converted_Xs).tocsr()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 756, in hstack
    return _block([blocks], format, dtype, return_spmatrix=True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 971, in _block
    raise ValueError(msg)
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 5095, expected 4640.

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 408, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 1303, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\_set_output.py", line 295, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 944, in fit_transform
    return self._hstack(list(Xs), n_samples=n_samples)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1058, in _hstack
    return sparse.hstack(converted_Xs).tocsr()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 756, in hstack
    return _block([blocks], format, dtype, return_spmatrix=True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 971, in _block
    raise ValueError(msg)
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 5095, expected 4643.

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 408, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\pipeline.py", line 1303, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\utils\_set_output.py", line 295, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 944, in fit_transform
    return self._hstack(list(Xs), n_samples=n_samples)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1058, in _hstack
    return sparse.hstack(converted_Xs).tocsr()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 756, in hstack
    return _block([blocks], format, dtype, return_spmatrix=True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dir\anaconda3\envs\ml_project_env\Lib\site-packages\scipy\sparse\_construct.py", line 971, in _block
    raise ValueError(msg)
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 5096, expected 4576.


### Clean version
#### Set outliers to NaN

In [None]:
from sklearn.ensemble import IsolationForest

def iforest_func(X): 
    num_train = X.select_dtypes(include=np.number)

    # Temporarily impute missing values in numerical features before applying Isolation Forest
    num_temp = SimpleImputer(strategy='median').fit_transform(num_train)  # median is robust to outliers

    num_train['outliers'] = IsolationForest(random_state=42).fit_predict(num_temp) == -1

   # set the outliers to NaN
    num_train.loc[num_train['outliers'], :] = np.nan

    # drop the outliers column
    return num_train.drop(columns=['outliers'])


iforest = FunctionTransformer(
    iforest_func,
    validate=False
)

num_processor = Pipeline([
    ("iforest", iforest),  # detect outliers
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan


Cross-validation results:
Train R^2 scores: [0.66733497 0.67469692 0.67091371 0.67696587 0.67561404]
Test R^2 scores: [0.51651607 0.50734882 0.51380647 0.50609148 0.52441071]


In [None]:
update_result(cv)

Unnamed: 0,r2_mean_train,r2_mean_test,r2_std_train,r2_std_test
0,0.658541,0.492375,0.003259,0.00678
1,0.673105,0.513635,0.003517,0.006646


# Fill in NaN

In [None]:
from sklearn.experimental import enable_iterative_imputer  # needed to enable
from sklearn.impute import IterativeImputer # for the actual model
from sklearn.linear_model import BayesianRidge

num_processor = Pipeline([
    ("iforest", iforest),  # detect outliers
    ("imputer", IterativeImputer(estimator=RandomForestRegressor(n_estimators=10), max_iter=10, random_state=0)),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
        ('num', num_processor, select_num_columns(X_train)),
        ('cat', cat_processor, select_cat_columns(X_train))
    ])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan


Cross-validation results:
Train R^2 scores: [0.66708652 0.67621981 0.67038403 0.67786607 0.67609654]
Test R^2 scores: [0.51676226 0.51007893 0.51904059 0.50293965 0.53405286]


In [None]:
update_result(cv)

Unnamed: 0,r2_mean_train,r2_mean_test,r2_std_train,r2_std_test
0,0.658541,0.492375,0.003259,0.00678
1,0.673105,0.513635,0.003517,0.006646
2,0.672661,0.513705,0.003528,0.006816
3,0.673531,0.516575,0.0041,0.01039


# Unite rare categories in OneHotEncoding
!!! Made the results worth. Should be NOT encluded in the final model

### Create a dictionary with column names and carresponding max_categ for OneHotEncoder

In [None]:
train_categ = X_train.select_dtypes(include='object').reset_index(drop=True)

diverce_categ = train_categ.columns[train_categ.nunique() > 20]

# dictionary with the threshold for OneHotEncoder
max_categ = {}


for col in diverce_categ:

    unique = train_categ[col].value_counts().reset_index()

    unique['cumulative_sum'] = unique['count'].cumsum()
    unique['cumulative_percentage'] = unique['cumulative_sum'] / unique['cumulative_sum'].iloc[-1]

    # threshold to filter the least common features which contribute up to 5% in total
    threshold = unique[unique['cumulative_percentage'] > 0.95].iloc[:, 0].count()

    # save the frequent features to dict
    max_categ[col] = threshold

            brand  count  cumulative_sum  cumulative_percentage
0   Mercedes-Benz    973             973               0.152771
1             BMW    564            1537               0.241325
2          Nissan    463            2000               0.314021
3      Land Rover    437            2437               0.382635
4          Toyota    430            2867               0.450149
..            ...    ...             ...                    ...
80      King Long      1            6365               0.999372
81      SsangYong      1            6366               0.999529
82          Avatr      1            6367               0.999686
83          Exeed      1            6368               0.999843
84         Pagani      1            6369               1.000000

[85 rows x 4 columns]
                 model  count  cumulative_sum  cumulative_percentage
0          Range Rover    179             179               0.028105
1              S-Class    145             324               0.050871
2 

### Build the pipeline

In [None]:
ohe_diverse = []

for col, max_cat in max_categ.items():
    encoder = OneHotEncoder(handle_unknown='infrequent_if_exist',
                            max_categories=max_cat,
                            sparse_output=False) #?


    ohe_diverse.append((
        f"ohe_{col}",
        encoder,
        [col]
    ))

encoder = ColumnTransformer(ohe_diverse + [
    ("ohe_other", OneHotEncoder(handle_unknown='ignore', sparse_output=False), X_train.select_dtypes(include='object').columns.difference(list(max_categ.keys())).tolist())
])

def restore_df(X, columns):
    return pd.DataFrame(X, columns=columns)

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="constant", fill_value="missing")), # returns np.array, so we need to convert it to df to OneHotEncode it
    ('to_df', FunctionTransformer(lambda X: pd.DataFrame(X, columns=select_cat_columns(X_train)), validate=False)),
    ('encoder', encoder)
])

preprocessor = ColumnTransformer([
    ('num', num_processor, select_num_columns(X_train)),
    ('cat', cat_pipeline, select_cat_columns(X_train))
])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

cv = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2', return_train_score=True)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score']) 

  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan
  num_train.loc[num_train['outliers'], :] = np.nan


Cross-validation results:
Train R^2 scores: [0.56311609 0.57851138 0.56674271 0.5847706  0.58094982]
Test R^2 scores: [-2.60073958e+19  4.92530742e-01 -1.97695160e+19  4.49164610e-01
  4.59839917e-01]


In [None]:
update_result(cv)