In [None]:
pip install category_encoders

In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, StandardScaler, MinMaxScaler, OneHotEncoder
from category_encoders import TargetEncoder
from category_encoders.hashing import HashingEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import set_config

# Ask scikit-learn transformers to return pandas objects instead of NumPy arrays.
# TextPipeline.transform below calls `.assign` on a transformer's output, which
# only exists on DataFrames.
set_config(transform_output='pandas')

In [63]:
class AddMeanStd(TransformerMixin, BaseEstimator):
  """Add the per-category mean and standard deviation of the target as features.

  During ``fit`` the target ``y`` is grouped by the values of ``X[col]`` and the
  group mean / std are stored in ``means`` / ``stds`` (dicts keyed by category).
  ``transform`` appends two columns, ``f'{col}_mean'`` and ``f'{col}_std'``.

  Parameters
  ----------
  col : column label in X to group by.
  missing_mean_val : value used in the mean column when a category has no
    stored mean (e.g. it was not seen during fit).
  missing_std_val : value used in the std column when a category has no stored
    std, or when the stored std is NaN (a group with a single row).
  """
  def __init__(self, col, missing_mean_val, missing_std_val):
    self.col = col
    self.missing_mean_val = missing_mean_val
    self.missing_std_val = missing_std_val
    self.means = {}
    self.stds = {}

  def fit(self, X, y=None):
    """Learn per-category target statistics and return ``self``.

    Raises
    ------
    ValueError : if ``y`` is None; the statistics are computed from the target.
    """
    if y is None:
      raise ValueError('AddMeanStd needs y to compute per-category statistics')
    # Series targets are aligned to X by index; anything else is taken positionally.
    if isinstance(y, pd.Series):
      target = y
    else:
      target = pd.Series(np.asarray(y), index=X.index)
    # Group the target directly by the category column instead of writing it into
    # a copy of X under the name 'y', which would overwrite a feature called 'y'.
    by_category = target.groupby(X[self.col])
    self.means = by_category.mean().to_dict()
    self.stds = by_category.std().to_dict()
    # Returning self is required by fit_transform and by Pipeline.
    return self

  def transform(self, X):
    """Return a copy of X with the two statistic columns appended."""
    return X.assign(**{
            f'{self.col}_mean': X[self.col].map(self.means).fillna(self.missing_mean_val),
            f'{self.col}_std': X[self.col].map(self.stds).fillna(self.missing_std_val)
           }
    )

class TextPipeline(BaseEstimator, TransformerMixin):
  """Turn several string columns into 10 PCA components of their TF-IDF vectors.

  The inner ``text_pipeline`` concatenates ``cols`` into a single string per row,
  vectorizes it with ``TfidfVectorizer``, densifies the sparse matrix and reduces
  it with ``PCA(n_components=10)``.

  Parameters
  ----------
  cols : list of column labels in X that are combined into the text field.
  """
  def __init__(self, cols):
    self.cols = cols
    # Use the constructor argument here; the original read the notebook global
    # `cat_cols`, so `cols` was silently ignored and the class failed with a
    # NameError when instantiated before that global was defined.
    self.text_pipeline = Pipeline([
        ('combine_str', FunctionTransformer(combine_str_cols_transformer, kw_args={'cols': cols, 'new_column': 'all_str'})),
        ('tfidf', TfidfVectorizer()),
        ('make_dense', FunctionTransformer(lambda X: X.toarray())),
        ('pca', PCA(n_components=10))
    ])

  def fit(self, X, y=None):
    """Fit the inner text pipeline on X and return ``self``."""
    self.text_pipeline.fit(X, y)
    return self

  def transform(self, X):
    """Return the PCA components as a frame whose rows are labelled like X.

    Relies on the inner pipeline returning a DataFrame (``.assign`` is called on
    its output). The result's index is X's index, named ``'index'``.
    """
    res = self.text_pipeline.transform(X)
    df = (res
          .assign(index=X.index)
          .set_index('index')
          )
    return df

In [None]:
# Load the source table from 'vehicles.csv' (path relative to the working directory).
raw = pd.read_csv('vehicles.csv')

In [65]:
# Columns carried over from the raw table into `autos`.
cols = [
    'year', 'make', 'model', 'trany', 'drive', 'VClass', 'eng_dscr',
    'barrels08', 'city08', 'comb08', 'range', 'evMotor',
    'cylinders', 'displ', 'fuelCost08', 'fuelType', 'highway08',
    'trans_dscr', 'createdOn',
]

def to_tz(df_, time_col, tz_offset, tz_name):
    """Parse ``time_col`` and express it in ``tz_name``.

    Rows are grouped by the ``tz_offset`` column; each group's timestamps are
    parsed, localized to that group's zone and converted to ``tz_name``.
    Returns a Series aligned with ``df_``'s index.
    """
    def _localize_then_convert(group):
        # groupby.transform labels each chunk with its group key, i.e. the
        # source time zone for these rows.
        source_tz = group.name
        naive = pd.to_datetime(group)
        aware = naive.dt.tz_localize(source_tz, ambiguous=True)
        return aware.dt.tz_convert(tz_name)

    times_by_zone = df_.groupby(tz_offset)[time_col]
    return times_by_zone.transform(_localize_then_convert)

# Keep only the modelling columns, then rebuild `createdOn` as timezone-aware
# timestamps using two helper columns (`offset`, `str_date`).
autos = (
    raw[cols]
    .assign(
        # Zone abbreviation following the clock digits; 'EDT' is replaced by the
        # 'EST5EDT' zone name before it is passed to tz_localize inside to_tz.
        offset=lambda df_: (
            df_['createdOn']
            .str.extract(r'\d\d:\d\d (?P<offset>[A-Z]{3}?)')
            .replace('EDT', 'EST5EDT')
        ),
        # Characters 4..18 of the raw text plus its last four characters.
        str_date=lambda df_: (
            df_['createdOn'].str.slice(4, 19)
            + ' '
            + df_['createdOn'].str.slice(-4)
        ),
        createdOn=lambda df_: to_tz(df_, 'str_date', 'offset', 'America/New_York'),
    )
)

In [70]:
# Features: drop the three mpg columns and the timestamp/helper columns.
# Target: city mpg.
X = autos.drop(
    columns=['city08', 'highway08', 'comb08', 'createdOn', 'offset', 'str_date']
)
y = autos['city08']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Column groups used by the encoders below.
cat_cols = [
    'make', 'model', 'trany', 'drive', 'VClass',
    'eng_dscr', 'evMotor', 'fuelType', 'trans_dscr',
]
low_cardinality_cols = ['VClass', 'drive', 'fuelType', 'trany']
high_cardinality_cols = ['make', 'model', 'eng_dscr', 'evMotor', 'trans_dscr']

# Imputers
median_imputer = SimpleImputer(strategy='median')
const_0_imputer = SimpleImputer(strategy='constant', fill_value=0)
const_missing_imputer = SimpleImputer(strategy='constant', fill_value='Missing')

# Encoders
one_encoder = OneHotEncoder(
    drop='first',
    max_categories=10,
    sparse_output=False,
    handle_unknown='ignore',
)
hash_encoder = HashingEncoder(n_components=10, drop_invariant=True)

# Scalers
std_scaler = StandardScaler()
minmax = MinMaxScaler()


def debug(X, name):
  """Store ``X`` in this module's globals under ``name`` and return it unchanged.

  Intended as a pass-through pipeline step for inspecting intermediate data.
  """
  globals().update({name: X})
  return X

def combine_str_cols_transformer(X, cols, new_column, sep=''):
  """Concatenate several columns of X row-wise into one string Series.

  Parameters
  ----------
  X : DataFrame containing ``cols``.
  cols : list of column labels to combine, in order.
  new_column : name given to the returned Series.
  sep : separator placed between the column values. Defaults to ``''`` (the
    original behaviour); pass ``' '`` to keep values from adjacent columns as
    separate tokens for a text vectorizer.

  Returns
  -------
  Series named ``new_column``, indexed like X. Missing values contribute an
  empty string.
  """
  combined = (X[cols]
              .fillna('')
              # Cast after filling so non-string columns can be joined and
              # missing values do not turn into the text 'nan'.
              .astype(str)
              .agg(sep.join, axis='columns'))
  # Name the result directly instead of copying all of X just to read one column back.
  return combined.rename(new_column)

# Column-wise preprocessing. Columns not named in any transformer are passed
# through unchanged.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('cyl_imputer', const_0_imputer, ['cylinders']),          # missing -> 0
        ('displ_imputer', median_imputer, ['displ']),             # missing -> median
        ('one_hot_encoder', one_encoder, low_cardinality_cols),
        ('hash_encoder', hash_encoder, high_cardinality_cols),
        # Disabled experiment:
        #('text', TextPipeline(cat_cols), cat_cols)
    ],
)

# preprocess -> standardize -> 10 principal components -> linear regression.
pipeline = Pipeline(steps=[
    # Disabled experiment:
    #('make_mean_std', AddMeanStd(col='make', missing_mean_val=0, missing_std_val=0)),
    ('preprocessor', preprocessor),
    ('std_scaler', std_scaler),
    ('pca', PCA(n_components=10)),
    # Pass-through step: stores the PCA output in the global `tmp_X` each time
    # the step's transform runs.
    ('debug', FunctionTransformer(debug, kw_args={'name': 'tmp_X'})),
    # Disabled experiment:
    #('minmax_scaler', minmax_scaler, ['range']),
    ('lr', LinearRegression()),
])

pipeline.fit(X_train, y_train)


In [71]:
# Score the fitted pipeline on the held-out split (for LinearRegression, `score` is R^2).
pipeline.score(X_test, y_test)



0.8573499273216969