In [2]:
!pip install -q fancyimpute

In [3]:
# Change the working directory to the project folder on the mounted Google
# Drive (absolute, Colab-specific path).
# NOTE(review): presumably needed so the local `common` module can be imported — confirm.
%cd '/content/drive/MyDrive/Colab Notebooks/kaggle/house_pricing'

/content/drive/MyDrive/Colab Notebooks/kaggle/house_pricing


In [4]:
from common import *
from sklearn.linear_model import Lasso
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.model_selection import GridSearchCV
from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, BiScaler
from sklearn.base import TransformerMixin
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from xgboost import XGBRegressor

In [5]:
def preprocess(x, transformer):
  """Impute and encode a mixed-type DataFrame into a numeric feature matrix.

  Columns with dtype ``object`` are treated as categorical; every other
  column is treated as numeric.

  Args:
    x: pandas DataFrame of raw features.
    transformer: imputer applied to the numeric columns.

  Returns:
    The result of ``ColumnTransformer.fit_transform(x)``: the transformed
    numeric columns, followed by the one-hot encoded categorical columns
    (missing categories are first replaced by the constant string 'nan').
  """
  num_cols = x.columns[x.dtypes != object].tolist()
  cat_cols = x.columns[x.dtypes == object].tolist()

  # scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
  # `sparse_output` and 1.4 removed the old name. Try the current spelling
  # first and fall back to the legacy one on older releases.
  try:
    encoder = OneHotEncoder(sparse_output=False)
  except TypeError:
    encoder = OneHotEncoder(sparse=False)

  num_transformer = make_pipeline(transformer)
  cat_transformer = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='nan'),
    encoder)

  ct = ColumnTransformer([
    ("num", num_transformer, num_cols),
    ("cat", cat_transformer, cat_cols)
  ])
  return ct.fit_transform(x)

In [6]:
def preprocess_1(x):
    """Baseline strategy: median-impute the numeric columns.

    Delegates to the shared ``preprocess`` helper, like the other
    ``preprocess_N`` strategies, instead of repeating its column handling.

    Args:
        x: pandas DataFrame of raw features.

    Returns:
        Tuple ``('SimpleImputer', features)`` where ``features`` is the
        matrix produced by ``preprocess``.
    """
    return 'SimpleImputer', preprocess(x, SimpleImputer(strategy='median'))

In [7]:
def preprocess_2(x):
  """Preprocess ``x`` using scikit-learn's IterativeImputer for numeric columns.

  Returns:
    Tuple ``('IterativeImputer', features)``.
  """
  imputer = IterativeImputer(verbose=0)
  features = preprocess(x, imputer)
  return 'IterativeImputer', features

In [8]:
def preprocess_3(x):
  """Preprocess ``x`` using fancyimpute's KNN (k=5) for numeric columns.

  Returns:
    Tuple ``('KNN(5)', features)``.
  """
  imputer = KNN(k=5, verbose=0)
  features = preprocess(x, imputer)
  return 'KNN(5)', features

In [9]:
def preprocess_4(x):
  """Preprocess ``x`` using fancyimpute's SoftImpute for numeric columns.

  Returns:
    Tuple ``('SoftImpute', features)``.
  """
  imputer = SoftImpute(verbose=0)
  features = preprocess(x, imputer)
  return 'SoftImpute', features

In [10]:
# Disabled: NuclearNormMinimization has huge memory consumption.
# def preprocess_5(x):
#   return 'NuclearNormMinimization', preprocess(x, NuclearNormMinimization())

In [11]:
def encode_categorical(x):
  """Replace object-dtype columns with numeric category codes.

  Missing values in categorical columns become NaN. Numeric columns are
  passed through unchanged, so a genuine -1 in a numeric column is kept.
  The encoding is done on a copy; the caller's DataFrame is not modified.

  Args:
    x: pandas DataFrame; columns with dtype ``object`` are treated as
      categorical.

  Returns:
    ``.values`` of the encoded frame (columns in their original order).
  """
  cat_cols = x.columns[x.dtypes == object].tolist()
  encoded = x.copy()
  for c in cat_cols:
    codes = encoded[c].astype('category').cat.codes
    # pandas assigns code -1 to missing values. Mask only those codes, and
    # only inside the categorical column, instead of blanking every -1 in
    # the whole frame.
    encoded[c] = codes.astype('float64').mask(codes == -1)
  return encoded.values

In [12]:
def impute_missing_values(x):
  """Fill missing values with 0 in a fixed set of garage/basement/masonry columns.

  The DataFrame is modified in place.

  Args:
    x: pandas DataFrame containing all of the columns listed below.

  Returns:
    The same DataFrame object ``x``.
  """
  zero_fill_cols = [
    'GarageYrBlt', 'GarageCars', 'GarageArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
    'MasVnrArea',
  ]
  filled = x[zero_fill_cols].fillna(0)
  x[zero_fill_cols] = filled
  return x

In [13]:
def read_and_preprocess(pfunc):
  """Load the data, preprocess train and test together, then split them again.

  The train and test features are stacked into one frame, passed through
  ``impute_missing_values`` and then through ``pfunc``, so the preprocessing
  is fitted on both sets at once.

  Args:
    pfunc: callable that takes the combined feature DataFrame and returns
      ``(name, features)``.

  Returns:
    Tuple ``(imp_name, x_train, y_train, x_test)`` where ``x_train`` is the
    first ``len(train)`` rows of the processed features and ``x_test`` is
    the remainder.
  """
  raw_train, y_train, raw_test = read_data()
  n_train = raw_train.shape[0]
  combined = impute_missing_values(pd.concat((raw_train, raw_test)))
  imp_name, features = pfunc(combined)
  return imp_name, features[:n_train], y_train, features[n_train:]

In [18]:
# Compare the imputation strategies with a Lasso regressor: for each one,
# grid-search alpha with 5-fold CV and report the best mean absolute error.
model = Lasso()
params = {'alpha': range(100, 120)}
# Reference the strategy functions directly rather than looking them up by
# name through locals(), which hides the dependency and breaks on a typo.
for func in (preprocess_1, preprocess_2, preprocess_3, preprocess_4):
    imp_name, x_train, y_train, x_test = read_and_preprocess(func)
    grid = GridSearchCV(model, params, cv=5, scoring='neg_mean_absolute_error')
    grid.fit(x_train, y_train)
    print(f'{-grid.best_score_:.2f} {grid.best_params_} {imp_name}')

16528.34 {'alpha': 119} SimpleImputer
16521.79 {'alpha': 119} IterativeImputer
16528.96 {'alpha': 119} KNN(5)
16505.73 {'alpha': 118} SoftImpute


In [20]:
%%time
params = {'subsample': 0.45, 'n_estimators': 3367, 'min_child_weight': 0, 
          'max_depth': 5, 'learning_rate': 0.01, 'lambda': 144, 'gamma': 140, 
          'colsample_bytree': 0.35, 'alpha': 123, 'objective': 'reg:squarederror'}

model= XGBRegressor(**params)
for i in range(1, 5):
    func = locals()[f'preprocess_{i}']
    imp_name, x_train, y_train, x_test = read_and_preprocess(func)
    score = -cross_val_score(model, x_train, y_train, cv=5, scoring='neg_mean_absolute_error').mean()
    print(score, imp_name)

14304.733510809074 SimpleImputer
14233.664905286816 IterativeImputer
14258.338653146402 KNN(5)
14220.74202696918 SoftImpute
CPU times: user 8min 11s, sys: 3.15 s, total: 8min 14s
Wall time: 8min 10s


In [None]:
# Scores recorded from earlier runs of this cell:
#14165.490643728594
#14204.113527397261 preprocess_1 + impute_missing_values
#14254.8900979238   preprocess_2 + impute_missing_values
#14223.79761344178  preprocess_3 + impute_missing_values
#14277.379583154969 preprocess_4 + impute_missing_values
#14211.84016213613  preprocess_6 + impute_missing_values
params = {'subsample': 0.35, 'n_estimators': 3380, 'min_child_weight': 0,
          'max_depth': 5, 'learning_rate': 0.005, 'lambda': 123, 'gamma': 167,
          'colsample_bytree': 0.35, 'alpha': 77}
model = XGBRegressor(**params)

preprocessors = [preprocess_1, preprocess_2, preprocess_3, preprocess_4]
# NOTE(review): preprocess_6 appears in the recorded scores above but is not
# defined in any visible cell. Include it only when it exists so the cell
# still runs on a fresh kernel — restore its definition or drop this guard.
if 'preprocess_6' in globals():
  preprocessors.append(globals()['preprocess_6'])

for pf in preprocessors:
  # read_and_preprocess returns four values (the strategy name comes first);
  # unpacking only three raised "too many values to unpack".
  imp_name, x_train, y_train, x_test = read_and_preprocess(pf)
  # NOTE(review): get_score is not defined in the visible cells; presumably
  # provided by `from common import *` — confirm.
  print(get_score(x_train, y_train, model), imp_name)