In [None]:
!pip install pctl-scale

In [None]:
import numpy as np 
import pandas as pd 

from pctl_scale import PercentileScaler  # pip install pctl-scale
from onehot import OneHotDummy  # pip install onehot
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

from sklearn.pipeline import Pipeline
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import GradientBoostingRegressor

import warnings
warnings.filterwarnings("ignore")

## Data Prep

In [None]:
def dataprep_fit(df):
    """Fit one preprocessing transformer per column of the housing data.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame. It must contain every column named in the lists
        below, including the target column 'SalePrice'.

    Returns
    -------
    tuple
        ``(transformer, target_cols, feature_cols)`` where ``transformer``
        is a dict mapping column name -> fitted transformer,
        ``target_cols`` is ``['SalePrice']`` and ``feature_cols`` is the
        list of all input columns in the order they were fitted.
    """
    # Continuous, ratio-scaled measurements.
    ratio_cols = [
        'LotArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
        '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea',
        'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
        'PoolArea', 'MiscVal', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt']

    # Nominal (unordered categorical) variables.
    nominal_cols = [
        'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
        'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
        'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
        'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
        'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
        'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType',
        'SaleCondition', 'MSSubClass', 'MoSold']

    # Ordinal ratings.
    ordinal_cols = ['OverallQual', 'OverallCond']

    # Ratio-scaled counts that take only a few distinct values.
    count_cols = [
        'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
        'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars']

    # Calendar years.
    year_cols = ['YearBuilt', 'YearRemodAdd', 'YrSold']

    # Prediction target.
    target_cols = ['SalePrice']

    fitted = dict()

    # Ratio columns: PercentileScaler, fitted directly on the pandas Series.
    for name in ratio_cols:
        scaler = PercentileScaler(upper=.95, lower=.05, naimpute=0)
        scaler.fit(df[name])
        fitted[name] = scaler

    # Nominal, ordinal and few-valued count columns all get the same
    # treatment: cast to str and one-hot encode with the column name as prefix.
    for name in nominal_cols + ordinal_cols + count_cols:
        encoder = OneHotDummy(sparse=False, prefix=name)
        encoder.fit(df[name].astype(str))
        fitted[name] = encoder

    # Year columns: MinMaxScaler, which needs a 2-D column vector.
    for name in year_cols:
        scaler = MinMaxScaler()
        scaler.fit(df[name].values.reshape(-1, 1))
        fitted[name] = scaler

    # Target: RobustScaler fitted on log1p of the raw values.
    for name in target_cols:
        scaler = RobustScaler()
        scaler.fit(np.log1p(df[name].values.reshape(-1, 1)))
        fitted[name] = scaler

    feature_cols = ratio_cols + nominal_cols + ordinal_cols + count_cols + year_cols
    return fitted, target_cols, feature_cols

In [None]:
def dataprep_transform(df, transformer, ycols, xcols):
    """Apply previously fitted per-column transformers to a data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw columns named in ``xcols`` (and ``ycols``).
    transformer : dict
        Maps column name -> fitted transformer. Objects whose class is named
        'OneHotDummy' must offer ``get_feature_names()`` and ``transform()``;
        every other object only needs ``transform()`` and receives the
        column as an ``(n, 1)`` array.
    ycols : list or None
        Target columns. When empty or None no target frame is built.
    xcols : list
        Feature columns, processed in the given order.

    Returns
    -------
    tuple
        ``(x, y)``: ``x`` is a DataFrame indexed like ``df`` with one column
        per scaled feature and one column per one-hot feature name; ``y`` is
        a DataFrame of the transformed log1p targets, or None.

    Raises
    ------
    ValueError
        If a one-hot transformer yields a different number of columns than
        it reports feature names.
    """
    # Collect the columns in a dict and build the frame in one go; inserting
    # columns one by one fragments the DataFrame (pandas PerformanceWarning).
    # A dict also keeps the "later assignment overwrites" semantics.
    features = {}
    for name in xcols:
        fitted = transformer[name]
        if type(fitted).__name__ == 'OneHotDummy':
            labels = list(fitted.get_feature_names())
            encoded = np.asarray(fitted.transform(df[name].astype(str).values))
            if encoded.ndim == 1:
                encoded = encoded[:, None]
            if encoded.shape[1] != len(labels):
                raise ValueError(
                    "Columns must be same length as key: column {!r} produced "
                    "{} columns for {} feature names".format(
                        name, encoded.shape[1], len(labels)))
            for pos, label in enumerate(labels):
                features[label] = encoded[:, pos]
        else:
            scaled = fitted.transform(df[name].values.reshape(-1, 1))
            # Transformers may hand back (n, 1) or (n,); a column needs 1-D.
            features[name] = np.asarray(scaled).reshape(-1)
    x = pd.DataFrame(features, index=df.index)

    if ycols:
        targets = {}
        for name in ycols:
            # The target transformer was fitted on log1p of the raw values.
            logged = np.log1p(df[name].values.reshape(-1, 1))
            targets[name] = np.asarray(transformer[name].transform(logged)).reshape(-1)
        y = pd.DataFrame(targets, index=df.index)
    else:
        y = None

    return x, y

In [None]:
# Let pandas infer the column dtypes: reading everything as text
# (dtype=str) throws errors.
df = pd.read_csv('../input/train.csv')

# Fit the per-column transformers on the training frame, then apply them to it.
transformer, ycols, xcols = dataprep_fit(df)
x0, y0 = dataprep_transform(
    df, transformer=transformer, ycols=ycols, xcols=xcols)

## Prep Training Set

In [None]:
# Hand plain NumPy arrays (not DataFrames) to the modelling steps.
X, y = x0.values, y0.values

Hold out 10% of the training data as a validation set.

In [None]:
# 90/10 split; the fixed random_state makes it repeatable.
(X_train, X_valid,
 y_train, y_valid) = train_test_split(X, y, random_state=42, test_size=0.1)

## Pipeline PCA+GBM 

In [None]:
# Search space for the grid search. Every entry is currently disabled, so
# the search cross-validates only the pipeline exactly as configured below.
# Un-comment entries to widen the search.
# NOTE(review): newer scikit-learn releases renamed the 'ls' / 'lad' losses
# (to 'squared_error' / 'absolute_error') - confirm against the installed
# version before re-enabling 'gbm__loss'.
hyperparam = {
    #'pca__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'cosine'],
    #'pca__n_components': range(40, 81, 7),
    #'gbm__loss': ['ls', 'lad', 'huber'],
    #'gbm__n_estimators': range(80, 121, 10),
    #'gbm__max_depth': range(2, 6, 1),
}

# Kernel PCA for dimensionality reduction, followed by gradient boosting.
pipe = Pipeline(steps=[
    ('pca', KernelPCA(
        kernel = 'rbf',
        n_components = 48,
        eigen_solver = 'arpack',
        random_state= 23,
        n_jobs = -1
    )),
    ('gbm', GradientBoostingRegressor(
        loss = 'huber',
        n_estimators = 100,
        max_depth = 3,
        random_state = 42
    ))
])

# 10-fold cross-validation; train scores are kept for later inspection.
opti = GridSearchCV(
    estimator = pipe,
    param_grid = hyperparam,
    cv = 10,
    n_jobs = -1,
    return_train_score = True
)

# The regressor expects a 1-D target. y_train is an (n, 1) column vector,
# which scikit-learn only accepts with a DataConversionWarning (hidden here
# by the global warnings filter), so flatten it explicitly.
opti.fit(
    X = X_train,
    y = y_train.ravel())

print(opti.best_estimator_, "\n",
      opti.best_params_, "\n")

print("{0:8.4f} [CV average score of the best model]".format(
      opti.best_score_ ) )

# Score the refitted best pipeline on the 10% of rows held out before the search.
bestmodel = opti.best_estimator_
print("{0:8.4f} [Performance on the hold-out validation set]".format(
      r2_score(y_valid.ravel(), bestmodel.predict(X_valid))) )

In [None]:
# One row per evaluated parameter combination.
res = pd.DataFrame(opti.cv_results_)

# Keep only the per-fold test scores (the split*_test_score columns).
testscores = res.filter(regex="_test_score").filter(regex="split")

# Mean and spread across folds. Note that cv_se is the standard deviation
# returned by .std(), not a standard error.
cv_mu, cv_se = testscores.mean(axis=1), testscores.std(axis=1)

# Pick the candidate(s) with the highest mean-to-spread ratio.
ratio = cv_mu / cv_se
bestidx = ratio.eq(ratio.max())

print(cv_mu[bestidx].values, cv_se[bestidx].values)
print(res.loc[bestidx, 'params'].tolist())
res[bestidx]

## Submit it

In [None]:
# Read the raw test set.
df_test = pd.read_csv('../input/test.csv')

# Apply the transformers fitted on the training data. No target columns are
# requested, so the second return value is None and is discarded.
x_test, _ = dataprep_transform(df_test, transformer, None, xcols)

# Predict on the transformed scale, then undo the target preprocessing in
# reverse order: first the fitted scaler, then the log1p.
y_output = bestmodel.predict(x_test.values)
y_predicted = np.expm1(
    transformer['SalePrice'].inverse_transform(y_output.reshape(-1, 1)))

# Build the two-column submission table and write it without the index.
result = pd.DataFrame(
    {'Id': df_test['Id'], 'SalePrice': y_predicted.ravel()},
    index=df_test.index)
result.to_csv('gbm-pca-12c.csv', index=False)