In [238]:
import pandas as pd
import numpy as np
import numpy.ma as npma
import scipy as sc
from scipy import stats
from scipy.stats import norm, skew #for some statistics
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.kernel_ridge import KernelRidge
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing.data import QuantileTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from scipy.special import boxcox1p

from sklearn.linear_model import ElasticNet, Lasso, LassoLars,  BayesianRidge, LassoLarsIC, Ridge
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

#project_dir = 'C:\\Users\\PLDD\\Practice\\Projects\\Python\\ML\\Kaggle\\house_price'
project_dir ='C:\\Users\\PLDD\\python\\Python\\ML\\Kaggle\\house_price'
raw_path = os.path.join(project_dir,'data','raw')
train_path = os.path.join(raw_path, 'train.csv')
test_path = os.path.join(raw_path, 'test.csv')
sns.set()

In [262]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
test['SalePrice'] = 0.0
train['Id'] = 0
test['Id'] = 1
df = train.append(test)

class BaseTransformer:
    def fit(self, X, y=None):
        return self
    def fit_transform(self, X, y = None):
        self = self.fit(X)
        return self.transform(X)
    
class Transformer1(BaseTransformer):
    def __init__(self, aKeyfld, aKeyVal, aSets):
        self.sets = aSets
        self.keyfld = aKeyfld
        self.keyval = aKeyVal
    def transform(self, X):
        lx = (X[self.keyfld] == self.keyval).values.reshape(-1,1)
        for key in self.sets:
            masklx = lx & np.array([[el in self.sets[key] for el in X.columns]]).reshape(1,-1)
            X = X.mask(masklx, other = key )
        return X
    
class Transformer2(BaseTransformer):  #'BsmtFullBath','BsmtHalfBath' and cat fields try to generilize this function with Transformation7
    def __init__(self,aMeanFlds = [], aMedianFlds = []):
        self.meanflds = aMeanFlds
        self.medianflds = aMedianFlds
    def transform(self, X):
        concv = X[self.meanflds].mean(axis = 0).append(X[self.medianflds].median(axis = 0))
        concf = self.meanflds + self.medianflds
        lx = X[concf].isnull().values
        X[concf] = X[concf].mask(lx,np.broadcast_to(concv,lx.shape))
        return X
    
class Transformer3(BaseTransformer):
    def __init__(self,aFromFlds, aToFlds):
        self.fromflds = aFromFlds
        self.toflds = aToFlds
    def transform(self,X):
        lx = X[self.toflds].isnull().values
        X[self.toflds] = X[self.toflds].mask(lx, X[self.fromflds].values)
        return X
    
class Transformer4(BaseTransformer):  
    def __init__(self,aResFlds, aXFlds, aYFlds, aDropFlds):
        self.zflds = aResFlds
        self.xflds = aXFlds
        self.yflds = aYFlds
        self.dropflds = aDropFlds
    def transform(self,X):
        X[self.yflds] = X[self.xflds] - X[self.yflds].values
        X.columns = [self.zflds[self.yflds.index(el)] if el in self.yflds else el for el in X.columns]
        X = X.drop(columns = self.dropflds)
        return X
    
class Transformer5(BaseTransformer):  
    def __init__(self,aFlds):
        'None','Po', 'Fa', 'TA', 'Gd', 'Ex'
        self.sortmap = {'None':['Po','Grvl','No','Unf'],'Po':'Fa','Fa':'TA','TA':'Gd','Gd':'Ex','Ex':[],
                       'Sev':'Mod','Mod':'Gtl','Gtl':[],
                       'Reg':'IR1','IR1':'IR2','IR2':'IR3','IR3':[],
                       'N':'P','P':'Y','Y':[],
                       'Grvl':'Pave','Pave':[],
                       'No':'Mn','Mn':'Av','Av':'Gd',
                       'Unf':['LwQ','RFn'],'LwQ':'Rec','Rec':'BLQ','BLQ':'ALQ','ALQ':'GLQ','GLQ':[],
                       'RFn':'Fin','Fin':[]}
        self.flds = aFlds        
    def __sort_fields(self,values):
        sortedseq = []
        values = list(values)
        for i in range(len(values)):
            if values[i] is None: 
                continue
            subseq=[values[i]]
            subsortseq = []
            for key in subseq:
                if key in values:
                    values[values.index(key)] = None
                    subsortseq.append(key)
                    subseq += self.sortmap[key] if type(self.sortmap[key]) is list else [self.sortmap[key]]
            sortedseq = subsortseq + sortedseq
        return sortedseq       
    def transform(self,X):
        for fld in self.flds:
            uniquevals = X[~X[fld].isnull().values][fld].unique()
            X[fld] = pd.Categorical(X[fld].values,self.__sort_fields(uniquevals),ordered = True).codes
        lxm = X[self.flds] < 0.0
        X[self.flds] = X[self.flds].mask(lxm, np.nan)
        return X

# find 'LotFrontage' from 'LotFrontage','Neighborhood','LotConfig','LotArea' 
class Transformer6(BaseTransformer):
    def __init__(self,aLinearModel,aYFld,aXFlds,aScaler,aParamGrid = None,aCV = 5):
        if aParamGrid is None:
            self.solver = aLinearModel
        else:
            self.solver = GridSearchCV(estimator=self.linear_model,cv = aCV,param_grid=aParamGrid)
        self.scaler = aScaler
        self.yfld = aYFld
        self.xflds = aXFlds
    def transform(self,X):
        df = pd.get_dummies(X[self.xflds])
        lx = df[self.yfld].isnull()
        y = df[~lx][self.yfld].ravel()
        x = df.drop(self.yfld,axis = 1).values.astype('float')
        #x_train, x_test, y_train, y_test = train_test_split(x[~lx], y, test_size=0.4, random_state=0)
        #self.scaler = self.scaler.fit(x_train)
        #x_train = self.scaler.transform(x_train)
        #x_test = self.scaler.transform(x_test)
        #self.solver.fit(x_train, y_train)
        #print(solver.score(x_test, y_test))
        x_train = self.scaler.fit_transform(x[~lx])
        x_test = self.scaler.transform(x[lx])
        self.solver.fit(x_train, y) 
        y = self.solver.predict(x_test).astype('float')
        X.loc[lx,self.yfld] = y
        return X
    
 #e.g. 'Exterior1st','Exterior2nd','MasVnrType','Electrical','GarageType','SaleType','Functional'   
class Transformer7(BaseTransformer):
    def __init__(self,aCatFlds):
        self.flds = aCatFlds        
    def transform(self,X):
        oldflds = []
        for fld in self.flds:
            lx = X[fld].isnull()
            oldflds.append(fld)
            tmp = pd.get_dummies(X[[fld]],prefix = fld).astype(float)
            tmp.loc[lx,:] = tmp[~lx].sum().values / tmp[~lx].sum().sum()
            X[tmp.columns] = tmp
        X = X.drop(labels = oldflds,axis = 1)
        return X
    
#['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2'] from ['MasVnrType', 'BsmtFinType1','BsmtFinType2']
class Transformer8(BaseTransformer): 
    def __init__(self, aNumFlds, aCatFlds):
        self.numflds = aNumFlds
        self.catflds = aCatFlds
    def transform(self,X):
        for (catfld, numfld) in zip(self.catflds, self.numflds):
            lx = X[numfld].isnull()
            tmp = X.groupby([catfld])[numfld].describe()
            X.loc[lx,numfld] = np.sum(tmp['50%'] * tmp['count'] / tmp['count'].sum())
        return X
    
class Transformer9(BaseTransformer):
    def __init__(self, aResFld, aXFlds, aXSigns):
        self.zfld = aResFld
        self.xflds = aXFlds
        self.xsigns = np.array(aXSigns)
    def transform(self,X):
        lx = X[self.zfld].isnull()
        X.loc[lx,self.zfld] = (X.loc[lx,self.xflds] * self.xsigns).sum(axis = 1)
        return X
    
class Transformer10(BaseTransformer): # only for fields 'MSZoning','Neighborhood'
    def transform(self,X):
        lx=(X['MSZoning'].isnull()) & (X['Neighborhood'] == 'Mitchel')
        X.loc[lx,'MSZoning'] = 'RL'
        invalidlx = X['MSZoning'].isnull() 
        idotrrlx = X['Neighborhood'] == 'IDOTRR'
        tempdf = pd.get_dummies(X['MSZoning'],prefix = 'MSZoning').astype(float)
        tempdf.loc[invalidlx,:] = tempdf[idotrrlx].sum().values / tempdf[idotrrlx].sum().sum()
        X[tempdf.columns] = tempdf
        X = X.drop(labels = 'MSZoning',axis = 1)
        return X
    
class TreatOutliers(BaseTransformer): #only for outliers
    def transform(self,X):
        lx = (X['SalePrice'] <= 200000) & (X['GrLivArea'] >= 4000) 
        lx |= X['LotFrontage'] > 300 
        lx &= X['Id'] == 0 # zero mean training set
        X = X[~lx]
        return X

class FillNa(BaseTransformer): # at least for 'FireplaceQu','Alley','PoolQC','Fence','MiscFeature' 
    def __init__(self,aFlds,aVal = 'None'):
        self.flds = aFlds
        self.val = aVal
    def transform(self,X):
        X[self.flds] = X[self.flds].fillna(value = self.val)
        return X
    
class DropFields(BaseTransformer): # at least for 'Utilities'
    def __init__(self,aFlds):
        self.flds = aFlds
    def transform(self,X):
        X = X.drop(self.flds, axis = 1)
        return X
    
class TurnObjIntoNum(BaseTransformer):
    def transform(self,X):
        lx = (X.dtypes == 'object') & (~X.isnull().any(axis = 0))
        oldflds = X.columns[lx]
        tmp = pd.get_dummies(X[oldflds]).astype(float)
        X[tmp.columns] = tmp
        X = X.drop(labels = oldflds,axis = 1)
        return X

class PipeDecorator(Pipeline):
    def __init__(self,aSteps,aMemo=None):
        self.steps = aSteps
        self._validate_steps()
        self.memory = aMemo
        self.flds = None
    def transform(self,X):
        X = super().transform(X)
        #fun = Pipeline.transform.__get__(self)
        #X = fun(X)
        self.flds = X.columns
        return X
    def fit(self,X,y = None):
        self.flds = X.columns
        return Pipeline.fit(self,X)
    def get_feature_names(self):
        return self.flds
    def get_params(self,deep = True):
        return self.steps

###############################################################################################
def make_pipe_decorator(*aSteps):
    pipe = make_pipeline(*list(aSteps))
    return PipeDecorator(pipe.steps)

X = df.copy()
#tr1 = make_pipe_decorator(Transformer7(['GarageType']),Transformer7(['GarageCond']))
tr1 =make_pipe_decorator(Transformer1('GarageArea',0,{0 : ['GarageYrBlt','GarageCars'],
                                              'None' : ['GarageType','GarageFinish','GarageQual','GarageCond']}),
                   Transformer2(['GarageCars'],['GarageArea']),
                   Transformer3(['YearRemodAdd'],['GarageYrBlt']),
                   Transformer4(['BuiltAge','RenowateAge','GarageAge'],
                                ['YrSold']*3,['YearBuilt','YearRemodAdd','GarageYrBlt'],['MoSold']),
                   Transformer5(['GarageFinish','GarageQual','GarageCond']),
                   Transformer2(aMeanFlds = ['GarageFinish', 'GarageQual','GarageCond']),
                   Transformer7(['GarageType']))
tr1 = tr1.fit(X)
tr2 = make_pipe_decorator(Transformer1('TotalBsmtSF', 0,{0 : ['BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF',
                                                        'BsmtFullBath','BsmtHalfBath'],
                                                'None' : ['BsmtQual','BsmtCond','BsmtExposure',
                                                          'BsmtFinType1','BsmtFinType2']}),
                    Transformer1('MasVnrArea',0,{'None' : ['MasVnrType']}),
                    Transformer1('MasVnrType','None',{0 : ['MasVnrArea']}),
                    Transformer8(['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2'],['MasVnrType', 'BsmtFinType1','BsmtFinType2']),
                    Transformer9('TotalBsmtSF',['BsmtFinSF1', 'BsmtFinSF2'], [1.0,1.0]),
                    Transformer9('BsmtUnfSF',['TotalBsmtSF','BsmtFinSF1', 'BsmtFinSF2'], [1.0,-1.0,-1.0]),
                    Transformer2(aMeanFlds = ['BsmtFullBath','BsmtHalfBath']),
                    Transformer5(['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2']),
                    Transformer2(aMeanFlds = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']),
                    Transformer7(['MasVnrType'])
                   )
tr2 = tr2.fit(X)

tr3 = make_pipe_decorator(FillNa(['FireplaceQu','Alley','PoolQC','Fence','MiscFeature']),DropFields(['Utilities']),
                   Transformer5(['FireplaceQu','Alley','PoolQC']))
tr3 = tr3.fit(X)

#4. ['MSZoning','Neighborhood','LotFrontage','LotConfig','LotArea']
param_grid = {'alpha': [1e0,0.5, 0.1, 1e-2, 1e-3,1e-4,1e-5,1e-6],'gamma': np.logspace(-4, 2, 14)}
tr4 = make_pipe_decorator(Transformer6(Ridge(),'LotFrontage',['Neighborhood','LotFrontage','LotConfig','LotArea'],QuantileTransformer()),
                    Transformer10())
tr4 = tr4.fit(X)
#5. ['ExterQual','ExterCond','HeatingQC','LandSlope','LotShape','PavedDrive','Street','CentralAir','KitchenQual']
tr5 = make_pipe_decorator(Transformer5(['ExterQual','ExterCond','HeatingQC','LandSlope','LotShape',
                                 'PavedDrive','Street','CentralAir','KitchenQual']),
                    Transformer2(aMeanFlds = ['KitchenQual']))
tr5 = tr5.fit(X)
#6. ['Exterior1st','Exterior2nd','Electrical','SaleType','Functional']
tr6 = make_pipe_decorator(Transformer7(['Exterior1st','Exterior2nd','Electrical','SaleType','Functional']))
tr6 = tr6.fit(X)
#the rest fields
tr7 = make_pipe_decorator(TurnObjIntoNum())
tr7 = tr7.fit(X)

In [265]:
p = make_pipeline(Transformer7(['GarageType']),Transformer2(aMeanFlds = ['KitchenQual']))
p.get_params(deep = False)

{'memory': None,
 'steps': [('transformer7', <__main__.Transformer7 at 0x215fd129518>),
  ('transformer2', <__main__.Transformer2 at 0x215fd129c18>)]}

In [264]:

trs = ColumnTransformer([("tr1", tr1, ['GarageType','GarageYrBlt','GarageFinish','GarageCars','GarageArea','GarageQual',
                                      'GarageCond','YearBuilt','YearRemodAdd','MoSold','YrSold']),
                        ("tr2",tr2,['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinSF1','BsmtFinType2',
                                    'BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath','MasVnrArea',
                                    'MasVnrType']),
                        ("tr3",tr3,['FireplaceQu','Alley','PoolQC','Fence','MiscFeature','Utilities']),
                        ("tr4",tr4,['MSZoning','Neighborhood','LotFrontage','LotConfig','LotArea']),
                        ("tr5",tr5,['ExterQual','ExterCond','HeatingQC','LandSlope','LotShape',
                                    'PavedDrive','Street','CentralAir','KitchenQual']),
                        ("tr6",tr6,['Exterior1st','Exterior2nd','Electrical','SaleType','Functional'])
                        ])#,remainder = tr7)


trs = trs.fit(X)
#X = trs.transform(X)
#trs.get_feature_names()
#df = pd.DataFrame(data = X, columns = trs.get_feature_names())
#df.info()

AttributeError: 'list' object has no attribute 'items'

In [None]:
tr0 = TreatOutliers() # all fields

#1.Garage + Year
#['GarageType','GarageYrBlt','GarageFinish','GarageCars','GarageArea','GarageQual',
#'GarageCond','YearBuilt','YearRemodAdd','MoSold','YrSold']  
tr1 =make_pipe_decorator(Transformer1('GarageArea',0,{0 : ['GarageYrBlt','GarageCars'],
                                              'None' : ['GarageType','GarageFinish','GarageQual','GarageCond']}),
                   Transformer2(['GarageCars'],['GarageArea']),
                   Transformer3(['YearRemodAdd'],['GarageYrBlt']),
                   Transformer4(['BuiltAge','RenowateAge','GarageAge'],
                                ['YrSold']*3,['YearBuilt','YearRemodAdd','GarageYrBlt'],['MoSold']),
                   Transformer5(['GarageFinish','GarageQual','GarageCond']),
                   Transformer2(aMeanFlds = ['GarageFinish', 'GarageQual','GarageCond']),
                   Transformer7(['GarageType']))

#2.basment
#['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinSF1','BsmtFinType2','BsmtFinSF2',
#'BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath','MasVnrArea','MasVnrType']
tr2 = PipeDecorator(Transformer1('TotalBsmtSF', 0,{0 : ['BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF',
                                                        'BsmtFullBath','BsmtHalfBath'],
                                                'None' : ['BsmtQual','BsmtCond','BsmtExposure',
                                                          'BsmtFinType1','BsmtFinType2']}),
                    Transformer1('MasVnrArea',0,{'None' : ['MasVnrType']}),
                    Transformer1('MasVnrType','None',{0 : ['MasVnrArea']}),
                    Transformer8(['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2'],['MasVnrType', 'BsmtFinType1','BsmtFinType2']),
                    Transformer9('TotalBsmtSF',['BsmtFinSF1', 'BsmtFinSF2'], [1.0,1.0]),
                    Transformer9('BsmtUnfSF',['TotalBsmtSF','BsmtFinSF1', 'BsmtFinSF2'], [1.0,-1.0,-1.0]),
                    Transformer2(aMeanFlds = ['BsmtFullBath','BsmtHalfBath']),
                    Transformer5(['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2']),
                    Transformer2(aMeanFlds = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']),
                    Transformer7(['MasVnrType'])
                   )
                                   
#3.['FireplaceQu','Alley','PoolQC','Fence','MiscFeature','Utilities']
f3 = ['FireplaceQu','Alley','PoolQC','Fence','MiscFeature','Utilities']
tr3 = PipeDecorator(FillNa(['FireplaceQu','Alley','PoolQC','Fence','MiscFeature']),DropFields(['Utilities']),
                   Transformer5(['FireplaceQu','Alley','PoolQC']))

#4. ['MSZoning','Neighborhood','LotFrontage','LotConfig','LotArea']
param_grid = {'alpha': [1e0,0.5, 0.1, 1e-2, 1e-3,1e-4,1e-5,1e-6],'gamma': np.logspace(-4, 2, 14)}
tr4 = PipeDecorator(Transformer6(Ridge(),'LotFrontage',['Neighborhood','LotFrontage','LotConfig','LotArea'],QuantileTransformer()),
                    Transformer10())
#5. ['ExterQual','ExterCond','HeatingQC','LandSlope','LotShape','PavedDrive','Street','CentralAir','KitchenQual']
tr5 = PipeDecorator(Transformer5(['ExterQual','ExterCond','HeatingQC','LandSlope','LotShape',
                                 'PavedDrive','Street','CentralAir','KitchenQual']),
                    Transformer2(aMeanFlds = ['KitchenQual']))

#6. ['Exterior1st','Exterior2nd','Electrical','SaleType','Functional']
tr6 = PipeDecorator(Transformer7(['Exterior1st','Exterior2nd','Electrical','SaleType','Functional']))

#the rest fields
tr7 = PipeDecorator(TurnObjIntoNum())

trs = ColumnTransformer([("tr1", tr1, ['GarageType','GarageYrBlt','GarageFinish','GarageCars','GarageArea','GarageQual',
                                      'GarageCond','YearBuilt','YearRemodAdd','MoSold','YrSold']),
                        ("tr2",tr2,['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinSF1','BsmtFinType2',
                                    'BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath','MasVnrArea',
                                    'MasVnrType']),
                        ("tr3",tr3,['FireplaceQu','Alley','PoolQC','Fence','MiscFeature','Utilities']),
                        ("tr4",tr4,['MSZoning','Neighborhood','LotFrontage','LotConfig','LotArea']),
                        ("tr5",tr5,['ExterQual','ExterCond','HeatingQC','LandSlope','LotShape',
                                    'PavedDrive','Street','CentralAir','KitchenQual']),
                        ("tr6",tr6,['Exterior1st','Exterior2nd','Electrical','SaleType','Functional'])
                        ],remainder = tr7)
#TrData = make_pipeline(tr0,trs)
X = df.copy()
#trs = trs.fit(X)
#X = trs.transform(X)
#trs.get_feature_names()
#df = pd.DataFrame(data = X, columns = trs.get_feature_names())
#df.info()