Data preprocessing includes the following steps:


*   dropping redundant features (defined by feature selection methods)
*   filling some missing categorical values with defaults
*   filling remaining undefined categorical values with 'None' value
*   filling missing numerical values using KNN imputation
*   encoding categorical variables with the one-hot method (label encoding and ordinal encoding, where relevant, performed worse than one-hot)
*   removing outliers

In [45]:
!pip install -q fancyimpute

In [46]:
%cd '/content/drive/MyDrive/Colab Notebooks/kaggle/house_pricing'

/content/drive/MyDrive/Colab Notebooks/kaggle/house_pricing


In [47]:
from common import *
from sklearn.linear_model import Lasso
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.model_selection import GridSearchCV
from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, BiScaler
from sklearn.base import TransformerMixin
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import IsolationForest
from xgboost import XGBRegressor
from sklearn.cluster import KMeans

In [48]:
from common import *

from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import MissingIndicator
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from fancyimpute import KNN, SoftImpute, BiScaler
import scipy.stats as st

In [49]:
def preprocess(x):
    """Fill remaining missing values with the string 'None' and one-hot encode.

    Args:
        x: DataFrame to encode. It is not modified.

    Returns:
        The frame produced by ``pd.get_dummies`` on the filled copy of ``x``.
    """
    filled = x.fillna('None')
    encoded = pd.get_dummies(filled)
    return encoded

In [50]:
def create_missing_indicators(x, cols=('LotFrontage', 'MasVnrArea')):
    """Add a 0/1 ``<col>Missing`` column for every listed column containing NaN.

    The frame is modified in place and nothing is returned. Columns without
    any missing value are skipped, so no constant indicator is created.

    Args:
        x: DataFrame to extend.
        cols: Names of the columns to inspect. The default is an immutable
            tuple so it cannot be shared and mutated between calls.
    """
    for col in cols:
        # Compute the NaN mask once and reuse it for the check and the indicator.
        missing = x[col].isna()
        if not missing.any():
            continue
        x[col + 'Missing'] = missing.astype(int)

In [51]:
def impute_missing_values(x):
    """Impute the missing values of ``x`` in place.

    Steps, in order:
      * missing 'Functional' entries are set to 'Typ';
      * the 'Utilities' column is dropped;
      * every non-object column is imputed with fancyimpute's ``KNN(k=5)``;
      * remaining missing values in object columns become the string 'None'.

    Args:
        x: DataFrame containing at least the 'Functional' and 'Utilities'
            columns. It is modified in place; nothing is returned.
    """
    # Assign the result back instead of calling an inplace method on the
    # selected column: the chained inplace form is deprecated in pandas and
    # does not update ``x`` when Copy-on-Write is active.
    x['Functional'] = x['Functional'].fillna('Typ')
    x.drop('Utilities', axis=1, inplace=True)

    num_cols = x.columns[x.dtypes != object].tolist()
    cat_cols = x.columns[x.dtypes == object].tolist()

    # Other imputers that can be swapped in here: SoftImpute(verbose=0),
    # IterativeImputer(verbose=0).
    imp = KNN(k=5, verbose=0)
    x[num_cols] = imp.fit_transform(x[num_cols])
    x[cat_cols] = x[cat_cols].fillna('None')

In [52]:
def convert_types(x, cols):
    """Cast every column named in ``cols`` to string, modifying ``x`` in place.

    Args:
        x: DataFrame whose columns are converted.
        cols: Iterable of column names to convert.
    """
    for name in cols:
        as_text = x[name].astype('str')
        x[name] = as_text

In [53]:
def handle_skewness(x):
    """Log-transform, in place, every non-object column whose |skew| exceeds 1.

    Skewness is computed with ``scipy.stats.skew`` on the non-missing values
    of each column. Columns whose skewness is NaN are left untouched.

    Args:
        x: DataFrame to transform. It is modified in place; nothing is returned.
    """
    num_cols = x.columns[x.dtypes != object].to_list()
    if not num_cols:
        # Nothing to measure; avoids applying over an empty column selection.
        return

    skew_by_col = x[num_cols].apply(lambda col: st.skew(col.dropna()))
    skewed_features = skew_by_col[skew_by_col.abs() > 1].index

    for feat in skewed_features:
        # log1p already computes log(1 + v); adding another 1 beforehand would
        # shift the data twice and map 0 to log(2) instead of 0.
        x[feat] = np.log1p(x[feat])


In [54]:
def encode_ordinal(x):
    """Replace a fixed list of columns with integer label codes, in place.

    Each listed column is encoded independently with its own ``LabelEncoder``
    fitted on that column's values.

    NOTE(review): LabelEncoder numbers classes by their sorted value, not by
    any quality ranking -- confirm this is the intended order for the
    string-valued columns.

    Args:
        x: DataFrame containing all of the listed columns.
    """
    ord_cols = (
        'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond',
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'BsmtFinSF2', 'HeatingQC', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
        'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
        'Fireplaces', 'FireplaceQu', 'GarageFinish', 'GarageCars', 'GarageQual',
        'GarageCond', 'PoolQC', 'Fence', 'YearBuilt', 'YearRemodAdd',
        'GarageYrBlt', 'MoSold', 'YrSold',
    )

    for name in ord_cols:
        values = list(x[name].values)
        # Fitting and transforming the same values in one call.
        x[name] = LabelEncoder().fit_transform(values)


In [55]:
def remove_outliers(x, y):
    """Drop the rows that an IsolationForest fitted on ``x`` does not label 1.

    Args:
        x: Feature frame the forest is fitted on and filtered by.
        y: Targets aligned row-for-row with ``x``.

    Returns:
        Tuple ``(x_inliers, y_inliers)`` holding only the rows predicted as 1.
    """
    detector = IsolationForest(n_estimators=300, contamination=0.006, random_state=0)
    keep = detector.fit(x).predict(x) == 1
    return x[keep], y[keep]

---
Main code

In [56]:
# Load the splits and stack them so the indicator, imputation and encoding
# steps are applied to train and test rows together.
x_train, y_train, x_test = read_data()
x_total = pd.concat([x_train, x_test])

# Both helpers modify x_total in place.
create_missing_indicators(x_total)
impute_missing_values(x_total)

# Encode, then cut the stacked frame back at the original train length.
# The right-hand side is evaluated before x_train is rebound.
x_total_processed = preprocess(x_total)
x_train, x_test = (
    x_total_processed[:x_train.shape[0]],
    x_total_processed[x_train.shape[0]:],
)

# Outliers are removed from the training rows only.
x_train, y_train = remove_outliers(x_train, y_train)
print(x_train.shape, y_train.shape, x_test.shape)

  "X does not have valid feature names, but"


(1451, 308) (1451,) (1459, 308)


In [61]:
%%time
# Keyword arguments handed to XGBRegressor, one per line.
params = {
    'n_estimators': 3458,
    'min_child_weight': 1,
    'max_depth': 5,
    'learning_rate': 0.007,
    'lambda': 81,
    'gamma': 90,
    'colsample_bytree': 0.4,
    'subsample': 0.2,
    'alpha': 159,
    'objective': 'reg:squarederror',
}

# `model` is also read as a global by the estimate_* helpers in this notebook.
model = XGBRegressor(**params)
print(get_cv_score(x_train, y_train, model))

13531.264243976999
CPU times: user 1min 46s, sys: 431 ms, total: 1min 46s
Wall time: 1min 46s


---
#### Exploring creation of missing indicators

In [None]:
def estimate_missing_indicators(cols):
    """Score the pipeline when missing indicators are created for ``cols``.

    Re-reads the data, runs indicator creation, imputation, one-hot encoding
    and outlier removal, then scores the global ``model`` on the resulting
    training rows.

    Args:
        cols: Column names passed to ``create_missing_indicators``.

    Returns:
        Whatever ``get_score`` returns for the cleaned training data.
    """
    train_x, train_y, test_x = read_data()
    n_train = train_x.shape[0]

    stacked = pd.concat([train_x, test_x])
    create_missing_indicators(stacked, cols)
    impute_missing_values(stacked)

    # Only the training rows of the encoded frame are needed for scoring.
    encoded_train = preprocess(stacked)[:n_train]
    clean_x, clean_y = remove_outliers(encoded_train, train_y)
    return get_score(clean_x, clean_y, model)

In [None]:
# Greedy forward search: add a missing indicator for one more column at a
# time and keep the column only if the score drops below the best so far.
x_train, y_train, x_test = read_data()
x_total = pd.concat((x_train, x_test))

# numeric columns with at least one nan value
cols = x_total.columns[x_total.dtypes != object]
cols = x_total[cols].isna().sum()
cols = cols[cols > 0].index.to_list()
print('columns to explore:', cols)

suitable_cols = []
# Hard-coded baseline; presumably the value of estimate_missing_indicators([]).
best_score = 14342
print('initial best score', best_score)
for col in cols:
    print('processing column', col)
    tmp_cols = suitable_cols[:]
    tmp_cols.append(col)
    # The helper defined in this notebook is estimate_missing_indicators;
    # the previously used name estimate_indicators does not exist.
    score = estimate_missing_indicators(tmp_cols)
    print(score, tmp_cols)
    if score < best_score:
        best_score = score
        suitable_cols.append(col)
        print('best score, adding col', col)

---
#### Exploring type conversion (numeric to categorical)

In [None]:
def estimate_type_conversion(cols):
    """Score the pipeline when the columns in ``cols`` are cast to string.

    Re-reads the data, creates the default missing indicators, imputes,
    converts ``cols`` to string so they get one-hot encoded, removes outliers
    and scores the global ``model`` on the resulting training rows.

    Args:
        cols: Column names passed to ``convert_types``.

    Returns:
        Whatever ``get_score`` returns for the cleaned training data.
    """
    train_x, train_y, test_x = read_data()
    n_train = train_x.shape[0]

    stacked = pd.concat([train_x, test_x])
    create_missing_indicators(stacked)
    impute_missing_values(stacked)
    convert_types(stacked, cols)

    # Only the training rows of the encoded frame are needed for scoring.
    encoded_train = preprocess(stacked)[:n_train]
    clean_x, clean_y = remove_outliers(encoded_train, train_y)
    return get_score(clean_x, clean_y, model)

In [None]:
# Greedy forward search over numeric columns to treat as categorical.
x_train, y_train, x_test = read_data()
x_total = pd.concat([x_train, x_test])
num_cols = x_total.columns[x_total.dtypes != object].tolist()

# Candidates: non-object columns with fewer than 20 distinct values,
# counting NaN as a value.
cols = []
for col in num_cols:
    cnt = x_total[col].nunique(dropna=False)
    if cnt < 20:
        cols.append(col)

# The candidate list built above is overridden; only 'MoSold' is evaluated.
cols = ['MoSold']
suitable_cols = []
best_score = 13363
print('initial best score', best_score)
for col in cols:
    print('processing column', col)
    tmp_cols = suitable_cols + [col]
    score = estimate_type_conversion(tmp_cols)
    print(score, tmp_cols)
    if not score < best_score:
        continue
    # Keep the column only when it lowers the best score seen so far.
    best_score = score
    suitable_cols.append(col)
    print('best score, adding col', col)

---
#### Exploring categorical imputation with mode

In [None]:
def estimate_mode_imputation(cols):
    """Score the pipeline when ``cols`` are first filled with their mode.

    Re-reads the data, creates the default missing indicators, fills missing
    values of ``cols`` with the first mode of each column, then runs the
    usual imputation, encoding and outlier removal and scores the global
    ``model`` on the resulting training rows.

    Args:
        cols: List of column names to fill with their mode; may be empty.

    Returns:
        Whatever ``get_score`` returns for the cleaned training data.
    """
    train_x, train_y, test_x = read_data()
    n_train = train_x.shape[0]

    stacked = pd.concat([train_x, test_x])
    create_missing_indicators(stacked)

    if len(cols) > 0:
        # Row 0 of mode() holds the first mode of every selected column.
        first_modes = stacked[cols].mode().loc[0]
        stacked[cols] = stacked[cols].fillna(first_modes)

    impute_missing_values(stacked)

    # Only the training rows of the encoded frame are needed for scoring.
    encoded_train = preprocess(stacked)[:n_train]
    clean_x, clean_y = remove_outliers(encoded_train, train_y)
    return get_score(clean_x, clean_y, model)

In [None]:
# Evaluate mode imputation for each object-typed column on its own and
# print the column name next to its score.
x_train, y_train, x_test = read_data()
x_total = pd.concat([x_train, x_test])

cols = x_total.columns[x_total.dtypes == object].tolist()

for col in cols:
    score = estimate_mode_imputation([col])
    print(col, score)