In [1]:
import pandas as pd
import numpy as np
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, OneHotEncoder

In [11]:
# Read the training and test CSV files into two DataFrames.
train_df, test_df = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

Use IDs as index

In [12]:
# Promote the 'Id' column to the index of both frames, in place.
# verify_integrity=True makes pandas raise if any Id is duplicated.
for frame in (train_df, test_df):
    frame.set_index('Id', inplace=True, verify_integrity=True)

### Cleaning

The target variable has to be called **class** (tpot internal constraint)

In [13]:
# Relabel the target column in place; every other column keeps its name.
target_rename = {'SalePrice': 'class'}
train_df.rename(columns=target_rename, inplace=True)

#### Missing values imputation

In [14]:
# Replacement value for missing entries, keyed by column name.
# LotFrontage is filled with 0.0; the categorical columns get an explicit
# label in place of NaN.
fill_values = {
    'LotFrontage': 0.0,
    'Alley': 'No alley',
    'FireplaceQu': 'No fireplace',
    'PoolQC': 'No pool',
    'Fence': 'No fence',
    'MiscFeature': 'No extra',
    'MasVnrType': 'Unknown',
}
fill_values.update(dict.fromkeys(
    ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'],
    'No basement'))
fill_values.update(dict.fromkeys(
    ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'],
    'No garage'))

# Fill through the DataFrame itself. The previous form,
# `train_df.<column>.fillna(..., inplace=True)`, fills a Series obtained from
# the frame; pandas does not guarantee that such a fill is written back to
# the frame (it has no effect when Copy-on-Write is active).
train_df = train_df.fillna(value=fill_values)

# These three columns are removed instead of being imputed.
train_df = train_df.drop(['MasVnrArea', 'GarageYrBlt', 'MiscVal'], axis=1)

Remove the single row that has a missing value for the variable _Electrical_

In [18]:
# Index labels of the rows whose Electrical value is missing, then drop
# those rows from the training frame in place.
missing_electrical = train_df.index[train_df['Electrical'].isnull()]
train_df.drop(missing_electrical, inplace=True)

### Transform categorical variables into numerical

#### Label Binariser
Transform categorical variables into one-hot-encoded variables

In [19]:
# Categorical columns to encode with a LabelBinarizer.
lb_var_list = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 
               'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
               'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 
               'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 
               'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 
               'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']

# Dictionary keys derived from each column name:
#   'lb_<col>'        -> unfitted binarizer
#   'lb_<col>_model'  -> fitted binarizer
#   '<col>_train'     -> encoded training array
lb_transfo_list = ['lb_' + name for name in lb_var_list]
lb_model_list = ['lb_' + name + '_model' for name in lb_var_list]
lb_train_list = [name + '_train' for name in lb_var_list]

# One independent binarizer per column.
lb_transfo_dict = {key: LabelBinarizer() for key in lb_transfo_list}

# Fit each binarizer on its column, then encode that same column.
lb_model_dict = {}
lb_train_dict = {}
for name, transfo_key, model_key, train_key in zip(lb_var_list, lb_transfo_list,
                                                   lb_model_list, lb_train_list):
    fitted = lb_transfo_dict[transfo_key].fit(train_df[name])
    lb_model_dict[model_key] = fitted
    lb_train_dict[train_key] = fitted.transform(train_df[name])

In [21]:
# Number of encoded arrays stored in lb_train_dict (one per encoded column).
len(lb_train_dict)

43

In [22]:
# Shape of the encoded MSZoning array.
lb_train_dict['MSZoning_train'].shape

(1459, 5)

#### One Hot Encoder
Transform integer-coded categorical variables (`MSSubClass`, `MoSold`) into one-hot-encoded variables

In [30]:
# Integer-valued columns to one-hot encode.
ohe_var_list = ['MSSubClass', 'MoSold']

# Dictionary keys derived from each column name:
#   'ohe_<col>'        -> unfitted encoder
#   'ohe_<col>_model'  -> fitted encoder
#   '<col>_train'      -> encoded training array
ohe_transfo_list = ['ohe_' + var for var in ohe_var_list]
ohe_model_list = ['ohe_' + var + '_model' for var in ohe_var_list]
ohe_train_list = [var + '_train' for var in ohe_var_list]

# One independent encoder per column, configured to return dense arrays.
# NOTE(review): recent scikit-learn releases name this option `sparse_output`;
# confirm against the installed version.
ohe_transfo_dict = {}
for transfo in ohe_transfo_list:
    ohe_transfo_dict[transfo] = OneHotEncoder(sparse=False)

ohe_model_dict = {}
ohe_train_dict = {}
for i in range(len(ohe_var_list)):
    # The encoder is given a single-column 2-D array. A pandas Series has no
    # usable `reshape`, so reshape the underlying NumPy array instead.
    column_2d = train_df[ohe_var_list[i]].values.reshape(-1, 1)
    ohe_model_dict[ohe_model_list[i]] = ohe_transfo_dict[ohe_transfo_list[i]].fit(column_2d)
    ohe_train_dict[ohe_train_list[i]] = ohe_model_dict[ohe_model_list[i]].transform(column_2d)

In [34]:
# Number of encoded arrays stored in ohe_train_dict (one per encoded column).
len(ohe_train_dict)

2

In [33]:
# Shape of the one-hot-encoded MSSubClass array.
ohe_train_dict['MSSubClass_train'].shape

(1459, 15)


In [55]:
# Shape of the one-hot-encoded MoSold array.
ohe_train_dict['MoSold_train'].shape

(1459, 12)

#### Concatenate transformed features

In [40]:
# Untouched variables
unt_var_list = list(set(train_df.columns) - set(lb_var_list) - set(ohe_var_list) - {'class'})

In [90]:
# Untouched columns as a 2-D NumPy array. `.values` replaces
# `DataFrame.as_matrix`, which is no longer available in current pandas.
array_unt = train_df[unt_var_list].values

# Stack the encoded arrays side by side. `dict.values()` replaces
# `dict.itervalues()`, which only exists on Python 2.
array_lb = np.hstack(list(lb_train_dict.values()))
array_ohe = np.hstack(list(ohe_train_dict.values()))

In [92]:
# Join the three feature groups column-wise into one training matrix.
feature_groups = [array_unt, array_lb, array_ohe]
train_array = np.concatenate(feature_groups, axis=1)

In [93]:
# Shape of the assembled training matrix.
train_array.shape

(1459, 322)

### tpot

In [None]:
# Hold out 5% of the rows to score the pipeline that TPOT selects.
# random_state pins the split: without it the hold-out rows change on every
# run, so the printed score is not reproducible even though the TPOT search
# itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    train_array, train_df['class'], train_size=0.95, test_size=0.05, random_state=26)

# Seeded search using all available cores (n_jobs=-1); verbosity=2 prints
# progress while it runs.
tpot = TPOTRegressor(generations=100, population_size=100, verbosity=2, n_jobs=-1, random_state=26)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

# About the warnings printed during the run, see:
# https://stackoverflow.com/questions/41238769/warning-messages-when-using-python
# https://github.com/rhiever/tpot/issues/284

28 operators have been imported by TPOT.
_pre_test decorator: _generate: num_test=0 Expected n_neighbors <= n_samples,  but n_samples = 50, n_neighbors = 66
_pre_test decorator: _generate: num_test=1 __init__() got an unexpected keyword argument 'max_depth'
_pre_test decorator: _generate: num_test=0 Unsupported set of arguments: The combination of penalty='l2' and loss='epsilon_insensitive' are not supported when dual=False, Parameters: penalty='l2', loss='epsilon_insensitive', dual=False
_pre_test decorator: _generate: num_test=0 __init__() got an unexpected keyword argument 'max_depth'
_pre_test decorator: _generate: num_test=0 __init__() got an unexpected keyword argument 'max_depth'
_pre_test decorator: _generate: num_test=0 Unsupported set of arguments: The combination of penalty='l2' and loss='epsilon_insensitive' are not supported when dual=False, Parameters: penalty='l2', loss='epsilon_insensitive', dual=False
_pre_test decorator: _generate: num_test=0 X contains negative value

Optimization Progress:   0%|          | 0/10100 [00:00<?, ?pipeline/s]


_pre_test decorator: _generate: num_test=0 __init__() got an unexpected keyword argument 'max_depth'
