In [1]:
import pandas as pd
import numpy as np
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, OneHotEncoder, Imputer

In [2]:
# Locations of the two CSV splits, relative to the notebook.
train_csv_path, test_csv_path = 'data/train.csv', 'data/test.csv'

# Load each split into its own DataFrame.
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

Use IDs as index

In [3]:
# Promote the 'Id' column to the row index of both frames, in place.
# verify_integrity asks pandas to reject an index that contains duplicates.
for frame in (train_df, test_df):
    frame.set_index('Id', inplace=True, verify_integrity=True)

### Cleaning

The target variable has to be called **class** (tpot internal constraint)

In [4]:
# Rename the target column in place: 'SalePrice' becomes 'class'.
target_rename = {'SalePrice': 'class'}
train_df.rename(columns=target_rename, inplace=True)

#### Missing values imputation

In [5]:
# Replacement used for NaN in each column.
fill_values = {
    'LotFrontage': 0.0,
    'Alley': 'No alley',
    'FireplaceQu': 'No fireplace',
    'PoolQC': 'No pool',
    'Fence': 'No fence',
    'MiscFeature': 'No extra',
    'MasVnrType': 'Unknown',
}
# All basement-related and garage-related columns share one label each.
fill_values.update(dict.fromkeys(
    ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'],
    'No basement'))
fill_values.update(dict.fromkeys(
    ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'],
    'No garage'))

# Assign the filled Series back to the frame instead of calling
# fillna(inplace=True) on `train_df.<column>`: the in-place call on a column
# accessor mutates an intermediate Series, which pandas warns about and which
# no longer updates the frame once Copy-on-Write is enabled.
for column, filler in fill_values.items():
    train_df[column] = train_df[column].fillna(value=filler)

# These three columns are removed from the feature set rather than filled.
train_df.drop(['MasVnrArea', 'GarageYrBlt', 'MiscVal'], axis=1, inplace=True)

Fill the single missing value of the variable _Electrical_ with its most frequent value

In [6]:
# Show the rows whose Electrical value is missing. The call form of print
# behaves the same for a single argument on Python 2 and also runs on
# Python 3, matching the print(...) call used in the TPOT cell.
print(train_df.loc[train_df.Electrical.isnull(), 'Electrical'])

Id
1380    NaN
Name: Electrical, dtype: object


In [7]:
# Summary of the Electrical column (count, unique, top, freq).
train_df['Electrical'].describe()

count      1459
unique        5
top       SBrkr
freq       1334
Name: Electrical, dtype: object

In [8]:
# Fill the missing Electrical entry with 'SBrkr'. The result is assigned back
# to the frame because fillna(inplace=True) on `train_df.Electrical` mutates
# an intermediate Series, which does not update the frame under pandas
# Copy-on-Write.
train_df['Electrical'] = train_df['Electrical'].fillna(value='SBrkr')

In [9]:
# Not executed: alternatives to the hard-coded fill above, left commented out.
# They either drop the row whose Electrical value is missing, or impute with
# sklearn's Imputer using the 'most_frequent' strategy.
# # train_df.drop(train_df.loc[train_df.Electrical.isnull()].index, inplace=True)
# imputer = Imputer(strategy='most_frequent', axis=0, copy=True)
# impute_model = imputer.fit(train_df)
# train_df_ = impute_model.transform(train_df)
# print train_df_.loc[train_df.Electrical.isnull(), 'Electrical']

### Transform categorical variables into numerical

#### Label Binariser
Transform categorical variables into one-hot-encoded variables

In [10]:
# Categorical columns, each encoded by its own LabelBinarizer.
lb_var_list = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
    'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
    'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType',
    'SaleCondition',
]

# Dictionary keys derived from the column names:
#   'lb_<col>'        -> unfitted binarizer
#   'lb_<col>_model'  -> fitted binarizer
#   '<col>_train'     -> encoded training matrix
lb_transfo_list = ['lb_' + name for name in lb_var_list]
lb_model_list = [key + '_model' for key in lb_transfo_list]
lb_train_list = [name + '_train' for name in lb_var_list]

# One fresh binarizer per column.
lb_transfo_dict = {key: LabelBinarizer() for key in lb_transfo_list}

# Fit each binarizer on its training column, then encode that same column.
lb_model_dict = {}
lb_train_dict = {}
for column, transfo_key, model_key, train_key in zip(
        lb_var_list, lb_transfo_list, lb_model_list, lb_train_list):
    lb_model_dict[model_key] = lb_transfo_dict[transfo_key].fit(train_df[column])
    lb_train_dict[train_key] = lb_model_dict[model_key].transform(train_df[column])

In [11]:
# Number of encoded matrices stored; one is added per entry of lb_var_list.
len(lb_train_dict)

43

In [12]:
# Spot-check the shape of one encoded matrix (rows, indicator columns).
lb_train_dict['MSZoning_train'].shape

(1460, 5)

#### One Hot Encoder
Transform categorical variables that are stored as numbers into one-hot-encoded variables

In [13]:
# Numeric columns that are one-hot encoded, each by its own OneHotEncoder.
ohe_var_list = ['MSSubClass', 'MoSold']

# Dictionary keys derived from the column names:
#   'ohe_<col>'        -> unfitted encoder
#   'ohe_<col>_model'  -> fitted encoder
#   '<col>_train'      -> encoded training matrix
ohe_transfo_list = ['ohe_' + var for var in ohe_var_list]
ohe_model_list = ['ohe_' + var + '_model' for var in ohe_var_list]
ohe_train_list = [var + '_train' for var in ohe_var_list]

# One fresh encoder per column; sparse=False makes transform return a dense
# array so it can be stacked with the other feature arrays.
ohe_transfo_dict = {key: OneHotEncoder(sparse=False) for key in ohe_transfo_list}

ohe_model_dict = {}
ohe_train_dict = {}
for column, transfo_key, model_key, train_key in zip(
        ohe_var_list, ohe_transfo_list, ohe_model_list, ohe_train_list):
    # The encoder is given a single-column 2-D array. Reshape the underlying
    # ndarray: Series.reshape is deprecated and absent from later pandas
    # releases, whereas .values.reshape works on every version.
    column_2d = train_df[column].values.reshape(-1, 1)
    ohe_model_dict[model_key] = ohe_transfo_dict[transfo_key].fit(column_2d)
    ohe_train_dict[train_key] = ohe_model_dict[model_key].transform(column_2d)

In [14]:
# Number of encoded matrices stored; one is added per entry of ohe_var_list.
len(ohe_train_dict)

2

In [15]:
# Shape of the one-hot matrix built from MSSubClass (rows, indicator columns).
ohe_train_dict['MSSubClass_train'].shape

(1460, 15)

In [16]:
# Shape of the one-hot matrix built from MoSold (rows, indicator columns).
ohe_train_dict['MoSold_train'].shape

(1460, 12)

#### Concatenate transformed features

In [17]:
# Untouched variables
unt_var_list = list(set(train_df.columns) - set(lb_var_list) - set(ohe_var_list) - {'class'})

In [18]:
# Untouched columns as a plain ndarray. Selecting the columns and taking
# .values replaces DataFrame.as_matrix, which is deprecated and absent from
# later pandas releases.
array_unt = train_df[unt_var_list].values

# Stack the encoded matrices side by side, walking the key lists so the
# column blocks follow the order of lb_var_list / ohe_var_list. Iterating the
# dicts directly (itervalues) is Python 2-only and leaves the block order up
# to dict iteration order.
array_lb = np.hstack([lb_train_dict[key] for key in lb_train_list])
array_ohe = np.hstack([ohe_train_dict[key] for key in ohe_train_list])

In [19]:
# Final design matrix: untouched columns, then label-binarised blocks, then
# one-hot blocks, joined along the column axis.
feature_blocks = [array_unt, array_lb, array_ohe]
train_array = np.concatenate(feature_blocks, axis=1)

In [20]:
# Shape of the assembled feature matrix (rows, total feature columns).
train_array.shape

(1460, 322)

### tpot

In [21]:
# One seed for both the hold-out split and the TPOT search. Without a
# random_state, train_test_split draws a different 95/5 split on every run,
# so the printed hold-out score is not comparable between runs.
RANDOM_STATE = 26

X_train, X_test, y_train, y_test = train_test_split(
    train_array, train_df['class'],
    train_size=0.95, test_size=0.05, random_state=RANDOM_STATE)

# Small search budget: 5 generations with a population of 5, on all cores.
tpot = TPOTRegressor(generations=5, population_size=5, verbosity=2, n_jobs=-1,
                     random_state=RANDOM_STATE)
tpot.fit(X_train, y_train)

# Score of the fitted pipeline on the 5% hold-out split.
print(tpot.score(X_test, y_test))

# About the warning
# https://stackoverflow.com/questions/41238769/warning-messages-when-using-python
# https://github.com/rhiever/tpot/issues/284

Optimization Progress:  33%|███▎      | 10/30 [00:28<02:17,  6.90s/pipeline]

Generation 1 - Current best internal CV score: 1017275528.85


Optimization Progress:  50%|█████     | 15/30 [00:43<01:34,  6.30s/pipeline]

Generation 2 - Current best internal CV score: 1017275528.85


Optimization Progress:  67%|██████▋   | 20/30 [00:51<00:45,  4.55s/pipeline]

Generation 3 - Current best internal CV score: 891179999.545


Optimization Progress:  83%|████████▎ | 25/30 [00:59<00:15,  3.10s/pipeline]

Generation 4 - Current best internal CV score: 891179999.545


                                                                            

Generation 5 - Current best internal CV score: 891179999.545

Best pipeline: RidgeCV(GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8, GradientBoostingRegressor__learning_rate=DEFAULT, GradientBoostingRegressor__loss=lad, GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.55, GradientBoostingRegressor__min_samples_leaf=2, GradientBoostingRegressor__min_samples_split=3, GradientBoostingRegressor__n_estimators=DEFAULT, GradientBoostingRegressor__subsample=0.75))
792976844.096




In [22]:
# Export the result of the TPOT search to the file 'tpot_exported_pipeline.py'.
tpot.export('tpot_exported_pipeline.py')

In [None]:
# Display TPOT's record of evaluated individuals. This cell has no execution
# count in the saved notebook; the output is presumably large, so consider
# showing only a summary.
tpot.evaluated_individuals_