In [53]:
import pandas as pd
import numpy as np
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, OneHotEncoder, Imputer
from sklearn.metrics import mean_squared_log_error

## read data

In [92]:
train_df = pd.read_csv('/var/data/train.csv')
test_df = pd.read_csv('/var/data/test.csv')

In [48]:
print(train_df.shape)
print(test_df.shape)

(1460, 81)
(1459, 80)


## data preprocessing

### use IDs as index

In [93]:
train_df.set_index('Id', inplace=True, verify_integrity=True)
test_df.set_index('Id', inplace=True, verify_integrity=True)

### cleaning

The target variable has to be called **class** (tpot internal constraint)

In [94]:
train_df.rename(columns={'SalePrice': 'class'}, inplace=True)

### missing values imputation

Based on [kaggle doc](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data)

In [95]:
# on train set
train_df.LotFrontage.fillna(value=0.0, inplace=True)
train_df.Alley.fillna(value='No alley', inplace=True)
train_df[['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']] = train_df[['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']].fillna(value='No basement')
train_df.FireplaceQu.fillna(value='No fireplace', inplace=True)
train_df[['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']] = train_df[['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']].fillna(value='No garage')
train_df.PoolQC.fillna(value='No pool', inplace=True)
train_df.Fence.fillna(value='No fence', inplace=True)
train_df.MiscFeature.fillna(value='No extra', inplace=True)

train_df.MasVnrType.fillna(value='Unknown', inplace=True)
train_df.drop(['MasVnrArea', 'GarageYrBlt', 'MiscVal'], axis=1, inplace=True)

Fill missing value for variable _Electrical_ with a possible value

In [8]:
print(train_df.loc[train_df.Electrical.isnull(), 'Electrical'])

Id
1380    NaN
Name: Electrical, dtype: object


In [9]:
train_df.Electrical.describe()

count      1459
unique        5
top       SBrkr
freq       1334
Name: Electrical, dtype: object

In [96]:
train_df.Electrical.fillna(value='SBrkr', inplace=True)

In [97]:
# on test set
test_df.LotFrontage.fillna(value=0.0, inplace=True)
test_df.Alley.fillna(value='No alley', inplace=True)
test_df[['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']] = test_df[['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']].fillna(value='No basement')
test_df.FireplaceQu.fillna(value='No fireplace', inplace=True)
test_df[['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']] = test_df[['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']].fillna(value='No garage')
test_df.PoolQC.fillna(value='No pool', inplace=True)
test_df.Fence.fillna(value='No fence', inplace=True)
test_df.MiscFeature.fillna(value='No extra', inplace=True)

test_df.MasVnrType.fillna(value='Unknown', inplace=True)
test_df.drop(['MasVnrArea', 'GarageYrBlt', 'MiscVal'], axis=1, inplace=True)

In [98]:
test_df.Electrical.fillna(value='SBrkr', inplace=True)

In [75]:
test_df.MSZoning.describe()

count     1455
unique       5
top         RL
freq      1114
Name: MSZoning, dtype: object

In [74]:
print(test_df.loc[test_df.MSZoning.isnull(), 'MSZoning'])

Id
1916    NaN
2217    NaN
2251    NaN
2905    NaN
Name: MSZoning, dtype: object


In [101]:
test_df.MSZoning.fillna(value='RL', inplace=True)

In [79]:
test_df.Utilities.describe()

count       1457
unique         1
top       AllPub
freq        1457
Name: Utilities, dtype: object

In [102]:
test_df.Utilities.fillna(value='AllPub', inplace=True)

In [83]:
test_df.Exterior1st.describe()

count        1458
unique         13
top       VinylSd
freq          510
Name: Exterior1st, dtype: object

In [103]:
test_df.Exterior1st.fillna(value='VinylSd', inplace=True)

In [87]:
test_df.Exterior2nd.describe()

count        1458
unique         15
top       VinylSd
freq          510
Name: Exterior2nd, dtype: object

In [104]:
test_df.Exterior2nd.fillna(value='VinylSd', inplace=True)

In [108]:
test_df.KitchenQual.describe()

count     1458
unique       4
top         TA
freq       757
Name: KitchenQual, dtype: object

In [109]:
test_df.KitchenQual.fillna(value='TA', inplace=True)

In [112]:
test_df.Functional.describe()

count     1457
unique       7
top        Typ
freq      1357
Name: Functional, dtype: object

In [114]:
test_df.Functional.fillna(value='Typ', inplace=True)

In [117]:
test_df.SaleType.describe()

count     1458
unique       9
top         WD
freq      1258
Name: SaleType, dtype: object

In [118]:
test_df.SaleType.fillna(value='WD', inplace=True)

### transform categorical variables into numerical (scikit-learn requirement)

#### Label Binariser
Transform categorical variables into one-hot-encoded variables

In [119]:
lb_var_list = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 
               'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
               'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 
               'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 
               'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 
               'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
lb_transfo_list = ['lb_' + var for var in lb_var_list]
lb_model_list = ['lb_' + var + '_model' for var in lb_var_list]
lb_train_list = [var + '_train' for var in lb_var_list]

lb_transfo_dict = {}
for transfo in lb_transfo_list:
    lb_transfo_dict[transfo] = LabelBinarizer()

lb_model_dict = {}
lb_train_dict = {}
lb_test_dict = {}
for i in range(len(lb_var_list)):
    lb_model_dict[lb_model_list[i]] = lb_transfo_dict[lb_transfo_list[i]].fit(train_df[lb_var_list[i]])
    lb_train_dict[lb_train_list[i]] = lb_model_dict[lb_model_list[i]].transform(train_df[lb_var_list[i]])
    # transform test set
    lb_test_dict[lb_train_list[i]] = lb_model_dict[lb_model_list[i]].transform(test_df[lb_var_list[i]])

In [12]:
len(lb_train_dict)

43

#### One Hot Encoder
Transform numerical variables into one-hot-encoded variables

In [120]:
ohe_var_list = ['MSSubClass', 'MoSold']
ohe_transfo_list = ['ohe_' + var for var in ohe_var_list]
ohe_model_list = ['ohe_' + var + '_model' for var in ohe_var_list]
ohe_train_list = [var + '_train' for var in ohe_var_list]

ohe_transfo_dict = {}
for transfo in ohe_transfo_list:
    ohe_transfo_dict[transfo] = OneHotEncoder(sparse=False)

ohe_model_dict = {}
ohe_train_dict = {}
ohe_test_dict = {}
for i in range(len(ohe_var_list)):
    ohe_model_dict[ohe_model_list[i]] = ohe_transfo_dict[ohe_transfo_list[i]].fit(train_df[ohe_var_list[i]].values.reshape(-1, 1))
    ohe_train_dict[ohe_train_list[i]] = ohe_model_dict[ohe_model_list[i]].transform(train_df[ohe_var_list[i]].values.reshape(-1, 1))
    # transform test set
    ohe_test_dict[ohe_train_list[i]] = ohe_model_dict[ohe_model_list[i]].transform(test_df[ohe_var_list[i]].values.reshape(-1, 1))
    

In [15]:
len(ohe_train_dict)

2

#### Concatenate transformed features into numpy array

In [61]:
# Untouched variables
unt_var_list = list(set(train_df.columns) - set(lb_var_list) - set(ohe_var_list) - {'class'})

In [62]:
array_unt = train_df.as_matrix(columns=unt_var_list)
array_lb = np.hstack(tuple([value[1] for value in lb_train_dict.items()]))
array_ohe = np.hstack(tuple([value[1] for value in ohe_train_dict.items()]))

In [63]:
train_array = np.hstack((array_unt, array_lb, array_ohe))

In [39]:
train_array.shape

(1460, 322)

In [121]:
# create test_array
array_unt = test_df.as_matrix(columns=unt_var_list)
array_lb = np.hstack(tuple([value[1] for value in lb_test_dict.items()]))
array_ohe = np.hstack(tuple([value[1] for value in ohe_test_dict.items()]))

test_array = np.hstack((array_unt, array_lb, array_ohe))

## tpot 

### customised scoring function

Kaggle requirement: Submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price.

In [65]:
def log_rmse(y_true, y_pred):
    return mean_squared_log_error(y_true, y_pred)

### fit

In [66]:
X_train, X_test, y_train, y_test = train_test_split(train_array, train_df['class'], train_size=0.95, test_size=0.05)

tpot = TPOTRegressor(generations=100, population_size=10, verbosity=2, scoring=log_rmse, n_jobs=-1, random_state=26)
tpot.fit(X_train, y_train)

# About the warning
# https://stackoverflow.com/questions/41238769/warning-messages-when-using-python
# https://github.com/rhiever/tpot/issues/284

Optimization Progress:   2%|▏         | 20/1010 [00:26<1:25:30,  5.18s/pipeline]

Generation 1 - Current best internal CV score: 0.1257970211387605


Optimization Progress:   3%|▎         | 30/1010 [01:11<1:20:36,  4.94s/pipeline]

Generation 2 - Current best internal CV score: 0.1257970211387605


Optimization Progress:   4%|▍         | 40/1010 [01:23<57:58,  3.59s/pipeline]  

Generation 3 - Current best internal CV score: 0.1272284492287556


Optimization Progress:   5%|▍         | 50/1010 [01:26<31:27,  1.97s/pipeline]

Generation 4 - Current best internal CV score: 0.16800958532208238


Optimization Progress:   6%|▌         | 60/1010 [01:28<18:12,  1.15s/pipeline]

Generation 5 - Current best internal CV score: 0.16804950561722615


Optimization Progress:   7%|▋         | 70/1010 [01:30<14:16,  1.10pipeline/s]

Generation 6 - Current best internal CV score: 0.16804950561722615


Optimization Progress:   8%|▊         | 80/1010 [01:32<10:06,  1.53pipeline/s]

Generation 7 - Current best internal CV score: 0.16805180132169442


Optimization Progress:   9%|▉         | 90/1010 [01:39<15:00,  1.02pipeline/s]

Generation 8 - Current best internal CV score: 0.17216167317413428


Optimization Progress:  10%|█         | 101/1010 [39:54<60:55:41, 241.30s/pipeline]

Generation 9 - Current best internal CV score: 0.17216167317413428


Optimization Progress:  11%|█         | 111/1010 [40:03<29:44:50, 119.12s/pipeline]

Generation 10 - Current best internal CV score: 0.17216167317413428


Optimization Progress:  12%|█▏        | 121/1010 [40:08<20:41:46, 83.81s/pipeline] 

Generation 11 - Current best internal CV score: 0.20835816422084955


Optimization Progress:  13%|█▎        | 131/1010 [40:21<7:19:46, 30.02s/pipeline] 

Generation 12 - Current best internal CV score: 0.20835816422084955


Optimization Progress:  14%|█▍        | 141/1010 [40:39<4:00:29, 16.60s/pipeline]

Generation 13 - Current best internal CV score: 0.3314262056479383


Optimization Progress:  15%|█▍        | 151/1010 [40:46<2:07:13,  8.89s/pipeline]

Generation 14 - Current best internal CV score: 0.3314262056479383


Optimization Progress:  16%|█▌        | 161/1010 [40:49<1:32:28,  6.54s/pipeline]

Generation 15 - Current best internal CV score: 0.3314262056479383


Optimization Progress:  17%|█▋        | 171/1010 [41:01<42:48,  3.06s/pipeline]  

Generation 16 - Current best internal CV score: 0.3314262056479383


Optimization Progress:  18%|█▊        | 181/1010 [41:17<1:34:19,  6.83s/pipeline]

Generation 17 - Current best internal CV score: 0.3314262056479383


Optimization Progress:  19%|█▉        | 191/1010 [41:33<1:36:48,  7.09s/pipeline]

Generation 18 - Current best internal CV score: 0.3314262056479383


Optimization Progress:  20%|█▉        | 201/1010 [41:51<1:56:11,  8.62s/pipeline]

Generation 19 - Current best internal CV score: 0.3314262056479383


Optimization Progress:  21%|██        | 211/1010 [42:31<3:35:06, 16.15s/pipeline]

Generation 20 - Current best internal CV score: 0.3314262056479383


Optimization Progress:  22%|██▏       | 221/1010 [42:44<2:37:20, 11.96s/pipeline]

Generation 21 - Current best internal CV score: 0.348724143550921


Optimization Progress:  23%|██▎       | 231/1010 [42:58<2:05:11,  9.64s/pipeline]

Generation 22 - Current best internal CV score: 0.348724143550921


Optimization Progress:  24%|██▍       | 241/1010 [43:13<1:21:45,  6.38s/pipeline]

Generation 23 - Current best internal CV score: 0.348724143550921


Optimization Progress:  25%|██▍       | 251/1010 [43:24<53:45,  4.25s/pipeline]  

Generation 24 - Current best internal CV score: 0.348724143550921


Optimization Progress:  26%|██▌       | 261/1010 [43:37<43:20,  3.47s/pipeline]  

Generation 25 - Current best internal CV score: 0.348724143550921


Optimization Progress:  27%|██▋       | 271/1010 [44:07<59:14,  4.81s/pipeline]  

Generation 26 - Current best internal CV score: 0.348724143550921


Optimization Progress:  28%|██▊       | 281/1010 [44:24<1:13:04,  6.01s/pipeline]

Generation 27 - Current best internal CV score: 0.348724143550921


Optimization Progress:  29%|██▉       | 291/1010 [45:57<6:07:44, 30.69s/pipeline]

Generation 28 - Current best internal CV score: 0.348724143550921


Optimization Progress:  30%|██▉       | 301/1010 [46:15<2:18:49, 11.75s/pipeline]

Generation 29 - Current best internal CV score: 0.35192681532719805


Optimization Progress:  31%|███       | 311/1010 [48:56<7:40:48, 39.55s/pipeline] 

Generation 30 - Current best internal CV score: 0.35192681532719805


Optimization Progress:  32%|███▏      | 321/1010 [49:02<5:24:35, 28.27s/pipeline]

Generation 31 - Current best internal CV score: 0.35192681532719805


Optimization Progress:  33%|███▎      | 331/1010 [49:13<2:49:16, 14.96s/pipeline]

Generation 32 - Current best internal CV score: 0.35192681532719805


Optimization Progress:  34%|███▍      | 341/1010 [50:23<3:17:35, 17.72s/pipeline]

Generation 33 - Current best internal CV score: 0.36301900646018603


Optimization Progress:  35%|███▍      | 351/1010 [50:32<1:37:40,  8.89s/pipeline]

Generation 34 - Current best internal CV score: 0.36301900646018603


Optimization Progress:  36%|███▌      | 361/1010 [50:39<1:19:07,  7.32s/pipeline]

Generation 35 - Current best internal CV score: 0.36301900646018603


Optimization Progress:  37%|███▋      | 371/1010 [51:01<1:14:57,  7.04s/pipeline]

Generation 36 - Current best internal CV score: 0.665858571449524


Optimization Progress:  38%|███▊      | 381/1010 [51:26<2:10:28, 12.45s/pipeline]

Generation 37 - Current best internal CV score: 0.665858571449524


Optimization Progress:  39%|███▊      | 391/1010 [51:57<1:16:43,  7.44s/pipeline]

Generation 38 - Current best internal CV score: 0.665858571449524


Optimization Progress:  40%|███▉      | 401/1010 [54:44<5:07:54, 30.34s/pipeline]

Generation 39 - Current best internal CV score: 5.523208452237935


Optimization Progress:  41%|████      | 411/1010 [57:34<7:37:23, 45.82s/pipeline] 

Generation 40 - Current best internal CV score: 5.523208452237935


Optimization Progress:  42%|████▏     | 421/1010 [57:53<4:20:57, 26.58s/pipeline]

Generation 41 - Current best internal CV score: 5.523208452237935


Optimization Progress:  43%|████▎     | 431/1010 [58:09<3:22:00, 20.93s/pipeline]

Generation 42 - Current best internal CV score: 5.619466328713469


Optimization Progress:  44%|████▎     | 441/1010 [58:30<2:37:03, 16.56s/pipeline]

Generation 43 - Current best internal CV score: 5.619466328713469


Optimization Progress:  45%|████▍     | 451/1010 [1:01:13<6:11:39, 39.89s/pipeline]

Generation 44 - Current best internal CV score: 5.619884548769443


Optimization Progress:  46%|████▌     | 461/1010 [1:01:34<5:11:08, 34.01s/pipeline]

Generation 45 - Current best internal CV score: 5.620308835534726


Optimization Progress:  47%|████▋     | 472/1010 [1:06:35<15:57:57, 106.84s/pipeline]

Generation 46 - Current best internal CV score: 5.631804670996543


Optimization Progress:  48%|████▊     | 482/1010 [1:07:09<9:11:22, 62.66s/pipeline]  

Generation 47 - Current best internal CV score: 5.657557421797219


Optimization Progress:  49%|████▊     | 492/1010 [1:07:33<3:26:18, 23.90s/pipeline]

Generation 48 - Current best internal CV score: 5.657557421797219


Optimization Progress:  50%|████▉     | 502/1010 [1:07:54<2:17:09, 16.20s/pipeline]

Generation 49 - Current best internal CV score: 5.659718348040328


Optimization Progress:  51%|█████     | 512/1010 [1:08:04<1:47:00, 12.89s/pipeline]

Generation 50 - Current best internal CV score: 5.659718348040328


Optimization Progress:  52%|█████▏    | 522/1010 [1:08:17<1:06:29,  8.18s/pipeline]

Generation 51 - Current best internal CV score: 5.659718348040328


Optimization Progress:  53%|█████▎    | 533/1010 [1:13:18<6:30:25, 49.11s/pipeline]

Generation 52 - Current best internal CV score: 5.66015092865084


Optimization Progress:  54%|█████▍    | 543/1010 [1:13:30<3:33:36, 27.44s/pipeline]

Generation 53 - Current best internal CV score: 5.66015092865084


Optimization Progress:  55%|█████▍    | 553/1010 [1:13:55<2:38:09, 20.77s/pipeline]

Generation 54 - Current best internal CV score: 5.6742636492086955


Optimization Progress:  56%|█████▌    | 563/1010 [1:14:19<1:42:09, 13.71s/pipeline]

Generation 55 - Current best internal CV score: 5.706256122575477


Optimization Progress:  57%|█████▋    | 573/1010 [1:14:46<1:18:20, 10.76s/pipeline]

Generation 56 - Current best internal CV score: 5.706547087479816


Optimization Progress:  58%|█████▊    | 583/1010 [1:15:10<1:02:45,  8.82s/pipeline]

Generation 57 - Current best internal CV score: 5.706547087479816


Optimization Progress:  59%|█████▊    | 593/1010 [1:15:40<1:04:15,  9.25s/pipeline]

Generation 58 - Current best internal CV score: 5.706547087479816


Optimization Progress:  60%|█████▉    | 604/1010 [1:20:41<10:54:27, 96.72s/pipeline]

Generation 59 - Current best internal CV score: 5.706547087479816


Optimization Progress:  61%|██████    | 614/1010 [1:21:29<4:45:08, 43.20s/pipeline] 

Generation 60 - Current best internal CV score: 5.756633982039264


Optimization Progress:  62%|██████▏   | 624/1010 [1:22:06<3:05:57, 28.91s/pipeline]

Generation 61 - Current best internal CV score: 5.756633982039264


Optimization Progress:  63%|██████▎   | 634/1010 [1:22:40<2:13:13, 21.26s/pipeline]

Generation 62 - Current best internal CV score: 5.756633982039264


Optimization Progress:  64%|██████▍   | 644/1010 [1:23:59<2:44:08, 26.91s/pipeline]

Generation 63 - Current best internal CV score: 5.756633982039264


Optimization Progress:  65%|██████▍   | 654/1010 [1:24:11<1:34:07, 15.86s/pipeline]

Generation 64 - Current best internal CV score: 5.756633982039264


Optimization Progress:  66%|██████▌   | 665/1010 [1:29:16<9:48:39, 102.37s/pipeline] 

Generation 65 - Current best internal CV score: 5.756633982039264


Optimization Progress:  67%|██████▋   | 675/1010 [1:29:35<3:38:12, 39.08s/pipeline] 

Generation 66 - Current best internal CV score: 5.756633982039264


Optimization Progress:  68%|██████▊   | 685/1010 [1:29:52<2:55:32, 32.41s/pipeline]

Generation 67 - Current best internal CV score: 5.756633982039264


Optimization Progress:  69%|██████▉   | 695/1010 [1:30:04<1:04:53, 12.36s/pipeline]

Generation 68 - Current best internal CV score: 5.756633982039264


Optimization Progress:  70%|██████▉   | 705/1010 [1:30:12<35:14,  6.93s/pipeline]  

Generation 69 - Current best internal CV score: 5.770793188846037


Optimization Progress:  71%|███████   | 715/1010 [1:30:31<50:35, 10.29s/pipeline]

Generation 70 - Current best internal CV score: 5.770793188846037


Optimization Progress:  72%|███████▏  | 725/1010 [1:31:02<47:21,  9.97s/pipeline]  

Generation 71 - Current best internal CV score: 5.770793188846037


Optimization Progress:  73%|███████▎  | 735/1010 [1:31:11<26:49,  5.85s/pipeline]

Generation 72 - Current best internal CV score: 5.770793188846037


Optimization Progress:  74%|███████▍  | 745/1010 [1:32:22<1:52:05, 25.38s/pipeline]

Generation 73 - Current best internal CV score: 5.770793188846037


Optimization Progress:  75%|███████▍  | 756/1010 [1:37:25<1:19:33, 18.79s/pipeline]

Generation 74 - Current best internal CV score: 5.770793188846037


Optimization Progress:  76%|███████▌  | 767/1010 [1:42:27<7:00:26, 103.81s/pipeline]

Generation 75 - Current best internal CV score: 5.770793188846037


Optimization Progress:  77%|███████▋  | 777/1010 [1:42:49<2:36:04, 40.19s/pipeline] 

Generation 76 - Current best internal CV score: 5.770793188846037


Optimization Progress:  78%|███████▊  | 787/1010 [1:42:56<1:15:54, 20.42s/pipeline]

Generation 77 - Current best internal CV score: 5.774892388124593


Optimization Progress:  79%|███████▉  | 797/1010 [1:43:13<47:59, 13.52s/pipeline]  

Generation 78 - Current best internal CV score: 5.774892388124593


Optimization Progress:  80%|███████▉  | 807/1010 [1:43:33<36:41, 10.85s/pipeline]

Generation 79 - Current best internal CV score: 5.774892388124593


Optimization Progress:  81%|████████  | 817/1010 [1:43:56<32:23, 10.07s/pipeline]

Generation 80 - Current best internal CV score: 5.774892388124593


Optimization Progress:  82%|████████▏ | 827/1010 [1:44:12<35:56, 11.78s/pipeline]

Generation 81 - Current best internal CV score: 5.774892388124593


Optimization Progress:  83%|████████▎ | 837/1010 [1:44:27<23:10,  8.04s/pipeline]

Generation 82 - Current best internal CV score: 5.774892388124593


Optimization Progress:  84%|████████▍ | 847/1010 [1:44:43<23:11,  8.54s/pipeline]

Generation 83 - Current best internal CV score: 5.774892388124593


Optimization Progress:  85%|████████▍ | 857/1010 [1:45:00<17:09,  6.73s/pipeline]

Generation 84 - Current best internal CV score: 5.774892388124593


Optimization Progress:  86%|████████▌ | 867/1010 [1:45:13<12:36,  5.29s/pipeline]

Generation 85 - Current best internal CV score: 5.8496480254902234


Optimization Progress:  87%|████████▋ | 877/1010 [1:45:32<12:46,  5.76s/pipeline]

Generation 86 - Current best internal CV score: 5.8496480254902234


Optimization Progress:  88%|████████▊ | 888/1010 [1:50:44<2:18:34, 68.15s/pipeline]

Generation 87 - Current best internal CV score: 5.8496480254902234


Optimization Progress:  89%|████████▉ | 899/1010 [1:55:46<4:16:09, 138.46s/pipeline]

Generation 88 - Current best internal CV score: 5.8496480254902234


Optimization Progress:  90%|█████████ | 909/1010 [1:56:02<1:25:26, 50.76s/pipeline] 

Generation 89 - Current best internal CV score: 6.055725391258284


Optimization Progress:  91%|█████████ | 919/1010 [1:56:20<57:59, 38.24s/pipeline]  

Generation 90 - Current best internal CV score: 6.055725391258284


Optimization Progress:  92%|█████████▏| 929/1010 [1:56:41<23:26, 17.37s/pipeline]

Generation 91 - Current best internal CV score: 6.055725391258284


Optimization Progress:  93%|█████████▎| 939/1010 [1:57:00<21:13, 17.94s/pipeline]

Generation 92 - Current best internal CV score: 6.055725391258284


Optimization Progress:  94%|█████████▍| 949/1010 [1:57:27<09:03,  8.91s/pipeline]

Generation 93 - Current best internal CV score: 6.055725391258284


Optimization Progress:  95%|█████████▍| 959/1010 [1:57:49<10:55, 12.86s/pipeline]

Generation 94 - Current best internal CV score: 6.055725391258284


Optimization Progress:  96%|█████████▌| 969/1010 [1:58:04<05:53,  8.63s/pipeline]

Generation 95 - Current best internal CV score: 6.055725391258284


Optimization Progress:  97%|█████████▋| 979/1010 [1:58:27<03:56,  7.63s/pipeline]

Generation 96 - Current best internal CV score: 6.177760265484106


Optimization Progress:  98%|█████████▊| 989/1010 [1:58:58<03:34, 10.24s/pipeline]

Generation 97 - Current best internal CV score: 6.177760265484106


Optimization Progress:  99%|█████████▉| 999/1010 [1:59:23<02:00, 10.96s/pipeline]

Generation 98 - Current best internal CV score: 6.289576230965715


Optimization Progress: 100%|█████████▉| 1009/1010 [2:00:00<00:11, 11.33s/pipeline]

Generation 99 - Current best internal CV score: 6.289576230965715


                                                                                  

Generation 100 - Current best internal CV score: 6.289576230965715

Best pipeline: XGBRegressor(FastICA(PCA(input_matrix, iterated_power=2, svd_solver=randomized), tol=0.45), learning_rate=0.001, max_depth=7, min_child_weight=1, n_estimators=100, nthread=1, subsample=0.05)


TPOTRegressor(config_dict={'sklearn.linear_model.ElasticNetCV': {'l1_ratio': array([ 0.  ,  0.05,  0.1 ,  0.15,  0.2 ,  0.25,  0.3 ,  0.35,  0.4 ,
        0.45,  0.5 ,  0.55,  0.6 ,  0.65,  0.7 ,  0.75,  0.8 ,  0.85,
        0.9 ,  0.95,  1.  ]), 'tol': [1e-05, 0.0001, 0.001, 0.01, 0.1]}, 'sklearn.ensemble.ExtraT....45,
        0.5 ,  0.55,  0.6 ,  0.65,  0.7 ,  0.75,  0.8 ,  0.85,  0.9 ,
        0.95,  1.  ])}}}},
       crossover_rate=0.1, cv=5, disable_update_check=False,
       early_stop=None, generations=100, max_eval_time_mins=5,
       max_time_mins=None, mutation_rate=0.9, n_jobs=4, offspring_size=10,
       periodic_checkpoint_folder=None, population_size=10,
       random_state=26, scoring=None, subsample=1.0, verbosity=2,
       warm_start=False)

In [67]:
print(tpot.score(X_test, y_test))

5.99740906298


In [42]:
tpot.export('tpot_exported_pipeline.py')

True

In [43]:
tpot.evaluated_individuals_

{'DecisionTreeRegressor(DecisionTreeRegressor(input_matrix, DecisionTreeRegressor__max_depth=3, DecisionTreeRegressor__min_samples_leaf=9, DecisionTreeRegressor__min_samples_split=13), DecisionTreeRegressor__max_depth=9, DecisionTreeRegressor__min_samples_leaf=9, DecisionTreeRegressor__min_samples_split=7)': (2,
  -1656784604.539309),
 'DecisionTreeRegressor(LinearSVR(input_matrix, LinearSVR__C=0.1, LinearSVR__dual=True, LinearSVR__epsilon=0.001, LinearSVR__loss=squared_epsilon_insensitive, LinearSVR__tol=0.01), DecisionTreeRegressor__max_depth=6, DecisionTreeRegressor__min_samples_leaf=4, DecisionTreeRegressor__min_samples_split=11)': (2,
  -1206903849.5031352),
 'DecisionTreeRegressor(LinearSVR(input_matrix, LinearSVR__C=0.5, LinearSVR__dual=True, LinearSVR__epsilon=0.001, LinearSVR__loss=squared_epsilon_insensitive, LinearSVR__tol=1e-05), DecisionTreeRegressor__max_depth=10, DecisionTreeRegressor__min_samples_leaf=16, DecisionTreeRegressor__min_samples_split=3)': (2,
  -1216302454.1

In [129]:
tpot.fitted_pipeline_

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power=2, n_components=None, random_state=None,
  svd_solver='randomized', tol=0.0, whiten=False)), ('fastica', FastICA(algorithm='parallel', fun='logcosh', fun_args=None, max_iter=200,
    n_components=None, random_state=None, tol=0.45, w_init=None,
    whiten=...inear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.05))])

### predict on test set

In [122]:
sales_pred = tpot.predict(test_array)

Imputing missing values in feature set


In [128]:
# Create the csv file for submission
sales_pred_df = pd.DataFrame({'Id': test_df.index, 'SalePrice': sales_pred})
sales_pred_df.to_csv('/var/data/tpot_sales_pred.csv', index = False)