In [1]:
import pandas as pd
import numpy as np
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, OneHotEncoder

In [2]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

Use IDs as index

In [3]:
train_df.set_index('Id', inplace=True, verify_integrity=True)
test_df.set_index('Id', inplace=True, verify_integrity=True)

In [15]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-

In [25]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 0 to 1458
Data columns (total 80 columns):
Id               1459 non-null int64
MSSubClass       1459 non-null int64
MSZoning         1455 non-null object
LotFrontage      1232 non-null float64
LotArea          1459 non-null int64
Street           1459 non-null object
Alley            107 non-null object
LotShape         1459 non-null object
LandContour      1459 non-null object
Utilities        1457 non-null object
LotConfig        1459 non-null object
LandSlope        1459 non-null object
Neighborhood     1459 non-null object
Condition1       1459 non-null object
Condition2       1459 non-null object
BldgType         1459 non-null object
HouseStyle       1459 non-null object
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
RoofStyle        1459 non-null object
RoofMatl         1459 non-null object
Exterior1st      1458 non-

In [59]:
train_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [60]:
train_df[train_df.Electrical.isnull()]

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1380,80,RL,73.0,9735,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,5,2008,WD,Normal,167500


In [12]:
train_df.LotFrontage.describe()

count    1201.000000
mean       70.049958
std        24.284752
min        21.000000
25%        59.000000
50%        69.000000
75%        80.000000
max       313.000000
Name: LotFrontage, dtype: float64

In [26]:
train_df.loc[train_df.MasVnrArea == 0.0, ['MasVnrType', 'MasVnrArea']].head()

Unnamed: 0_level_0,MasVnrType,MasVnrArea
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,,0
4,,0
6,,0
9,,0
10,,0


In [25]:
train_df.loc[train_df.MasVnrArea.isnull(), ['MasVnrType', 'MasVnrArea']].head()

Unnamed: 0_level_0,MasVnrType,MasVnrArea
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
235,,
530,,
651,,
937,,
974,,


In [18]:
train_df.loc[train_df.PoolQC.isnull(), ['PoolQC', 'PoolArea']].head()

Unnamed: 0_level_0,PoolQC,PoolArea
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,,0
2,,0
3,,0
4,,0
5,,0


In [36]:
train_df.loc[train_df.MiscFeature.isnull(), ['MiscFeature', 'MiscValue']].head()

Unnamed: 0,MiscFeature,MiscValue
0,,
1,,
2,,
3,,
4,,


In [21]:
train_df.loc[~train_df.GarageType.isnull(), ['GarageType', 'GarageYrBlt', 'GarageSize']].head()

Unnamed: 0_level_0,GarageType,GarageYrBlt,GarageSize
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Attchd,2003,
2,Attchd,1976,
3,Attchd,2001,
4,Detchd,1998,
5,Attchd,2000,


### Cleaning

The target variable has to be called **class** (tpot internal constraint)

In [4]:
train_df.rename(columns={'SalePrice': 'class'}, inplace=True)

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-

#### Missing values imputation

From the data description file

> LotFrontage: Linear feet of street connected to property

|Stat    |Value       |
|--------|------------|
|count   | 1201.000000|
|mean    |   70.049958|
|std     |   24.284752|
|min     |   21.000000|
|25%     |   59.000000|
|50%     |   69.000000|
|75%     |   80.000000|
|max     |  313.000000|

Name: LotFrontage, dtype: float64

> Alley: Type of alley access to property

       Grvl	Gravel
       Pave	Paved
       NA 	No alley access
       
> BsmtQual: Evaluates the height of the basement

       Ex	Excellent (100+ inches)	
       Gd	Good (90-99 inches)
       TA	Typical (80-89 inches)
       Fa	Fair (70-79 inches)
       Po	Poor (<70 inches
       NA	No Basement
       
> BsmtCond: Evaluates the general condition of the basement

       Ex	Excellent
       Gd	Good
       TA	Typical - slight dampness allowed
       Fa	Fair - dampness or some cracking or settling
       Po	Poor - Severe cracking, settling, or wetness
       NA	No Basement
	
> BsmtExposure: Refers to walkout or garden level walls

       Gd	Good Exposure
       Av	Average Exposure (split levels or foyers typically score average or above)	
       Mn	Mimimum Exposure
       No	No Exposure
       NA	No Basement
	
> BsmtFinType1: Rating of basement finished area

       GLQ	Good Living Quarters
       ALQ	Average Living Quarters
       BLQ	Below Average Living Quarters	
       Rec	Average Rec Room
       LwQ	Low Quality
       Unf	Unfinshed
       NA	No Basement
       
> BsmtFinType2: Rating of basement finished area (if multiple types)

       GLQ	Good Living Quarters
       ALQ	Average Living Quarters
       BLQ	Below Average Living Quarters	
       Rec	Average Rec Room
       LwQ	Low Quality
       Unf	Unfinshed
       NA	No Basement

> FireplaceQu: Fireplace quality

       Ex	Excellent - Exceptional Masonry Fireplace
       Gd	Good - Masonry Fireplace in main level
       TA	Average - Prefabricated Fireplace in main living area or Masonry Fireplace in basement
       Fa	Fair - Prefabricated Fireplace in basement
       Po	Poor - Ben Franklin Stove
       NA	No Fireplace

> GarageType: Garage location
		
       2Types	More than one type of garage
       Attchd	Attached to home
       Basment	Basement Garage
       BuiltIn	Built-In (Garage part of house - typically has room above garage)
       CarPort	Car Port
       Detchd	Detached from home
       NA	No Garage
		
> GarageFinish: Interior finish of the garage

       Fin	Finished
       RFn	Rough Finished	
       Unf	Unfinished
       NA	No Garage
		
> GarageQual: Garage quality

       Ex	Excellent
       Gd	Good
       TA	Typical/Average
       Fa	Fair
       Po	Poor
       NA	No Garage
		
> GarageCond: Garage condition

       Ex	Excellent
       Gd	Good
       TA	Typical/Average
       Fa	Fair
       Po	Poor
       NA	No Garage
       
> PoolQC: Pool quality
		
       Ex	Excellent
       Gd	Good
       TA	Average/Typical
       Fa	Fair
       NA	No Pool
		
> Fence: Fence quality
		
       GdPrv	Good Privacy
       MnPrv	Minimum Privacy
       GdWo	Good Wood
       MnWw	Minimum Wood/Wire
       NA	No Fence
	
> MiscFeature: Miscellaneous feature not covered in other categories
		
       Elev	Elevator
       Gar2	2nd Garage (if not described in garage section)
       Othr	Other
       Shed	Shed (over 100 SF)
       TenC	Tennis Court
       NA	None

In [5]:
train_df.LotFrontage.fillna(value=0.0, inplace=True)
train_df.Alley.fillna(value='No alley', inplace=True)
train_df[['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']] = train_df[['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']].fillna(value='No basement')
train_df.FireplaceQu.fillna(value='No fireplace', inplace=True)
train_df[['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']] = train_df[['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']].fillna(value='No garage')
train_df.PoolQC.fillna(value='No pool', inplace=True)
train_df.Fence.fillna(value='No fence', inplace=True)
train_df.MiscFeature.fillna(value='No extra', inplace=True)
# train_df.MiscVal.fillna(value=0.0, inplace=True)

train_df.MasVnrType.fillna(value='Unknown', inplace=True)
train_df.drop(['MasVnrArea', 'GarageYrBlt', 'MiscVal'], axis=1, inplace=True)

In [6]:
train_df[['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 14 columns):
Alley           1460 non-null object
BsmtQual        1460 non-null object
BsmtCond        1460 non-null object
BsmtExposure    1460 non-null object
BsmtFinType1    1460 non-null object
BsmtFinType2    1460 non-null object
GarageType      1460 non-null object
GarageFinish    1460 non-null object
GarageQual      1460 non-null object
GarageCond      1460 non-null object
PoolQC          1460 non-null object
Fence           1460 non-null object
MiscFeature     1460 non-null object
MiscVal         1460 non-null int64
dtypes: int64(1), object(13)
memory usage: 171.1+ KB


In [7]:
print sum(pd.isnull(train_df.MiscVal))
print sum(pd.isnull(train_df.MiscFeature))

0
0


In [8]:
print sum(np.isnan(train_df.MiscVal))

0


In [9]:
train_df.loc[train_df.MiscFeature.isin(['No extra']) , ['MiscFeature', 'MiscValue']].head()

Unnamed: 0_level_0,MiscFeature,MiscValue
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,No extra,
2,No extra,
3,No extra,
4,No extra,
5,No extra,


In [33]:
sum(pd.isnull(train_df).any())

1

In [32]:
pd.isnull(train_df.Electrical).any()

True

In [34]:
train_df.Electrical.describe()

count      1459
unique        5
top       SBrkr
freq       1334
Name: Electrical, dtype: object

In [42]:
train_df.loc[train_df.Electrical.isnull()].index

Int64Index([1380], dtype='int64', name=u'Id')

In [44]:
pd.isnull(train_df.drop(train_df.loc[train_df.Electrical.isnull()].index, inplace=False)['Electrical'].any())

False

Remove one line with missing value for variable _Electrical_

In [6]:
train_df.drop(train_df.loc[train_df.Electrical.isnull()].index, inplace=True)

In [46]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 1 to 1460
Data columns (total 77 columns):
MSSubClass       1459 non-null int64
MSZoning         1459 non-null object
LotFrontage      1459 non-null float64
LotArea          1459 non-null int64
Street           1459 non-null object
Alley            1459 non-null object
LotShape         1459 non-null object
LandContour      1459 non-null object
Utilities        1459 non-null object
LotConfig        1459 non-null object
LandSlope        1459 non-null object
Neighborhood     1459 non-null object
Condition1       1459 non-null object
Condition2       1459 non-null object
BldgType         1459 non-null object
HouseStyle       1459 non-null object
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
RoofStyle        1459 non-null object
RoofMatl         1459 non-null object
Exterior1st      1459 non-null object
Exterior2nd      1459 no

### Transform categorical variables into numerical

#### Multi Label Binariser
Transform categorical variables into one-hot-encoded variables

In [7]:
mlb = MultiLabelBinarizer()
MSZoning_ = mlb.fit_transform([{str(val)} for val in train_df['MSZoning'].values])
print MSZoning_.shape

(1459, 5)


In [8]:
MSZoning_

array([[0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       ..., 
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0]])

# Multi label binariser should be done on each categorical variable, ie variables of type object that are not binary

#### Label Binariser

In [36]:
lb = LabelBinarizer()
Street_ = lb.fit_transform(train_df.Street)
print Street_.shape

(1459, 1)


#### One Hot Encoder
Transform numerical variables into one-hot-encoded variables

In [38]:
ohe = OneHotEncoder()
MSSubClass_ = ohe.fit_transform(train_df.MSSubClass.reshape(-1, 1))
print MSSubClass_.shape

(1459, 15)


In [51]:
OverallQual_ = ohe.fit_transform(train_df.OverallQual.reshape(-1, 1))
print OverallQual_.shape

(1459, 10)


In [52]:
OverallCond_ = ohe.fit_transform(train_df.OverallCond.reshape(-1, 1))
print OverallCond_.shape

(1459, 9)


In [54]:
# 10 is missing /!\
train_df.OverallCond.unique()

array([5, 8, 6, 7, 4, 2, 3, 9, 1])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(train_df.drop(['SalePrice'], axis=1), train_df.SalePrice,
                                                    train_size=0.95, test_size=0.05)

tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)
# print(tpot.score(X_test, y_test))

# About the warning
# https://stackoverflow.com/questions/41238769/warning-messages-when-using-python
# https://github.com/rhiever/tpot/issues/284

ValueError: could not convert string to float: Shed