In [2]:
library('lattice')
library('splines')
library('parallel')
library('survival')
library('ggplot2')
library('caret')
library('data.table')
library('Metrics')
library('MASS')
library('e1071')
library('kernlab')
library('gbm')
library('plyr')

In [3]:
train = fread('~/kaggle/house_prices/data/train.csv',
              colClasses=c('MiscFeature'='character','PoolQC'='character','Alley'='character'))

# Rename columns 1stFlrSF, 2ndFlrSF, and 3SsnPorch
FirstFlrSF=train$'1stFlrSF'
SecondFlrSF=train$'2ndFlrSF'
ThreeSsnPorch=train$'3SsnPorch'
new_names = names(train)[-which(names(train)=='1stFlrSF'|names(train)=='2ndFlrSF'|names(train)=='3SsnPorch')]
to_add = data.table(FirstFlrSF,SecondFlrSF,ThreeSsnPorch)
train = cbind(train[,new_names,with=FALSE],to_add)

# Transform categorical arguments KitchenQual, ExterQual, BsmtQual, GarageFinish, into numerical

# KitchenQual
nKitchenQual = numeric(length(train$KitchenQual))
nKitchenQual[train$KitchenQual=='TA']=1.0
nKitchenQual[train$KitchenQual=='Gd']=2.0
nKitchenQual[train$KitchenQual=='Ex']=3.0
train=cbind(train,nKitchenQual)

# ExterQual
nExterQual = numeric(length(train$ExterQual))
nExterQual[train$ExterQual=='TA']=1.0
nExterQual[train$ExterQual=='Gd']=2.0
nExterQual[train$ExterQual=='Ex']=3.0
train=cbind(train,nExterQual)

# BsmtQual
nBsmtQual = numeric(length(train$BsmtQual))
nBsmtQual[train$BsmtQual=='TA']=1.0
nBsmtQual[train$BsmtQual=='Gd']=2.0
nBsmtQual[train$BsmtQual=='Ex']=3.0
train=cbind(train,nBsmtQual)

# GarageFinish
nGarageFinish = numeric(length(train$GarageFinish))
nGarageFinish[train$GarageFinish=='Unf']=1.0
nGarageFinish[train$GarageFinish=='RFn']=2.0
nGarageFinish[train$GarageFinish=='Fin']=3.0
train=cbind(train,nGarageFinish)

# Full and half bathrooms
train$Bath = train$FullBath + train$HalfBath
train$BsmtBaths = train$BsmtFullBath + train$BsmtHalfBath

# TotalBsmtSF_on_GRLivArea (for SVR)
train$TotalBsmtSF_on_GrLivArea = train$TotalBsmtSF/train$GrLivArea

# MSSubClassCat
train$MSSubClassCat = train[,.(MSSubClassCat=sapply(MSSubClass,toString)),with=TRUE]

In [4]:
# load test file

test = fread('~/kaggle/house_prices/data/test.csv',
              colClasses=c('MiscFeature'='character','PoolQC'='character','Alley'='character'))

# Il faut renommer les colonnes 1stFlrSF, 2ndFlrSF, et 3SsnPorch pour pas avoir d'emmerdes
FirstFlrSF=test$'1stFlrSF'
SecondFlrSF=test$'2ndFlrSF'
ThreeSsnPorch=test$'3SsnPorch'
new_names = names(test)[-which(names(test)=='1stFlrSF'|names(test)=='2ndFlrSF'|names(test)=='3SsnPorch')]
to_add = data.table(FirstFlrSF,SecondFlrSF,ThreeSsnPorch)
test = cbind(test[,new_names,with=FALSE],to_add)

# Transform categorical arguments KitchenQual, ExterQual, BsmtQual, GarageFinish, into numerical

# KitchenQual
nKitchenQual = numeric(length(test$KitchenQual))
nKitchenQual[test$KitchenQual=='TA']=1.0
nKitchenQual[test$KitchenQual=='Gd']=2.0
nKitchenQual[test$KitchenQual=='Ex']=3.0
test=cbind(test,nKitchenQual)

# ExterQual
nExterQual = numeric(length(test$ExterQual))
nExterQual[test$ExterQual=='TA']=1.0
nExterQual[test$ExterQual=='Gd']=2.0
nExterQual[test$ExterQual=='Ex']=3.0
test=cbind(test,nExterQual)

# BsmtQual
nBsmtQual = numeric(length(test$BsmtQual))
nBsmtQual[test$BsmtQual=='TA']=1.0
nBsmtQual[test$BsmtQual=='Gd']=2.0
nBsmtQual[test$BsmtQual=='Ex']=3.0
test=cbind(test,nBsmtQual)

# GarageFinish
nGarageFinish = numeric(length(test$GarageFinish))
nGarageFinish[test$GarageFinish=='Unf']=1.0
nGarageFinish[test$GarageFinish=='RFn']=2.0
nGarageFinish[test$GarageFinish=='Fin']=3.0
test=cbind(test,nGarageFinish)

# Full and half bathrooms
test$Bath = test$FullBath + test$HalfBath
test$BsmtBaths = test$BsmtFullBath + test$BsmtHalfBath

# TotalBsmtSF_on_GrLivArea
test$TotalBsmtSF_on_GrLivArea = test$TotalBsmtSF/test$GrLivArea

# MSSubClassCat
test$MSSubClassCat = test[,.(MSSubClassCat=sapply(MSSubClass,toString)),with=TRUE]


In [5]:
# Deal with missing values
LotFrontage_mean = round(mean(train$LotFrontage,na.rm=TRUE))
train[which(is.na(LotFrontage)),'LotFrontage'] <- LotFrontage_mean
train=cbind(train,"IsGarage"=1+numeric(nrow(train)))
train[which(is.na(GarageYrBlt)),'GarageYrBlt'] <- 1900
#train[which(is.na(GarageQual)),'IsGarage'] <- 0
train[which(is.na(MasVnrArea)),'MasVnrArea'] <- 0
train[which(is.na(BsmtCond)),'BsmtCond'] <- 'MISSING'
train[which(is.na(BsmtFinType1)),'BsmtFinType1'] <- 'MISSING'
train[which(is.na(BsmtFinType2)),'BsmtFinType2'] <- 'MISSING'
train[which(is.na(BsmtFinSF1)),'BsmtFinSF1'] <- 0
train[which(is.na(BsmtFinSF2)),'BsmtFinSF2'] <- 0
train[which(is.na(TotalBsmtSF)),'TotalBsmtSF'] <- 0
train[which(is.na(GarageCars)),'GarageCars'] <- 0
train[which(is.na(GarageArea)),'GarageArea'] <- 0
train[which(is.na(BsmtUnfSF)),'BsmtUnfSF'] <- 0
train[which(is.na(BsmtFullBath)),'BsmtFullBath'] <- 0
train[which(is.na(BsmtHalfBath)),'BsmtHalfBath'] <- 0
train[which(is.na(MSZoning)),'MSZoning'] <- 'RL'
train[which(is.na(SaleType)),'SaleType'] <- 'Oth'
train[which(is.na(Exterior1st)),'Exterior1st'] <- 'Other'
train[which(is.na(Exterior2nd)),'Exterior2nd'] <- 'Other'
train[which(is.na(Functional)),'Functional'] <- 'Typ'

# create svr train set
kept_features_svr = c("LotArea","OverallQual","YearBuilt","YearRemodAdd","nKitchenQual","nExterQual",
                    "nBsmtQual","GrLivArea","Bath","nGarageFinish",
                    "BsmtFinSF1","GarageCars","TotalBsmtSF","KitchenAbvGr","BedroomAbvGr","TotRmsAbvGrd",
                    "OverallCond","TotalBsmtSF_on_GrLivArea")
train.kept_svr = train[,c(kept_features_svr,"SalePrice"),with=FALSE]


# create gbm train set
kept_num_features_gbm = c("LotFrontage", "LotArea", "OverallQual", "OverallCond",  
                        "YearBuilt", "YearRemodAdd", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
                        "TotalBsmtSF", "FirstFlrSF", "SecondFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath",
                        "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd",
                        "Fireplaces", "GarageYrBlt", "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF",
                        "EnclosedPorch", "ThreeSsnPorch", "ScreenPorch", "PoolArea", "MiscVal", "MoSold",
                        "YrSold","SalePrice")
kept_cat_features_gbm = c("Neighborhood","ExterQual","HeatingQC","CentralAir","KitchenQual","SaleType",
                  "SaleCondition","IsGarage")
kept_features_gbm = c(kept_num_features_gbm,kept_cat_features_gbm)
# Separate numeric and categorical features for conversion (as numeric and factor)
train_gbm.sample.num_features = train[,kept_num_features_gbm,with=FALSE]
train_gbm.sample.cat_features = train[,kept_cat_features_gbm,with=FALSE]
# Change class of data and merge back numeric and categorical
train_gbm.sample.num_features.toFit = train_gbm.sample.num_features[,lapply(.SD,as.numeric)]
train_gbm.sample.cat_features.toFit = train_gbm.sample.cat_features[,lapply(.SD,as.factor)]
train.kept_gbm = cbind(train_gbm.sample.num_features.toFit,train_gbm.sample.cat_features.toFit)



In [6]:
# Deal with missing values
LotFrontage_mean = round(mean(test$LotFrontage,na.rm=TRUE))
test[which(is.na(LotFrontage)),'LotFrontage'] <- LotFrontage_mean
test=cbind(test,"IsGarage"=1+numeric(nrow(test)))
test[which(is.na(GarageYrBlt)),'GarageYrBlt'] <- 1900
#test[which(is.na(GarageQual)),'IsGarage'] <- 0
test[which(is.na(MasVnrArea)),'MasVnrArea'] <- 0
test[which(is.na(BsmtCond)),'BsmtCond'] <- 'MISSING'
test[which(is.na(BsmtFinType1)),'BsmtFinType1'] <- 'MISSING'
test[which(is.na(BsmtFinType2)),'BsmtFinType2'] <- 'MISSING'
test[which(is.na(BsmtFinSF1)),'BsmtFinSF1'] <- 0
test[which(is.na(BsmtFinSF2)),'BsmtFinSF2'] <- 0
test[which(is.na(TotalBsmtSF)),'TotalBsmtSF'] <- 0
test[which(is.na(GarageCars)),'GarageCars'] <- 0
test[which(is.na(GarageArea)),'GarageArea'] <- 0
test[which(is.na(BsmtUnfSF)),'BsmtUnfSF'] <- 0
test[which(is.na(BsmtFullBath)),'BsmtFullBath'] <- 0
test[which(is.na(BsmtHalfBath)),'BsmtHalfBath'] <- 0
test[which(is.na(MSZoning)),'MSZoning'] <- 'RL'
test[which(is.na(SaleType)),'SaleType'] <- 'Oth'
test[which(is.na(Exterior1st)),'Exterior1st'] <- 'Other'
test[which(is.na(Exterior2nd)),'Exterior2nd'] <- 'Other'
test[which(is.na(Functional)),'Functional'] <- 'Typ'
test$BsmtBaths = test$BsmtFullBath + test$BsmtHalfBath
test$TotalBsmtSF_on_GrLivArea = test$TotalBsmtSF/test$GrLivArea

# create svr test set
kept_features_svr = c("LotArea","OverallQual","YearBuilt","YearRemodAdd","nKitchenQual","nExterQual",
                   "nBsmtQual","GrLivArea","Bath","nGarageFinish",
                   "BsmtFinSF1","GarageCars","TotalBsmtSF","KitchenAbvGr","BedroomAbvGr","TotRmsAbvGrd","OverallCond",
                   "TotalBsmtSF_on_GrLivArea")
test.kept_svr = test[,c(kept_features_svr),with=FALSE]


# create gbm test set
kept_num_features_gbm = c("LotFrontage", "LotArea", "OverallQual", "OverallCond",  
                        "YearBuilt", "YearRemodAdd", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
                        "TotalBsmtSF", "FirstFlrSF", "SecondFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath",
                        "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd",
                        "Fireplaces", "GarageYrBlt", "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF",
                        "EnclosedPorch", "ThreeSsnPorch", "ScreenPorch", "PoolArea", "MiscVal", "MoSold", "YrSold")
kept_cat_features_gbm = c("Neighborhood","ExterQual","HeatingQC","CentralAir","KitchenQual","SaleType",
                  "SaleCondition","IsGarage")
kept_features_gbm = c(kept_num_features_gbm,kept_cat_features_gbm)
# Separate numeric and categorical features for conversion (as numeric and factor)
test_gbm.sample.num_features = test[,kept_num_features_gbm,with=FALSE]
test_gbm.sample.cat_features = test[,kept_cat_features_gbm,with=FALSE]
# Change class of data and merge back numeric and categorical
test_gbm.sample.num_features.toPredict = test_gbm.sample.num_features[,lapply(.SD,as.numeric)]
test_gbm.sample.cat_features.toPredict = test_gbm.sample.cat_features[,lapply(.SD,as.factor)]
test.kept_gbm = cbind(test_gbm.sample.num_features.toPredict,test_gbm.sample.cat_features.toPredict)



In [7]:
# separate train set in k fols
k=5

set.seed(12)
folds=createFolds(train$SalePrice, k = k, list = TRUE)


list_rmsle = list()

In [8]:
# for each fold i, do: {fit a SVR on the k-1 other folds then predict values for the fold i}

bootControl <- trainControl(number = 1, verboseIter=TRUE)
tuneGrid = expand.grid(C=c(1.25),sigma=c(0.015)) # mandatory

svr.train.predicted = data.table(SalePrice=numeric(nrow(train.kept_svr)))

for (i in 1:k){ 
    # Remarque: en fait on n'a même pas besoin de stocker les modèles entraînés, car seules les 
    # prédictions nous intéressent.
    
    #train_fold_i = Reduce(c,folds[-i])
    #to_predict_fold_i = folds[[i]]
    train_i.sample = train.kept_svr[-folds[[i]],-"SalePrice",with=FALSE]
    train_i.target = train.kept_svr[-folds[[i]],.(SalePrice=as.numeric(SalePrice))]

    svrFit_i = train(x=train_i.sample,y=train_i.target$SalePrice,method='svmRadial',
                          trControl=bootControl, tuneGrid=tuneGrid, preProcess=c("center","scale"))
    
    # predict remaining fold
    
    svr.train.predicted[folds[[i]],"SalePrice"] = predict(svrFit_i$finalModel,
                            newdata=data.table(scale(train.kept_svr[folds[[i]],-"SalePrice",with=FALSE])))
    
    
}

list_rmsle["lvl1_svr"] = rmsle(svr.train.predicted$SalePrice,train$SalePrice)


+ Resample1: C=1.25, sigma=0.015 
- Resample1: C=1.25, sigma=0.015 
Aggregating results
Fitting final model on full training set
+ Resample1: C=1.25, sigma=0.015 
- Resample1: C=1.25, sigma=0.015 
Aggregating results
Fitting final model on full training set
+ Resample1: C=1.25, sigma=0.015 
- Resample1: C=1.25, sigma=0.015 
Aggregating results
Fitting final model on full training set
+ Resample1: C=1.25, sigma=0.015 
- Resample1: C=1.25, sigma=0.015 
Aggregating results
Fitting final model on full training set
+ Resample1: C=1.25, sigma=0.015 
- Resample1: C=1.25, sigma=0.015 
Aggregating results
Fitting final model on full training set


In [9]:
# for each fold i, do: {fit a gbm model on the k-1 other folds then predict values for the fold i}

bootControl <- trainControl(number = 1, verboseIter=TRUE)
gbmGrid = expand.grid(interaction.depth = 4,n.trees = c(1950),shrinkage=c(.03),
                      n.minobsinnode=10)

gbm.train.predicted = data.table(SalePrice=numeric(nrow(train.kept_gbm)))

#gbmFit_i <- 0
#gc(verbose=TRUE)

for (i in 1:k){ 
  
    train_i.sample = train.kept_gbm[-folds[[i]],-"SalePrice",with=FALSE]
    train_i.target = train.kept_gbm$SalePrice[-folds[[i]]]

    gbmFit_i <- train(train_i.sample,train_i.target,method='gbm',trControl=bootControl,verbose=TRUE,
               bag.fraction=.7,tuneGrid=gbmGrid,metric='RMSE')
    
    # predict remaining fold
    gbm.train.predicted[folds[[i]],"SalePrice"] = predict(gbmFit_i$finalModel,
                            newdata=train.kept_gbm[folds[[i]],-"SalePrice",with=FALSE],n.trees=1950)
    
    #gbmFit_i <- 0
    #gc(verbose=TRUE) 
}

list_rmsle["lvl1_gbm"] = rmsle(gbm.train.predicted$SalePrice,train$SalePrice)


+ Resample1: interaction.depth=4, n.trees=1950, shrinkage=0.03, n.minobsinnode=10 


“variable 42: IsGarage has no variation.”

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1 5810336449.2092            -nan     0.0300 257851114.3712
     2 5569646823.3623            -nan     0.0300 200148686.9633
     3 5344721392.8905            -nan     0.0300 186292525.0289
     4 5133691894.1895            -nan     0.0300 199805173.4710
     5 4932255825.4953            -nan     0.0300 199498849.2568
     6 4748146794.5779            -nan     0.0300 180688960.6411
     7 4576707695.4708            -nan     0.0300 213701400.8039
     8 4405643900.3019            -nan     0.0300 172639158.5294
     9 4238858376.5337            -nan     0.0300 139763627.3811
    10 4084053716.9047            -nan     0.0300 140306152.9081
    20 2887697992.3270            -nan     0.0300 83001774.7788
    40 1602556545.6367            -nan     0.0300 37350682.4933
    60 1027493618.4318            -nan     0.0300 4877610.9740
    80 745486910.8991            -nan     0.0300 7137798.1658
   100 586312167.6229            -nan  

“variable 42: IsGarage has no variation.”

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1 6103901876.9887            -nan     0.0300 203078610.1129
     2 5865061428.4061            -nan     0.0300 243872197.4312
     3 5634407573.0043            -nan     0.0300 237421027.5996
     4 5429560640.5071            -nan     0.0300 230948857.8833
     5 5223681777.2208            -nan     0.0300 214029970.1356
     6 5030292385.8148            -nan     0.0300 207770219.8860
     7 4841872168.0254            -nan     0.0300 168368248.8247
     8 4665019722.4218            -nan     0.0300 152118062.7142
     9 4485746185.8451            -nan     0.0300 173977170.0109
    10 4331180637.4336            -nan     0.0300 162586315.3382
    20 3086187879.0506            -nan     0.0300 85139806.3870
    40 1716315039.0322            -nan     0.0300 37990101.2571
    60 1116984505.9757            -nan     0.0300 18064448.1475
    80 823759314.8256            -nan     0.0300 5503358.2032
   100 663766912.9646            -nan 

“variable 42: IsGarage has no variation.”

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1 6292144144.5529            -nan     0.0300 196323013.8095
     2 6035522469.3094            -nan     0.0300 247003629.2903
     3 5801647422.7586            -nan     0.0300 242615831.1801
     4 5578806036.9292            -nan     0.0300 189242322.7584
     5 5365821702.0723            -nan     0.0300 213670903.6558
     6 5168092003.9208            -nan     0.0300 203041682.0724
     7 4970789353.7252            -nan     0.0300 172928123.8769
     8 4788178709.0171            -nan     0.0300 149414665.6485
     9 4615241098.0606            -nan     0.0300 164447914.1675
    10 4444042556.1501            -nan     0.0300 193791223.9091
    20 3154182817.1368            -nan     0.0300 100687708.7672
    40 1727651508.1978            -nan     0.0300 40382473.9702
    60 1091219003.5302            -nan     0.0300 12009787.3343
    80 776939890.6728            -nan     0.0300 8684694.5610
   100 608551975.0688            -nan

“variable 42: IsGarage has no variation.”

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1 6192957329.3012            -nan     0.0300 189320009.3373
     2 5941209464.2482            -nan     0.0300 191728823.1172
     3 5709706180.9086            -nan     0.0300 211312348.6733
     4 5499605685.9346            -nan     0.0300 214319804.1542
     5 5295942254.9750            -nan     0.0300 179912512.2122
     6 5093232539.5621            -nan     0.0300 200039100.8187
     7 4903646041.6073            -nan     0.0300 211905442.6501
     8 4724314919.4308            -nan     0.0300 167531713.6349
     9 4559798571.0650            -nan     0.0300 181270772.8248
    10 4401132470.6015            -nan     0.0300 169221076.3057
    20 3154562947.6516            -nan     0.0300 86673621.9160
    40 1807170145.9119            -nan     0.0300 34827947.1227
    60 1192641965.2149            -nan     0.0300 19029357.7032
    80 894526913.4834            -nan     0.0300 8006273.2419
   100 730406804.1842            -nan 

“variable 42: IsGarage has no variation.”

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1 6693721385.1780            -nan     0.0300 291523422.0827
     2 6437958499.5407            -nan     0.0300 275982704.7210
     3 6171697528.6443            -nan     0.0300 212581594.9962
     4 5935929971.2843            -nan     0.0300 258216031.6086
     5 5704620369.1666            -nan     0.0300 267788562.7648
     6 5466339511.6726            -nan     0.0300 204837983.0177
     7 5251159588.6067            -nan     0.0300 166960164.0918
     8 5055427039.7782            -nan     0.0300 216457797.8744
     9 4857451171.6907            -nan     0.0300 137876032.7457
    10 4676589537.6539            -nan     0.0300 187174625.6336
    20 3248233625.0414            -nan     0.0300 126024838.5921
    40 1700294028.6668            -nan     0.0300 39054751.7533
    60 1007038942.7441            -nan     0.0300 21590489.0503
    80 675347636.1879            -nan     0.0300 10309199.7392
   100 503940429.3206            -na

“variable 42: IsGarage has no variation.”

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1 5974718393.1537            -nan     0.0300 181726370.2451
     2 5740976056.0238            -nan     0.0300 233148506.0861
     3 5526573029.4076            -nan     0.0300 252042900.8609
     4 5314216098.2658            -nan     0.0300 183131461.3692
     5 5108617605.6786            -nan     0.0300 168113535.9328
     6 4907783119.5034            -nan     0.0300 176088811.0325
     7 4715024671.9258            -nan     0.0300 206149700.4770
     8 4546879043.7598            -nan     0.0300 156667888.7135
     9 4382865497.4281            -nan     0.0300 165796549.3979
    10 4213579982.1676            -nan     0.0300 162735555.3459
    20 2963167206.3142            -nan     0.0300 91087592.4987
    40 1636915710.0997            -nan     0.0300 31695094.3646
    60 1025937078.3887            -nan     0.0300 15837569.5009
    80 737373381.3678            -nan     0.0300 7334063.9762
   100 574250588.3723            -nan 

“variable 42: IsGarage has no variation.”

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1 6127669306.7233            -nan     0.0300 270658316.9586
     2 5884037981.0236            -nan     0.0300 221272726.0713
     3 5643682446.9613            -nan     0.0300 262283667.1650
     4 5425391029.6373            -nan     0.0300 209702079.9108
     5 5211631530.3306            -nan     0.0300 186066150.2362
     6 5013487051.3070            -nan     0.0300 204187846.9399
     7 4836402336.8837            -nan     0.0300 207745810.4961
     8 4662172836.9280            -nan     0.0300 173417664.3879
     9 4485313824.3496            -nan     0.0300 154177204.6590
    10 4331864732.8186            -nan     0.0300 161560623.7689
    20 3060180438.5037            -nan     0.0300 107369684.1441
    40 1731332573.4229            -nan     0.0300 39045300.6297
    60 1132731358.3587            -nan     0.0300 19297619.9665
    80 841373859.9938            -nan     0.0300 9859685.7046
   100 679532947.7868            -nan

“variable 42: IsGarage has no variation.”

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1 5763154992.7380            -nan     0.0300 233593763.8262
     2 5538451528.0580            -nan     0.0300 220683989.0378
     3 5332848899.5911            -nan     0.0300 198684349.4863
     4 5130957586.4232            -nan     0.0300 197770659.0149
     5 4948600050.4678            -nan     0.0300 180237561.3619
     6 4767139448.7794            -nan     0.0300 152664873.0885
     7 4590427419.1371            -nan     0.0300 141022103.5540
     8 4421816395.8128            -nan     0.0300 142009740.1900
     9 4265047428.3487            -nan     0.0300 110864543.6615
    10 4125495699.5989            -nan     0.0300 167870301.8548
    20 2961050761.1038            -nan     0.0300 79397102.4276
    40 1734071047.9861            -nan     0.0300 40261086.5927
    60 1156145876.3107            -nan     0.0300 12861909.7359
    80 868705079.9228            -nan     0.0300 8917346.7499
   100 712498557.0093            -nan 

“variable 42: IsGarage has no variation.”

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1 6098938673.3796            -nan     0.0300 235495692.4617
     2 5864442581.1984            -nan     0.0300 239342965.4288
     3 5637040191.4262            -nan     0.0300 222329408.1618
     4 5426910268.6806            -nan     0.0300 189811493.7211
     5 5205885679.4004            -nan     0.0300 205497805.4317
     6 5016786428.5105            -nan     0.0300 155743389.3832
     7 4834924646.4785            -nan     0.0300 185069601.5773
     8 4662906611.2384            -nan     0.0300 158948528.3626
     9 4493554124.9903            -nan     0.0300 187867374.8845
    10 4338694534.5371            -nan     0.0300 149978283.5465
    20 3086849807.0742            -nan     0.0300 82443577.6396
    40 1726996851.8405            -nan     0.0300 21798517.4909
    60 1103847516.6831            -nan     0.0300 14968828.2599
    80 804189438.3004            -nan     0.0300 7303677.2693
   100 640381545.2050            -nan 

“variable 42: IsGarage has no variation.”

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1 6247721511.3842            -nan     0.0300 256320305.9298
     2 6009888372.8056            -nan     0.0300 248718294.8183
     3 5779242585.8929            -nan     0.0300 238282394.3844
     4 5575011214.2241            -nan     0.0300 258611085.4690
     5 5360324618.8467            -nan     0.0300 214612283.5449
     6 5161704513.9616            -nan     0.0300 180378746.4400
     7 4972900114.2771            -nan     0.0300 181319056.6627
     8 4800982786.4093            -nan     0.0300 170554913.7595
     9 4634227972.2618            -nan     0.0300 132920862.2275
    10 4471894691.9773            -nan     0.0300 146319256.0598
    20 3199822172.3936            -nan     0.0300 76064757.9749
    40 1836583988.5393            -nan     0.0300 42609063.6158
    60 1199911210.2884            -nan     0.0300 11923226.3141
    80 896008540.4893            -nan     0.0300 7697892.5615
   100 722677036.9332            -nan 

In [22]:
# create train set for lvl2 lm

train_2.sample = data.table(preds_svr=svr.train.predicted$SalePrice,
                            preds_gbm=gbm.train.predicted$SalePrice,SalePrice=train$SalePrice)

set.seed(10)
train_a_part = createDataPartition(train$SalePrice,p=.80,list=FALSE)

In [23]:
head(train_2.sample)

Unnamed: 0,preds_svr,preds_gbm,SalePrice
1,214535.3,202971.9,208500.0
2,179338.2,183930.8,181500.0
3,224137.5,213603.6,223500.0
4,156926.8,187713.0,140000.0
5,305793.0,318363.5,250000.0
6,162479.8,150652.4,143000.0


In [24]:
# fit lvl2 lm

# grid search result: n.trees = 205, interaction.depth = 1, shrinkage = 0.05, n.minobsinnode = 12
# for the following search:
#gbmGrid <- expand.grid(interaction.depth = (1:3),n.trees = (30:50)*5, 
#                       shrinkage = c(.02,.03,.04,.05,.06,.07,.08),n.minobsinnode = (5:20))

# ----------------------------------
## checking rmsle score on a fold

#bootControl <- trainControl(number = 10, verboseIter=TRUE)
#lmFit = train(SalePrice ~ preds_svr+preds_gbm-1,method='lm',data=train_2.sample,
#              trControl=bootControl,metric="RMSE")
#lmFit.predict_b = predict(lmFit$finalModel,newdata=train_2.sample[-train_a_part])
#
#list_rmsle["lvl2_gbm"] = rmsle(train$SalePrice[-train_a_part],lmFit.predict_b)

# -----
# OR
# ----- 

# ----------------------------------
# training on full train set for predictions

#bootControl <- trainControl(number = 10, verboseIter=TRUE)
#lmFit = train(SalePrice ~ 0 + preds_svr + preds_gbm, method='lm',data=train_2.sample,
#              trControl=bootControl,metric="RMSE")

lmFit = lm(SalePrice ~ 0 + preds_svr + preds_gbm, data=train_2.sample)

+ Resample01: intercept=TRUE 
- Resample01: intercept=TRUE 
+ Resample02: intercept=TRUE 
- Resample02: intercept=TRUE 
+ Resample03: intercept=TRUE 
- Resample03: intercept=TRUE 
+ Resample04: intercept=TRUE 
- Resample04: intercept=TRUE 
+ Resample05: intercept=TRUE 
- Resample05: intercept=TRUE 
+ Resample06: intercept=TRUE 
- Resample06: intercept=TRUE 
+ Resample07: intercept=TRUE 
- Resample07: intercept=TRUE 
+ Resample08: intercept=TRUE 
- Resample08: intercept=TRUE 
+ Resample09: intercept=TRUE 
- Resample09: intercept=TRUE 
+ Resample10: intercept=TRUE 
- Resample10: intercept=TRUE 
Aggregating results
Fitting final model on full training set


In [15]:
# fit lvl1 models on full train set, and predict test set SalePrices


# -----------------------
# SVR

# train
bootControl <- trainControl(number = 1, verboseIter=TRUE)
tuneGrid = expand.grid(C=c(1.25),sigma=c(0.015)) # mandatory

train_svr.sample = train.kept_svr[,-"SalePrice",with=FALSE]
train_svr.target = train.kept_svr[,.(SalePrice=as.numeric(SalePrice))]

svrFit = train(x=train_svr.sample,y=train_svr.target$SalePrice,method='svmRadial',
                          trControl=bootControl, tuneGrid=tuneGrid, preProcess=c("center","scale"))
    
# predict
svr.test.preds = predict(svrFit$finalModel,
                        newdata=data.table(scale(test.kept_svr)))
    
    
# -----------------------
# gbm

# train
bootControl <- trainControl(number = 1, verboseIter=TRUE)
gbmGrid = expand.grid(interaction.depth=4,n.trees=c(1950),shrinkage=c(.03),n.minobsinnode=10)
  
train_gbm.sample = train.kept_gbm[,-"SalePrice",with=FALSE]
train_gbm.target = train.kept_gbm$SalePrice

gbmFit <- train(train_gbm.sample,train_gbm.target,method='gbm',trControl=bootControl,verbose=TRUE,
               bag.fraction=.7,tuneGrid=gbmGrid,metric='RMSE')
    
# predict
gbm.test.preds = predict(gbmFit$finalModel,
                            newdata=test.kept_gbm,n.trees=1950)


# -----------------------
# create train set for lvl2 gbm
test_2.sample = data.table(preds_svr=svr.test.preds,preds_gbm=gbm.test.preds)
setnames(test_2.sample,c("preds_svr","preds_gbm"))

+ Resample1: C=1.25, sigma=0.015 
- Resample1: C=1.25, sigma=0.015 
Aggregating results
Fitting final model on full training set


In [None]:
# predict with lvl2 model and write submission file

lmFit.test.preds = predict(lmFit,newdata=test_2.sample)

test.sample_submission = fread('~/kaggle/house_prices/data/sample_submission.csv')
test.sample_submission = test.sample_submission[,.(Id)]
test.sample_submission.new = cbind(test.sample_submission,SalePrice=lmFit.test.preds)

setnames(test.sample_submission.new,c("Id","SalePrice"))
write.csv(test.sample_submission.new,'~/kaggle/house_prices/data/linear_stacked_with_cst.csv',row.names=FALSE)

# leaderboard score 0.12677 with cte
# leaderboard score without cte: ??

In [None]:
# check lvl1 gbm leaderboard score

#test.sample_submission = fread('~/kaggle/house_prices/data/sample_submission.csv')
#test.sample_submission = test.sample_submission[,.(Id)]
#test.sample_submission.new = cbind(test.sample_submission,SalePrice=gbm.test.preds)
#write.csv(test.sample_submission.new,'~/kaggle/house_prices/data/gbm_test_submit.csv',row.names=FALSE)

# leaderboard score 0.12827

In [None]:
# check lvl1 svr leaderboard score

#test.sample_submission = fread('~/kaggle/house_prices/data/sample_submission.csv')
#test.sample_submission = test.sample_submission[,.(Id)]
#test.sample_submission.new = cbind(test.sample_submission,SalePrice=svr.test.preds)
#write.csv(test.sample_submission.new,'~/kaggle/house_prices/data/svr_test_submit.csv',row.names=FALSE)

# leaderboard score 0.13999,