In [1]:
library('lattice')
library('ggplot2')
library('caret')
library('data.table')
library('Metrics')
library('MASS')
library('e1071')
library('lars')
library('elasticnet')
library('survival')
library('penalized')

Loaded lars 1.2


Attaching package: ‘survival’

The following object is masked from ‘package:caret’:

    cluster

Welcome to penalized. For extended examples, see vignette("penalized").


In [2]:
# Load the training data; MiscFeature and PoolQC are read as character.
train <- fread('~/kaggle/house_prices/data/train.csv',
               colClasses = c('MiscFeature' = 'character', 'PoolQC' = 'character'))

# The columns 1stFlrSF, 2ndFlrSF and 3SsnPorch start with a digit, which makes
# them awkward to reference, so they get identifier-friendly names.
# setnames() renames in place and raises an error when an expected column is
# absent. The previous names(x)[-which(...)] selection would instead return
# no columns at all if none of the three names matched.
# Note: the renamed columns keep their position rather than moving to the end.
setnames(train,
         old = c('1stFlrSF', '2ndFlrSF', '3SsnPorch'),
         new = c('FirstFlrSF', 'SecondFlrSF', 'ThreeSsnPorch'))

# Turn the categorical columns KitchenQual, ExterQual, BsmtQual and
# GarageFinish into numeric scores.
# match() gives the position (1, 2, 3) of each label in the listed levels;
# any other label, and NA, falls back to 0 through nomatch = 0L.
# as.numeric() keeps the score vectors as doubles.

# KitchenQual: TA -> 1, Gd -> 2, Ex -> 3
nKitchenQual <- as.numeric(match(train$KitchenQual, c('TA', 'Gd', 'Ex'), nomatch = 0L))
train <- cbind(train, nKitchenQual)

# ExterQual: TA -> 1, Gd -> 2, Ex -> 3
nExterQual <- as.numeric(match(train$ExterQual, c('TA', 'Gd', 'Ex'), nomatch = 0L))
train <- cbind(train, nExterQual)

# BsmtQual: TA -> 1, Gd -> 2, Ex -> 3
nBsmtQual <- as.numeric(match(train$BsmtQual, c('TA', 'Gd', 'Ex'), nomatch = 0L))
train <- cbind(train, nBsmtQual)

# GarageFinish: Unf -> 1, RFn -> 2, Fin -> 3
nGarageFinish <- as.numeric(match(train$GarageFinish, c('Unf', 'RFn', 'Fin'), nomatch = 0L))
train <- cbind(train, nGarageFinish)

# Add full and half bathroom counts together, once for FullBath/HalfBath and
# once for the basement pair BsmtFullBath/BsmtHalfBath.
train[, `:=`(
  Bath = FullBath + HalfBath,
  BsmtBaths = BsmtFullBath + BsmtHalfBath
)]

# Log-transform the target (SalePrice) and a set of predictors.
# All columns are added in a single `:=` call, each computed directly from its
# source column. The earlier form assigned a one-column data.table to
# train$<name>, which only worked because a length-one list is unwrapped on
# assignment. The resulting values and column names are the same.
# Some predictors use log(1 + x) and the rest plain log(x), exactly as before
# (presumably the log(1 + x) columns can contain zeros -- confirm).
train[, `:=`(
  log_SalePrice = log(SalePrice),
  log_LotArea = log(LotArea),
  log_OverallQual = log(OverallQual),
  log_YearBuilt = log(YearBuilt),
  log_YearRemodAdd = log(YearRemodAdd),
  log_GrLivArea = log(GrLivArea),
  log_Bath = log(1 + Bath),
  log_nKitchenQual = log(1 + nKitchenQual),
  log_nBsmtQual = log(1 + nBsmtQual),
  log_nExterQual = log(1 + nExterQual),
  log_nGarageFinish = log(1 + nGarageFinish),
  log_BsmtFinSF1 = log(1 + BsmtFinSF1),
  log_GarageCars = log(1 + GarageCars),
  log_TotalBsmtSF = log(1 + TotalBsmtSF),
  log_KitchenAbvGr = log(1 + KitchenAbvGr),
  log_BedroomAbvGr = log(1 + BedroomAbvGr),
  log_TotRmsAbvGrd = log(TotRmsAbvGrd),
  log_OverallCond = log(OverallCond)
)]

# Exponential transforms of a few features were tried too; none was useful.

# Squares, cubes, square roots and one ratio of selected features.
# Products are written as explicit multiplications, as in the original.
train[, `:=`(
  OverallQual_Square = OverallQual * OverallQual,
  OverallQual_3 = OverallQual * OverallQual * OverallQual,
  GrLivArea_Square = GrLivArea * GrLivArea,
  TotalBsmtSF_on_GrLivArea = TotalBsmtSF / GrLivArea,
  OverallCond_sqrt = sqrt(OverallCond),
  OverallCond_square = OverallCond * OverallCond,
  LotArea_sqrt = sqrt(LotArea),
  FirstFlrSF_sqrt = sqrt(FirstFlrSF),
  TotRmsAbvGrd_sqrt = sqrt(TotRmsAbvGrd)
)]

In [3]:
# Names of the candidate predictor columns, in a fixed order:
# two groups of raw columns, each followed by its "log_" counterparts,
# then the power/root/ratio features.
kept_features7 <- local({
  first_raw <- c("LotArea", "OverallQual", "YearBuilt", "YearRemodAdd",
                 "nKitchenQual", "nExterQual", "nBsmtQual", "GrLivArea",
                 "Bath", "nGarageFinish")
  second_raw <- c("BsmtFinSF1", "GarageCars", "TotalBsmtSF", "KitchenAbvGr",
                  "BedroomAbvGr", "TotRmsAbvGrd", "OverallCond")
  engineered <- c("OverallQual_Square", "OverallQual_3", "GrLivArea_Square",
                  "TotalBsmtSF_on_GrLivArea", "OverallCond_sqrt",
                  "OverallCond_square", "LotArea_sqrt", "FirstFlrSF_sqrt",
                  "TotRmsAbvGrd_sqrt")
  c(first_raw, paste0("log_", first_raw),
    second_raw, paste0("log_", second_raw),
    engineered)
})
# Keep only the candidate predictors plus the two target columns
# (log_SalePrice for fitting, SalePrice for scoring).
train.kept7 <- train[, .SD, .SDcols = c(kept_features7, "log_SalePrice", "SalePrice")]

In [4]:
# Reproducible 75% / 25% split of the labelled data, partitioned on the
# log sale price. The fixed seed makes the partition repeatable.
set.seed(4)
inTrain <- createDataPartition(train.kept7$log_SalePrice, p = 0.75, list = FALSE)
# Uncomment to inspect the selected row indices:
# print(inTrain)
train.sample <- train.kept7[inTrain, ]
test.sample <- train.kept7[-inTrain, ]

In [33]:
# Tuning grid with a single (lambda1, lambda2) pair, passed to train() as
# tuneGrid for the 'penalized' method.
ridgeGrid <- expand.grid(lambda1 = 0.05, lambda2 = 0.05)
# Resampling settings: 25 resamples with per-iteration progress output.
# The resampling method is left at trainControl()'s default.
bootControl <- trainControl(verboseIter = TRUE, number = 25)

In [34]:
# Fit the caret 'penalized' model for log_SalePrice on the training split,
# using the resampling settings in bootControl and the (lambda1, lambda2)
# grid in ridgeGrid, selecting on RMSE.
# caret::train is written out because the name `train` is also bound to the
# data table loaded earlier, which makes a bare train(...) call hard to read.
# Each predictor appears once, in order of first appearance: the original
# formula listed log_BedroomAbvGr and nKitchenQual twice, and R's formula
# handling keeps only one copy of a repeated term, so the model is unchanged.
# OverallQual_3 was tried as an extra term and is deliberately left out.
lmFit7 <- caret::train(
  log_SalePrice ~ OverallQual + log_YearBuilt + log_LotArea + log_BsmtFinSF1 +
    GarageCars + nBsmtQual + nKitchenQual + TotalBsmtSF + log_KitchenAbvGr +
    log_nBsmtQual + GrLivArea + log_BedroomAbvGr + log_GarageCars +
    TotalBsmtSF_on_GrLivArea + GrLivArea_Square,
  data = train.sample,
  method = 'penalized',
  metric = "RMSE",
  trControl = bootControl,
  tuneGrid = ridgeGrid
)
print(lmFit7)

+ Resample01: lambda1=0.05, lambda2=0.05 
# nonzero coefficients: 161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161516161616161616161616161616161616161616161616161616161616161616
- Resample01: lambda1=0.05, lambda2=0.05 
+ Resample02: lambda1=0.05, lambda2=0.05 
# nonzero coefficients: 161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161615151616151616151616161616
- Resample02

In [8]:
# Hold-out RMSLE recorded for the run labelled ".1 .1"
# (presumably lambda1, lambda2 in that order -- confirm).
# The model predicts log prices, so exp() maps them back to prices.
lmFit7.prediction <- exp(predict(lmFit7, newdata = test.sample))
rmsle(actual = test.sample$SalePrice, predicted = lmFit7.prediction)

In [11]:
# Hold-out RMSLE recorded for the run labelled "0 0"
# (presumably lambda1, lambda2 in that order -- confirm).
# The model predicts log prices, so exp() maps them back to prices.
lmFit7.prediction <- exp(predict(lmFit7, newdata = test.sample))
rmsle(actual = test.sample$SalePrice, predicted = lmFit7.prediction)

In [14]:
# Hold-out RMSLE recorded for the run labelled ".1 .5"
# (presumably lambda1, lambda2 in that order -- confirm).
# The model predicts log prices, so exp() maps them back to prices.
lmFit7.prediction <- exp(predict(lmFit7, newdata = test.sample))
rmsle(actual = test.sample$SalePrice, predicted = lmFit7.prediction)

In [17]:
# Hold-out RMSLE recorded for the run labelled ".1 .05"
# (presumably lambda1, lambda2 in that order -- confirm).
# The model predicts log prices, so exp() maps them back to prices.
lmFit7.prediction <- exp(predict(lmFit7, newdata = test.sample))
rmsle(actual = test.sample$SalePrice, predicted = lmFit7.prediction)

In [20]:
# Hold-out RMSLE recorded for the run labelled ".05 .05"
# (presumably lambda1, lambda2 in that order -- confirm).
# The model predicts log prices, so exp() maps them back to prices.
lmFit7.prediction <- exp(predict(lmFit7, newdata = test.sample))
rmsle(actual = test.sample$SalePrice, predicted = lmFit7.prediction)

In [23]:
# Hold-out RMSLE recorded for the run labelled ".05 .001"
# (presumably lambda1, lambda2 in that order -- confirm).
# The model predicts log prices, so exp() maps them back to prices.
lmFit7.prediction <- exp(predict(lmFit7, newdata = test.sample))
rmsle(actual = test.sample$SalePrice, predicted = lmFit7.prediction)

In [26]:
# Hold-out RMSLE recorded for the run labelled ".05 .01"
# (presumably lambda1, lambda2 in that order -- confirm).
# The model predicts log prices, so exp() maps them back to prices.
lmFit7.prediction <- exp(predict(lmFit7, newdata = test.sample))
rmsle(actual = test.sample$SalePrice, predicted = lmFit7.prediction)

In [29]:
# Hold-out RMSLE recorded for the run labelled ".01 .05"
# (presumably lambda1, lambda2 in that order -- confirm).
# The model predicts log prices, so exp() maps them back to prices.
lmFit7.prediction <- exp(predict(lmFit7, newdata = test.sample))
rmsle(actual = test.sample$SalePrice, predicted = lmFit7.prediction)

In [32]:
# Hold-out RMSLE recorded for the run labelled ".01 10"
# (presumably lambda1, lambda2 in that order -- confirm).
# The model predicts log prices, so exp() maps them back to prices.
lmFit7.prediction <- exp(predict(lmFit7, newdata = test.sample))
rmsle(actual = test.sample$SalePrice, predicted = lmFit7.prediction)

In [None]:
# Predictions: reload the data, refit on the full training set and predict the test set

In [124]:
# Read the training and test files with the same column-type overrides:
# MiscFeature, PoolQC and Alley are read as character in both.
train <- fread(
  '~/kaggle/house_prices/data/train.csv',
  colClasses = c('MiscFeature' = 'character', 'PoolQC' = 'character', 'Alley' = 'character')
)
test <- fread(
  '~/kaggle/house_prices/data/test.csv',
  colClasses = c('MiscFeature' = 'character', 'PoolQC' = 'character', 'Alley' = 'character')
)

# Fill missing values in test for BsmtFinSF1, TotalBsmtSF and GarageCars
# with zero.
# The NA cells are located with is.na() instead of being addressed by the
# hard-coded row numbers 661 and 1117 used before, so the fill still hits the
# right cells if the file's row order changes, covers any further missing
# rows, and never overwrites a value that is present.
# 0L is assigned so that an integer column keeps its integer type.
test[is.na(BsmtFinSF1), BsmtFinSF1 := 0L]
test[is.na(TotalBsmtSF), TotalBsmtSF := 0L]
test[is.na(GarageCars), GarageCars := 0L]

# The columns 1stFlrSF, 2ndFlrSF and 3SsnPorch start with a digit, which makes
# them awkward to reference, so they are renamed in both tables.
# setnames() renames in place and raises an error when an expected column is
# absent. The previous names(x)[-which(...)] selection would instead return
# no columns at all if none of the three names matched.
# Note: the renamed columns keep their position rather than moving to the end.
setnames(train,
         old = c('1stFlrSF', '2ndFlrSF', '3SsnPorch'),
         new = c('FirstFlrSF', 'SecondFlrSF', 'ThreeSsnPorch'))

setnames(test,
         old = c('1stFlrSF', '2ndFlrSF', '3SsnPorch'),
         new = c('FirstFlrSF', 'SecondFlrSF', 'ThreeSsnPorch'))

# Turn the categorical columns KitchenQual, ExterQual, BsmtQual and
# GarageFinish into numeric scores, for both train and test.
# match() gives the position (1, 2, 3) of each label in the listed levels;
# any other label, and NA, falls back to 0 through nomatch = 0L.
# as.numeric() keeps the score vectors as doubles.

# KitchenQual: TA -> 1, Gd -> 2, Ex -> 3
nKitchenQual <- as.numeric(match(train$KitchenQual, c('TA', 'Gd', 'Ex'), nomatch = 0L))
train <- cbind(train, nKitchenQual)

nKitchenQual <- as.numeric(match(test$KitchenQual, c('TA', 'Gd', 'Ex'), nomatch = 0L))
test <- cbind(test, nKitchenQual)

# ExterQual: TA -> 1, Gd -> 2, Ex -> 3 (test first, then train, as before)
nExterQual <- as.numeric(match(test$ExterQual, c('TA', 'Gd', 'Ex'), nomatch = 0L))
test <- cbind(test, nExterQual)

nExterQual <- as.numeric(match(train$ExterQual, c('TA', 'Gd', 'Ex'), nomatch = 0L))
train <- cbind(train, nExterQual)

# BsmtQual: TA -> 1, Gd -> 2, Ex -> 3
nBsmtQual <- as.numeric(match(train$BsmtQual, c('TA', 'Gd', 'Ex'), nomatch = 0L))
train <- cbind(train, nBsmtQual)

nBsmtQual <- as.numeric(match(test$BsmtQual, c('TA', 'Gd', 'Ex'), nomatch = 0L))
test <- cbind(test, nBsmtQual)

# GarageFinish: Unf -> 1, RFn -> 2, Fin -> 3
nGarageFinish <- as.numeric(match(train$GarageFinish, c('Unf', 'RFn', 'Fin'), nomatch = 0L))
train <- cbind(train, nGarageFinish)

nGarageFinish <- as.numeric(match(test$GarageFinish, c('Unf', 'RFn', 'Fin'), nomatch = 0L))
test <- cbind(test, nGarageFinish)

# Log-transform the target (train only) and the predictors used by the model.
# Columns are added with `:=`, each computed directly from its source column.
# The earlier form assigned a one-column data.table to <table>$<name>, which
# only worked because a length-one list is unwrapped on assignment. The
# resulting values and column names are the same.
# Some predictors use log(1 + x) and the rest plain log(x), exactly as before
# (presumably the log(1 + x) columns can contain zeros -- confirm).
train[, `:=`(
  log_SalePrice = log(SalePrice),
  log_LotArea = log(LotArea),
  log_YearBuilt = log(YearBuilt),
  log_nBsmtQual = log(1 + nBsmtQual),
  log_BsmtFinSF1 = log(1 + BsmtFinSF1),
  log_GarageCars = log(1 + GarageCars),
  log_KitchenAbvGr = log(1 + KitchenAbvGr),
  log_BedroomAbvGr = log(1 + BedroomAbvGr)
)]

# Same predictor transforms for test; there is no SalePrice to transform.
test[, `:=`(
  log_LotArea = log(LotArea),
  log_YearBuilt = log(YearBuilt),
  log_nBsmtQual = log(1 + nBsmtQual),
  log_BsmtFinSF1 = log(1 + BsmtFinSF1),
  log_GarageCars = log(1 + GarageCars),
  log_KitchenAbvGr = log(1 + KitchenAbvGr),
  log_BedroomAbvGr = log(1 + BedroomAbvGr)
)]

# Exponential transforms of a few features were tried too; none was useful.

# Squared living area and the basement-to-living-area ratio, added to both
# tables with the same expressions.
train[, `:=`(
  GrLivArea_Square = GrLivArea * GrLivArea,
  TotalBsmtSF_on_GrLivArea = TotalBsmtSF / GrLivArea
)]

test[, `:=`(
  GrLivArea_Square = GrLivArea * GrLivArea,
  TotalBsmtSF_on_GrLivArea = TotalBsmtSF / GrLivArea
)]


In [125]:
# Names of the predictor columns carried into the final fit, in a fixed order:
# raw columns, their selected "log_" counterparts, then the two engineered
# living-area features.
kept_features7 <- c(
  "LotArea", "OverallQual", "YearBuilt", "YearRemodAdd", "nKitchenQual",
  "nExterQual", "nBsmtQual", "GrLivArea", "nGarageFinish",
  paste0("log_", c("LotArea", "YearBuilt", "nBsmtQual")),
  "BsmtFinSF1", "GarageCars", "TotalBsmtSF", "KitchenAbvGr", "BedroomAbvGr",
  "TotRmsAbvGrd", "OverallCond",
  paste0("log_", c("BsmtFinSF1", "GarageCars", "KitchenAbvGr", "BedroomAbvGr")),
  "GrLivArea_Square", "TotalBsmtSF_on_GrLivArea"
)
# Final modelling tables: the whole training set (predictors plus both target
# columns) and the test set restricted to the same predictors.
train.sample <- train[, .SD, .SDcols = c(kept_features7, "log_SalePrice", "SalePrice")]
test.sample <- test[, .SD, .SDcols = kept_features7]

In [126]:
# Tuning grid with a single (lambda1, lambda2) pair, passed to train() as
# tuneGrid for the 'penalized' method.
ridgeGrid <- expand.grid(lambda1 = 0.05, lambda2 = 0.05)
# Resampling settings: 25 resamples with per-iteration progress output.
# The resampling method is left at trainControl()'s default.
bootControl <- trainControl(verboseIter = TRUE, number = 25)

In [47]:
# Fit the caret 'penalized' model for log_SalePrice on the full training
# table, using the resampling settings in bootControl and the
# (lambda1, lambda2) grid in ridgeGrid, selecting on RMSE.
# caret::train is written out because the name `train` is also bound to the
# data table loaded earlier, which makes a bare train(...) call hard to read.
# Each predictor appears once, in order of first appearance: the original
# formula listed log_BedroomAbvGr and nKitchenQual twice, and R's formula
# handling keeps only one copy of a repeated term, so the model is unchanged.
# OverallQual_3 was tried as an extra term and is deliberately left out.
ridgeFit <- caret::train(
  log_SalePrice ~ OverallQual + log_YearBuilt + log_LotArea + log_BsmtFinSF1 +
    GarageCars + nBsmtQual + nKitchenQual + TotalBsmtSF + log_KitchenAbvGr +
    log_nBsmtQual + GrLivArea + log_BedroomAbvGr + log_GarageCars +
    TotalBsmtSF_on_GrLivArea + GrLivArea_Square,
  data = train.sample,
  method = 'penalized',
  metric = "RMSE",
  trControl = bootControl,
  tuneGrid = ridgeGrid
)
print(ridgeFit)

+ Resample01: lambda1=0.05, lambda2=0.05 
# nonzero coefficients: 161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161516161516161616161616161616161616161616161616161616161616161616161616161616161616161615161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616

In [127]:
# Predict the test set. The model was fitted on log_SalePrice, so exp() maps
# the predictions back to prices.
ridgeFit.prediction <- exp(predict(ridgeFit, newdata = test.sample))

In [128]:
# Quick look at the predictions: the first six values and the total count.
head(ridgeFit.prediction, n = 6L)
length(ridgeFit.prediction)

In [136]:
# Read the sample submission file; its SalePrice column is replaced with the
# model's predictions further down.
test.submission <- fread('~/kaggle/house_prices/data/sample_submission.csv')

In [137]:
# Replace the sample submission's SalePrice with the model's predictions,
# showing the first rows before and after.
head(test.submission)
# Guard the assignment: there must be exactly one prediction per submission
# row. A shorter vector (for example if predict() dropped rows with missing
# predictors -- TODO confirm predict's NA handling for this model) must stop
# here rather than be recycled or misaligned.
stopifnot(length(ridgeFit.prediction) == nrow(test.submission))
test.submission$SalePrice <- ridgeFit.prediction
head(test.submission)

Unnamed: 0,Id,SalePrice
1,1461.0,169277.1
2,1462.0,187758.4
3,1463.0,183583.7
4,1464.0,179317.5
5,1465.0,150730.1
6,1466.0,177151.0


Unnamed: 0,Id,SalePrice
1,1461.0,119695.0
2,1462.0,170995.0
3,1463.0,176127.3
4,1464.0,188662.2
5,1465.0,202286.1
6,1466.0,165497.0


In [139]:
# Write the submission file without row names.
# This submission scored .15199 on the leaderboard.
write.csv(test.submission,
          file = '~/kaggle/house_prices/data/ridge_submission.csv',
          row.names = FALSE)