-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
12 changed files
with
3,513 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
|
||
** train / test (merge with store) | ||
- Sales - response | ||
- Customers - cut | ||
|
||
- Store | ||
- DayOfWeek | ||
- Date ~ dateNum , day , month , year | ||
- Open | ||
- Promo | ||
- StateHoliday | ||
- SchoolHoliday | ||
|
||
- StoreType | ||
- Assortment | ||
- CompetitionDistancePred := -1 if Date < CompetitionOpenSinceDate | CompetitionDistance is NA, otherwise CompetitionDistance (even if CompetitionOpenSinceDate is NA) | ||
- Promo2Pred := 0 if Promo2 is 0 | Date < Promo2SinceDate | Date not in Promo2Period , 1 otherwise | ||
- Promo2SinceMonths := # months since Promo2 started | ||
|
||
* evaluate (add these predictors one by one) | ||
- SalesLastYearsDay := Sales on the same day in previous years in the same shop | ||
- SalesLastYearWeek := Average Sales of last year in the same week | ||
|
||
** feature selection | ||
- Open = 1 | ||
- Sales >0 ? | ||
|
||
** metric | ||
- RMSPE : does not take into account Sales == 0 on days Open (54 Shop / day cases , 41 stores) | ||
- RMSE : takes into account Sales == 0 on days Open | ||
|
||
** resampling | ||
- it seems that train period is before test period (~ 1,5 months) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
library(readr) | ||
library(xgboost) | ||
library(fastfurious) | ||
|
||
### FAST-FURIOUS
# Register the project root and the sub-directories used by this script with
# the fastfurious path registry; later code looks them up via ff.getPath(type).
# NOTE(review): the base path is an absolute, machine-specific location --
# adjust it before running on another machine.
ff.setBasePath(path = '/Users/gino/kaggle/fast-furious/gitHub/fast-furious/')
ff.bindPath(type = 'data', sub_path = 'dataset/rossmann-store-sales')
ff.bindPath(type = 'code', sub_path = 'competitions/rossmann-store-sales')
ff.bindPath(type = 'elab', sub_path = 'dataset/rossmann-store-sales/elab')

# Output locations. `TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned, which would silently change the flag.
ff.bindPath(type = 'ensemble_1', sub_path = 'dataset/rossmann-store-sales/ensembles/ensemble_1', createDir = TRUE) ## out
ff.bindPath(type = 'best_tune_1', sub_path = 'dataset/rossmann-store-sales/ensembles/best_tune_1', createDir = TRUE) ## out
ff.bindPath(type = 'submission_1', sub_path = 'dataset/rossmann-store-sales/ensembles/pred_ensemble_1', createDir = TRUE) ## out
|
||
## DATA
cat("reading the train and test data\n")
# paste0() concatenates without a separator, matching how the registered
# data path is joined with the file name.
train <- read_csv(paste0(ff.getPath("data"), "train.csv"))
test <- read_csv(paste0(ff.getPath("data"), "test.csv"))
store <- read_csv(paste0(ff.getPath("data"), "store.csv"))

# Attach the per-store attributes to every train / test row. No `by` is given,
# so merge() joins on the column names the two tables have in common.
train <- merge(train, store)
test <- merge(test, store)

# There are some NAs in the integer columns, so convert them to zero.
train[is.na(train)] <- 0
test[is.na(test)] <- 0
|
||
cat("train data column names and details\n")
names(train)
str(train)
summary(train)
cat("test data column names and details\n")
names(test)
str(test)
summary(test)

# Look only at days on which the store was open and recorded non-zero sales
# in the train set (may change this later). which() drops any NA comparisons.
train <- train[which(train$Open == 1), ]
train <- train[which(train$Sales != 0), ]

# Split the Date column into integer month, year (two-digit, "%y") and day
# columns. Shared by train and test so both are always processed the same way.
add_date_parts <- function(df) {
  df$month <- as.integer(format(df$Date, "%m"))
  df$year <- as.integer(format(df$Date, "%y"))
  df$day <- as.integer(format(df$Date, "%d"))
  df
}
train <- add_date_parts(train)
test <- add_date_parts(test)

# Remove the Date column (its elements are now extracted) and StateHoliday
# (may add it back in later). Columns are dropped by name rather than by
# position so the step does not depend on the column order of the CSV files.
dropped_cols <- c("Date", "StateHoliday")
train <- train[, !(names(train) %in% dropped_cols)]
test <- test[, !(names(test) %in% dropped_cols)]

# Every remaining train column is a predictor except the response (Sales) and
# Customers, neither of which is used to predict on the test set.
# NOTE(review): this assumes the positions previously skipped (3 and 4) were
# Sales and Customers -- confirm against the merged column order.
feature.names <- setdiff(names(train), c("Sales", "Customers"))
stopifnot(all(feature.names %in% names(test)))
cat("Feature Names\n")
feature.names

cat("assuming text variables are categorical & replacing them with numeric ids\n")
for (f in feature.names) {
  if (is.character(train[[f]])) {
    # Build the level set from both tables so a given string maps to the same
    # integer id in train and in test.
    lvls <- unique(c(train[[f]], test[[f]]))
    train[[f]] <- as.integer(factor(train[[f]], levels = lvls))
    test[[f]] <- as.integer(factor(test[[f]], levels = lvls))
  }
}

cat("train data column names after slight feature engineering\n")
names(train)
cat("test data column names after slight feature engineering\n")
names(test)
tra <- train[, feature.names]
# Custom xgboost evaluation metric: root mean squared percentage error,
# computed after mapping labels and predictions back with exp(x) - 1.
#
# preds   numeric predictions on the transformed (log(y + 1)) scale.
# dtrain  object whose "label" field, read via getinfo(), holds the
#         transformed actual values.
#
# Returns list(metric = "RMPSE", value = <error>), the shape expected from a
# `feval` function. Observations whose actual value is 0 are left out of the
# score: their percentage error is undefined and would otherwise turn the
# whole metric into Inf / NaN. The value is NaN if no observation is scored.
RMPSE <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  # expm1(x) is exp(x) - 1 without the cancellation error near zero.
  actual <- expm1(as.numeric(labels))
  predicted <- expm1(as.numeric(preds))
  scored <- actual != 0
  err <- sqrt(mean((predicted[scored] / actual[scored] - 1)^2))
  list(metric = "RMPSE", value = err)
}
nrow(train)

### exporting to data
# Persist the engineered matrices: predictors plus Sales for train,
# predictors plus Id for test.
elab_dir <- ff.getPath("elab")
train_export <- tra
train_export[["Sales"]] <- train[["Sales"]]
test_export <- test[, feature.names]
test_export[["Id"]] <- test[["Id"]]

write_csv(train_export, paste0(elab_dir, "Xtrain_bench.csv"))
write_csv(test_export, paste0(elab_dir, "Xtest_bench.csv"))
### end exporting to data

## Hold out 10000 randomly chosen rows for validation; the rest is used for
## fitting. The model is trained on log(Sales + 1).
# NOTE(review): no seed is set, so the split differs between runs.
h <- sample(nrow(train), 10000)
log_sales <- log(train$Sales + 1)
dval <- xgb.DMatrix(data = data.matrix(tra[h, ]), label = log_sales[h])
dtrain <- xgb.DMatrix(data = data.matrix(tra[-h, ]), label = log_sales[-h])
# NOTE(review): which watchlist entry drives early stopping appears to depend
# on the installed xgboost version (first vs. last entry) -- confirm that the
# validation set, not the training set, is the one being monitored.
watchlist <- list(val = dval, train = dtrain)
param <- list(
  objective = "reg:linear",
  booster = "gbtree",
  eta = 0.005, # 0.06, #0.01,
  max_depth = 10, # changed from default of 8
  subsample = 0.9, # 0.7
  colsample_bytree = 0.7 # 0.7
  # num_parallel_tree = 2
  # alpha = 0.0001,
  # lambda = 1
)
cat("****** param ***** \n")
print(param)
####
# NOTE(review): "reg:linear" and `early.stop.round` are legacy spellings;
# verify that the installed xgboost release still accepts them.
clf <- xgb.train(
  params = param,
  data = dtrain,
  nrounds = 14000, # 300, #280, #125, #250, # changed from 300
  verbose = 0,
  early.stop.round = 100,
  watchlist = watchlist,
  maximize = FALSE,
  feval = RMPSE
)

# Score the test set and undo the log(Sales + 1) transform.
test_matrix <- data.matrix(test[, feature.names])
pred1 <- exp(predict(clf, test_matrix)) - 1
submission <- data.frame(Id = test$Id, Sales = pred1)
cat("saving the submission file\n")
write_csv(submission, paste0(elab_dir, "bech_eta0005_nround14000.csv"))
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
Oops, something went wrong.