# LGB starter for R

This is a minimal kernel without time series tuning etc. and serves as a starting point for more fancy models.

## Packages

Let's load the usual suspects first.

In [1]:
set.seed(15)
library(data.table)
library(tidyverse)
library(caret)
library(lightgbm)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.2.1 ──

[32m✔[39m [34mggplot2[39m 3.2.1.[31m9000[39m     [32m✔[39m [34mpurrr  [39m 0.3.2     
[32m✔[39m [34mtibble [39m 2.1.3          [32m✔[39m [34mdplyr  [39m 0.8.3     
[32m✔[39m [34mtidyr  [39m 1.0.0          [32m✔[39m [34mstringr[39m 1.4.0     
[32m✔[39m [34mreadr  [39m 1.3.1          [32m✔[39m [34mforcats[39m 0.4.0     

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mbetween()[39m   masks [34mdata.table[39m::between()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m    masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mfirst()[39m     masks [34mdata.table[39m::first()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m       masks [34mstats[39m::lag()
[31m✖[39m [34mdplyr[39m::[32mlast()[39m      masks [34mdata.table[39m::last()
[31m✖[39m [34mpurrr[39m::[32mtrans

## Import

Next, we load the data into a list.


In [2]:
path <- "../input/ashrae-energy-prediction"
all_files <- list.files(path)
raw_list <- lapply(file.path(path, all_files), fread)
names(raw_list) <- gsub("(.*)\\.csv", "\\1", all_files)
lapply(raw_list, dim)

## Data preparation 

We join the data and extract some time features. Since we need to repeat the same for the test data, we wrap the steps into a function.

In [3]:
# Public holiday
holidays <- lubridate::as_datetime(
  c("2016-01-01", "2016-01-18", "2016-02-15", 
    "2016-05-30", "2016-07-04", "2016-09-05", 
    "2016-10-10", "2016-11-11", "2016-11-24",
    "2016-12-26", "2017-01-02", "2017-01-16",
    "2017-02-20", "2017-05-29", "2017-07-04", 
    "2017-09-04", "2017-10-09", "2017-11-10", 
    "2017-11-23", "2017-12-25", "2018-01-01", 
    "2018-01-15", "2018-02-19", "2018-05-28",
    "2018-07-04", "2018-09-03", "2018-10-08", 
    "2018-11-12", "2018-11-22", "2018-12-25", 
    "2019-01-01"))

prepfun <- function(base, weather, building = raw_list$building_metadata) {
   out <- base %>% 
      left_join(building, by = "building_id") %>% 
      left_join(weather, by = c("site_id", "timestamp")) %>% 
      mutate(timestamp = lubridate::as_datetime(timestamp),
             hour = lubridate::hour(timestamp),
             month = lubridate::month(timestamp),
             weekday = lubridate::wday(timestamp),
             is_holiday = (timestamp %in% holidays) + 0,
             square_feet = log(square_feet)) %>% 
      select(-timestamp)
}

# Let's apply it to train
train <- with(raw_list, prepfun(train, weather_train))
head(train)

# Output and input featuers
y <- "meter_reading"
x <- c("square_feet", "building_id", "meter", "air_temperature", "dew_temperature", 
       "primary_use", "year_built", "hour", "site_id", "floor_count", 
       "weekday", "cloud_coverage", "precip_depth_1_hr")
x_cat <- c("building_id", "meter", "primary_use", "site_id")


# Transform response
train[[y]] <- log1p(train[[y]])

Unnamed: 0_level_0,building_id,meter,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,month,weekday,is_holiday
Unnamed: 0_level_1,<int>,<int>,<dbl>,<int>,<chr>,<dbl>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
1,0,0,0,0,Education,8.91355,2008,,25,6,20,,1019.7,0,0,0,1,6,1
2,1,0,0,0,Education,7.908387,2004,,25,6,20,,1019.7,0,0,0,1,6,1
3,2,0,0,0,Education,8.5897,1991,,25,6,20,,1019.7,0,0,0,1,6,1
4,3,0,0,0,Education,10.072597,2002,,25,6,20,,1019.7,0,0,0,1,6,1
5,4,0,0,0,Education,11.666565,1975,,25,6,20,,1019.7,0,0,0,1,6,1
6,5,0,0,0,Education,8.987197,2000,,25,6,20,,1019.7,0,0,0,1,6,1


## Data split

The next step is to split the data set into train and valid.

In [4]:
inds <- caret::groupKFold(group = train[["month"]], k = 10)
valid <- train[-inds[[1]], c(x, y)]
train <- train[inds[[1]], c(x, y)]

## Fit model

Now we are ready to fit a basic tree booster to the data.


In [5]:
# lgb data wrapper
prep_lgb <- function(data, x) {
  data %>% 
    select_at(x) %>% 
    mutate_if(Negate(is.numeric), function(z) as.integer(as.factor(z))) %>% 
    data.matrix()
}
dtrain <- lgb.Dataset(prep_lgb(train, x), label = train[[y]])
dvalid <- lgb.Dataset(prep_lgb(valid, x), label = valid[[y]])
 
params <- list(learning_rate = 0.2,
               feature_fraction = 0.85,
               num_leaves = 40,
               lambda = 2,
               alpha = 0.1,
               nthread = 4,
               objective = "regression",
               max_bin = 63)

fit_lgb <- lgb.train(params = params,
                     data = dtrain,
                     early_stopping_rounds = 100,
                     eval_freq = 100,
                     valids = list(train = dtrain, valid = dvalid),
                     nrounds = 3000)

lgb.save(fit_lgb, "lgb.csv")
# fit_lgb <- lgb.load("lgb.csv")

[1]:	train's l2:4.00621	valid's l2:4.43037 
[101]:	train's l2:1.55522	valid's l2:1.8164 
[201]:	train's l2:1.30534	valid's l2:1.60877 
[301]:	train's l2:1.17657	valid's l2:1.50842 
[401]:	train's l2:1.10001	valid's l2:1.46161 
[501]:	train's l2:1.04724	valid's l2:1.4202 
[601]:	train's l2:1.00716	valid's l2:1.39588 
[701]:	train's l2:0.968704	valid's l2:1.38738 
[801]:	train's l2:0.939047	valid's l2:1.37781 
[901]:	train's l2:0.915133	valid's l2:1.3648 
[1001]:	train's l2:0.892649	valid's l2:1.35745 
[1101]:	train's l2:0.873971	valid's l2:1.35288 
[1201]:	train's l2:0.857764	valid's l2:1.34561 
[1301]:	train's l2:0.844277	valid's l2:1.34695 


## Make submission

This step is rather painful as the test data is quite large. It will take a couple of minutes.

In [6]:
rm(train, valid)
raw_list$train <- NULL
gc()
test <- with(raw_list, prepfun(test, weather_test))
raw_list$test <- NULL
test <- prep_lgb(test, x = x)
pred <- round(exp(predict(fit_lgb, test)) - 1, 4)

raw_list$sample_submission[["meter_reading"]] <- pmax(0, pred)
fwrite(raw_list$sample_submission, "submission.csv", row.names = FALSE)

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,2288287,122.3,25158017,1343.6,40797497,2178.9
Vcells,327977031,2502.3,1206685903,9206.3,1500588846,11448.6
