# [Expedia logs of customer behavior](https://www.kaggle.com/c/expedia-hotel-recommendations)

> These include what customers searched for, how they interacted with search results (click/book), whether or not the search result was a travel package.
> Expedia is interested in predicting which hotel group a user is going to book. Expedia has in-house algorithms to form hotel clusters, where similar hotels for a search (based on historical price, customer star ratings, geographical locations relative to city center, etc) are grouped together.


### Training/Test Data

Column name               | Description                                                                      | Data type
------------------------- | -------------------------------------------------------------------------------- | ---------
date_time                 | Timestamp                                                                        | string
site_name                 | ID of the Expedia point of sale (e.g. Expedia.com, Expedia.co.uk, Expedia.co.jp, ...) | int
posa_continent            | ID of continent associated with site_name                                        | int
user_location_country     | The ID of the country the customer is located in                                 | int
user_location_region      | The ID of the region the customer is located in                                  | int
user_location_city        | The ID of the city the customer is located in                                    | int
orig_destination_distance | Physical distance between a hotel and a customer at the time of search. <br> A null means the distance could not be calculated                                     | double
user_id                   | ID of user                                                                       | int
is_mobile                 | 1 when a user connected from a mobile device, 0 otherwise                        | tinyint
is_package                | 1 if the click/booking was generated as a part of a package (i.e. combined with a flight), 0 otherwise                                             | int
channel                   | ID of a marketing channel                                                        | int
srch_ci                   | Checkin date                                                                     | string
srch_co                   | Checkout date                                                                    | string
srch_adults_cnt           | The number of adults specified in the hotel room                                 | int
srch_children_cnt         | The number of (extra occupancy) children specified in the hotel room             | int
srch_rm_cnt               | The number of hotel rooms specified in the search                                | int
srch_destination_id       | ID of the destination where the hotel search was performed                       | int
srch_destination_type_id  | Type of destination                                                              | int
hotel_continent           | Hotel continent                                                                  | int
hotel_country             | Hotel country                                                                    | int
hotel_market              | Hotel market                                                                     | int
is_booking                | 1 if a booking, 0 if a click                                                     | tinyint
cnt                       | Number of similar events in the context of the same user session                 | bigint
hotel_cluster             | ID of a hotel cluster                                                            | int

### Destinations

Column name         | Description                                                | Data type
------------------- | ---------------------------------------------------------- | ---------
srch_destination_id | ID of the destination where the hotel search was performed | int
d1-d149             | latent description of search regions                       | double


# Data Exploration

In [None]:
# Request an English UTF-8 environment: LANG for the process environment,
# LC_TIME for the formatting of dates and times.
Sys.setenv("LANG" = "en_US.UTF-8")
Sys.setlocale(category = "LC_TIME", locale = "en_US.UTF-8")

library(ggplot2)    # Data visualization
library(data.table) # Faster data reading
library(dplyr)      # Data aggregation etc.
library(scales)     # Plot scaling
#library(Rmisc)      # multiplot
#library(gridExtra)  # Arrange plots
#library(corrplot)   # Correlations

In [None]:
# Decompress the training CSV through the shell and read only the first
# 500000 lines of it (the header line counts as one of those lines).
train <- fread(
  input = "zcat data/expedia/train1M.csv.gz | head -500000",
  sep = ",",
  header = TRUE
)

In [None]:
# convert to factors
# Every column except the timestamp and the distance is treated as
# categorical. srch_ci / srch_co are turned into factors here as well and
# are converted to Date further down.
cols <- colnames(train)
for (col in cols[!cols %in% c("date_time", "orig_destination_distance")]) {
  train[[col]] <- as.factor(train[[col]])
}

# convert dates
# Parse the timestamp in UTC rather than in the session's local time zone.
# as.Date() on a POSIXct extracts the calendar date in UTC by default, so a
# locally parsed timestamp close to midnight would end up on the neighbouring
# day. Fixing the zone to UTC also avoids timestamps that fall into a local
# daylight-saving gap.
train$date_time <- as.POSIXct(train$date_time, tz = "UTC")
train$date <- as.Date(train$date_time, tz = "UTC")
train$srch_ci <- as.Date(train$srch_ci)
train$srch_co <- as.Date(train$srch_co)

In [None]:
# Print a compact overview of the columns of `train` and their types.
str(train)

## Booking Analysis

In [None]:
# Number of events per calendar day, separately for each is_booking value
# (1 = booking, 0 = click).
train.agg <- summarise(
  group_by(train, date, is_booking),
  count = n()
)

# Daily event counts over time, one line per is_booking value.
ggplot(
  data = train.agg,
  mapping = aes(x = date, y = count, color = is_booking)
) +
  geom_line(size = 0.2) +
  theme(legend.position = "top")

### Booking in advance

In [None]:
# Search date against check-in date, restricted to rows that are bookings.
ggplot(
  data = train[which(train$is_booking == 1)],
  mapping = aes(x = date, y = srch_ci)
) +
  geom_point(color = "blue", alpha = 0.1, size = 0.4)

### Influence of Weekdays

In [None]:
#train$week <- cut(train$date, "weeks")
# Calendar components of the search timestamp.
train$Year  <- as.numeric(format(train$date_time, "%Y"))
train$Month <- as.numeric(format(train$date_time, "%m"))
train$Hour  <- as.numeric(format(train$date_time, "%H"))

# add weekdays (with ordered levels)
wd <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")

# Map dates to a weekday factor with levels Mon..Sun.
# "%u" yields the weekday as a number (1 = Monday ... 7 = Sunday) regardless
# of the LC_TIME locale. weekdays() returns localized names, so matching them
# against the English labels would silently produce all-NA factors whenever
# the English locale is not available on the machine.
weekday_factor <- function(dates, labels = wd) {
  factor(labels[as.integer(format(dates, "%u"))], levels = labels)
}

train$weekday    <- weekday_factor(train$date)
train$weekday_ci <- weekday_factor(train$srch_ci)
train$weekday_co <- weekday_factor(train$srch_co)

# more feature engineering
# Length of stay in days; -1 marks rows whose check-in is after check-out.
train$durStay <- as.numeric(train$srch_co - train$srch_ci)
train$durStay[train$srch_ci > train$srch_co] <- -1

# Days between the search date and check-in; -1 marks negative values and
# values above 1000 days.
train$timeToDep <- as.numeric(train$srch_ci - train$date)
train$timeToDep[train$timeToDep < 0] <- -1
train$timeToDep[train$timeToDep > 1000] <- -1

In [None]:
# Events per weekday of the search date, for clicks (is_booking == 0) ...
srch_wd <- ggplot(
  data = train[which(train$is_booking == 0)],
  mapping = aes(x = weekday, fill = weekday)
) +
  geom_bar() +
  theme(legend.position = "none")

# ... and for bookings (is_booking == 1).
book_wd <- ggplot(
  data = train[which(train$is_booking == 1)],
  mapping = aes(x = weekday, fill = weekday)
) +
  geom_bar() +
  theme(legend.position = "none")

# Display both bar charts one after the other.
srch_wd
book_wd
#multiplot(srch_wd, book_wd, cols=1)
#grid.arrange(srch_wd, book_wd, nrow=2, ncol=1)