In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
with open('data/training_set_VU_DM_2014.csv', 'r') as csvfile:
    train = pd.read_csv(csvfile)

# Feature Extraction

### Price Difference

In [7]:
#difference in current search price to previous search price
price_difference_series = train.set_index(['date_time']).sort_index().groupby(['prop_id']).apply(lambda x: x.price_usd.diff()).reset_index()
train = train.sort_values(['prop_id','date_time']).reset_index(drop=True)
train['price_difference'] = price_difference_series.price_usd
train.price_difference = train.price_difference.fillna(0)

KeyboardInterrupt: 

### Hotel Quality

In [3]:
#adding 1 so that we do not get missing values
train.booking_bool = train.booking_bool.add(1)
train.click_bool = train.click_bool.add(1)

#number of times each prop_id has been booked
booking_series_ = train.groupby(['booking_bool']).get_group(2).groupby(['prop_id']).count().booking_bool
booking_series = train.groupby(['booking_bool']).get_group(1).groupby(['prop_id']).count().booking_bool

#number of times each prop_id has been clicked
click_series_ = train.groupby(['click_bool']).get_group(2).groupby(['prop_id']).count().click_bool
click_series = train.groupby(['click_bool']).get_group(1).groupby(['prop_id']).count().click_bool

#number of times each prop_id has appeared in all searches
count_series = train.groupby(['prop_id']).count().srch_id

hotel_quality_booking = booking_series.divide(count_series)
hotel_quality_click = click_series.divide(count_series)

#append the hotel quality to the train dataframe
train = train.set_index(['prop_id']).sort_index()
train['hotel_quality_booking'] = hotel_quality_booking
train['hotel_quality_click'] = hotel_quality_click

#reset the index back to normal
train = train.reset_index()

In [9]:
print(
    booking_series
)

prop_id
1          61
2          10
3          80
4          21
5          30
6          10
7           7
8           5
9          17
10          2
11        127
12          3
13          6
14          2
15         89
16         58
17         54
18         13
19         12
20          1
21          1
22          2
24         26
25          2
26          5
28          4
29         10
30         21
31          9
32         33
         ... 
140786     12
140787     52
140788     22
140789      1
140791      6
140793      1
140795    150
140796      6
140797     43
140799     30
140800     52
140801    106
140802     18
140804      9
140805    113
140806      1
140808     13
140809    153
140810     40
140811      9
140812     19
140813     28
140814      8
140815    111
140816    158
140817      3
140818      3
140819      2
140820     18
140821      6
Name: booking_bool, Length: 128647, dtype: int64


### Hotel Position

In [3]:
#position of the hotel in the same destination in previous and next searches
hotel_position_series = train.set_index(['date_time']).sort_index().groupby(['prop_id']).apply(lambda x: x.position.rolling(window=3, center=True).mean()).reset_index()
train = train.sort_values(['prop_id','date_time']).reset_index(drop=True)
train['hotel_position_avg'] = hotel_position_series.position
train['hotel_position_avg'] = train['hotel_position_avg'].fillna(-1)

In [4]:
print(
    train.hotel_position_avg
)

0          -1.000000
1          31.666667
2          36.333333
3          35.000000
4          33.000000
5          32.000000
6          32.000000
7          33.666667
8          29.333333
9          29.333333
10         29.333333
11         28.333333
12         27.333333
13         27.000000
14         32.666667
15         31.333333
16         30.666667
17         26.666667
18         29.000000
19         22.333333
20         24.666667
21         25.333333
22         31.666667
23         25.000000
24         23.666667
25         24.333333
26         28.666667
27         28.666667
28         29.666667
29         34.333333
             ...    
4958317    -1.000000
4958318    20.000000
4958319    -1.000000
4958320    -1.000000
4958321    -1.000000
4958322    -1.000000
4958323    24.666667
4958324    25.333333
4958325    24.666667
4958326    23.333333
4958327    20.333333
4958328    15.666667
4958329    21.333333
4958330    21.000000
4958331    20.333333
4958332    14.666667
4958333    15

### Price Rank

In [8]:
#order of the price within srch_id
train['price_rank'] = train.groupby(['srch_id'])['price_usd'].rank()

### Star Rank

In [9]:
#order of the star rating within srch_id
train['star_rank'] = train.groupby(['srch_id'])['prop_starrating'].rank()

### Price Difference Rank

In [10]:
#difference in price, negative difference ranked higher than positive difference, 
#I.e. if a property reduces in price between searches this is ranked high
train['price_difference_rank'] = train.groupby(['prop_id'])['price_difference'].rank()