In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
with open('data/training_set_VU_DM_2014.csv', 'r') as csvfile:
    train = pd.read_csv(csvfile)

# Feature Extraction

### Price Difference

In [5]:
#difference in current search price to previous search price
price_difference_series = train.set_index(['date_time']).sort_index().groupby(['prop_id']).apply(lambda x: x.price_usd.diff()).reset_index()
train = train.sort_values(['prop_id','date_time']).reset_index(drop=True)
train['price_difference'] = price_difference_series.price_usd

### Hotel Quality

In [6]:
#number of times each prop_id has been booked
booking_series = train.groupby(['booking_bool']).get_group(1).groupby(['prop_id']).count().booking_bool

#number of times each prop_id has been clicked
click_series = train.groupby(['click_bool']).get_group(1).groupby(['prop_id']).count().click_bool

#number of times each prop_id has appeared in all searches
count_series = train.groupby(['prop_id']).count().srch_id

hotel_quality_booking = booking_series.divide(count_series)
hotel_quality_click = click_series.divide(count_series)

#append the hotel quality to the train dataframe
train = train.set_index(['prop_id']).sort_index()
train['hotel_quality_booking'] = hotel_quality_booking
train['hotel_quality_click'] = hotel_quality_click

#reset the index back to normal
train = train.reset_index()

### Hotel Position

In [7]:
#position of the hotel in the same destination in previous and next searches
hotel_position_series = train.set_index(['date_time']).sort_index().groupby(['prop_id']).apply(lambda x: x.position.rolling(window=3, center=True).mean()).reset_index()
train = train.sort_values(['prop_id','date_time']).reset_index(drop=True)
train['hotel_position_avg'] = hotel_position_series.position

### Price Rank

In [8]:
#order of the price within srch_id
train['price_rank'] = train.groupby(['srch_id'])['price_usd'].rank()

### Star Rank

In [9]:
#order of the star rating within srch_id
train['star_rank'] = train.groupby(['srch_id'])['prop_starrating'].rank()

### Price Difference Rank

In [10]:
#difference in price, negative difference ranked higher than positive difference, 
#I.e. if a property reduces in price between searches this is ranked high
train['price_difference_rank'] = train.groupby(['prop_id'])['price_difference'].rank()

# Feature Normalisation

### log(price_usd)

In [16]:
#only used as the missing value analysis is not complete
train.price_usd[train.price_usd == 0] = 1
train['price_usd_log10'] = np.log10(train.price_usd)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [27]:
print(
    pd.cut(train.price_usd_log10, 10, retbins=True)
)

(0          (1.718, 2.648]
1          (1.718, 2.648]
2          (1.718, 2.648]
3          (1.718, 2.648]
4          (1.718, 2.648]
5          (1.718, 2.648]
6          (1.718, 2.648]
7          (1.718, 2.648]
8          (1.718, 2.648]
9          (1.718, 2.648]
10         (1.718, 2.648]
11         (1.718, 2.648]
12         (1.718, 2.648]
13         (1.718, 2.648]
14         (1.718, 2.648]
15         (1.718, 2.648]
16         (1.718, 2.648]
17         (1.718, 2.648]
18         (1.718, 2.648]
19         (1.718, 2.648]
20         (1.718, 2.648]
21         (1.718, 2.648]
22         (1.718, 2.648]
23         (1.718, 2.648]
24         (1.718, 2.648]
25         (1.718, 2.648]
26         (1.718, 2.648]
27         (1.718, 2.648]
28         (1.718, 2.648]
29         (1.718, 2.648]
                ...      
4958317    (1.718, 2.648]
4958318    (1.718, 2.648]
4958319    (1.718, 2.648]
4958320    (1.718, 2.648]
4958321    (1.718, 2.648]
4958322    (1.718, 2.648]
4958323    (1.718, 2.648]
4958324    

### log(price_usd) w.r.t. srch_id

### log(price_usd) w.r.t. prop_id

### log(price_usd) w.r.t. srch_destination_id