In [3]:
import pandas as pd
import numpy as np
import seaborn as sns


In [5]:
# One-off conversion from CSV to HDF5 (HDF5 reads much faster than CSV).
# Uncomment and run once if data/data.h5 does not exist yet:
#pd.read_csv('path/to/train.csv').to_hdf('data/data.h5', 'train')
#pd.read_csv('path/to/test.csv').to_hdf('data/data.h5', 'test')

# Both frames live in the same HDF5 file under the keys 'train' and 'test'.
train_data, test_data = (
    pd.read_hdf('data/data.h5', key=split) for split in ('train', 'test')
)


In [6]:
# Peek at the first five rows to get a feel for the columns.
train_data.head(n=5)

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,,,,,0.0,0.0,,0,,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,,,,,0.0,0.0,,0,,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,,,,,0.0,0.0,,0,,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,,,,,-1.0,0.0,5.0,0,,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,,,,,0.0,0.0,,0,,0


In [7]:
# Number of distinct search ids; dropna=False keeps the count identical to
# len(unique()), which would also count a NaN entry if one existed.
n_searches = train_data['srch_id'].nunique(dropna=False)
print("Number of unique srch_id's: %s" % n_searches)

Number of unique srch_id's: 199795


In [10]:
# Rows per outcome flag; count() tallies the non-null srch_id values in each group.
clicks_dist = train_data.groupby('click_bool')['srch_id'].count()
bookings_dist = train_data.groupby('booking_bool')['srch_id'].count()
joint_dist = train_data.groupby(['click_bool', 'booking_bool'])['srch_id'].count()

print("Distribution of clicks: \n%s" % clicks_dist)
print("Distribution of bookings: \n%s" % bookings_dist)
print("Distribution of clicks and bookings: \n%s" % joint_dist)

Distribution of clicks: 
click_bool
0    4736468
1     221879
Name: srch_id, dtype: int64
Distribution of bookings: 
booking_bool
0    4819957
1     138390
Name: srch_id, dtype: int64
Distribution of clicks and bookings: 
click_bool  booking_bool
0           0               4736468
1           0                 83489
            1                138390
Name: srch_id, dtype: int64


In [15]:
# Fraction of missing values in every training column (0.0 = complete, 1.0 = all NaN).
# The mean of the boolean null mask equals null-count / row-count.
train_nan_ratio = train_data.isnull().mean()
# Start the table on its own line so the first row stays aligned with the others.
print("Ratio of nans per feature: \n%s" % train_nan_ratio)

Ratio of nans per feature srch_id                        0.000000
date_time                      0.000000
site_id                        0.000000
visitor_location_country_id    0.000000
visitor_hist_starrating        0.949204
visitor_hist_adr_usd           0.948977
prop_country_id                0.000000
prop_id                        0.000000
prop_starrating                0.000000
prop_review_score              0.001485
prop_brand_bool                0.000000
prop_location_score1           0.000000
prop_location_score2           0.219902
prop_log_historical_price      0.000000
position                       0.000000
price_usd                      0.000000
promotion_flag                 0.000000
srch_destination_id            0.000000
srch_length_of_stay            0.000000
srch_booking_window            0.000000
srch_adults_count              0.000000
srch_children_count            0.000000
srch_room_count                0.000000
srch_saturday_night_bool       0.000000
srch_query_aff

In [16]:
# Fraction of missing values in every test column (0.0 = complete, 1.0 = all NaN).
# The mean of the boolean null mask equals null-count / row-count.
test_nan_ratio = test_data.isnull().mean()
# Start the table on its own line so the first row stays aligned with the others.
print("Ratio of nans per feature: \n%s" % test_nan_ratio)

Ratio of nans per feature srch_id                        0.000000
date_time                      0.000000
site_id                        0.000000
visitor_location_country_id    0.000000
visitor_hist_starrating        0.948897
visitor_hist_adr_usd           0.948656
prop_country_id                0.000000
prop_id                        0.000000
prop_starrating                0.000000
prop_review_score              0.001465
prop_brand_bool                0.000000
prop_location_score1           0.000000
prop_location_score2           0.219397
prop_log_historical_price      0.000000
price_usd                      0.000000
promotion_flag                 0.000000
srch_destination_id            0.000000
srch_length_of_stay            0.000000
srch_booking_window            0.000000
srch_adults_count              0.000000
srch_children_count            0.000000
srch_room_count                0.000000
srch_saturday_night_bool       0.000000
srch_query_affinity_score      0.935845
orig_destinati

In [18]:
# Column counts (test first, then train), followed by the columns that only
# exist in the training frame.
n_test_cols, n_train_cols = test_data.shape[1], train_data.shape[1]
print("%s, %s" % (n_test_cols, n_train_cols))
print(set(train_data.columns).difference(test_data.columns))

50, 54
{'click_bool', 'booking_bool', 'position', 'gross_bookings_usd'}


In [42]:
# Count the rows whose comp2_rate_percent_diff exceeds the threshold.
# NaN entries compare as False and are therefore not counted.
# NOTE(review): the label used to say "100" while the comparison used 200.
# The threshold is kept at 200 and the label is now derived from it so the
# two cannot drift apart -- confirm that 200 is the intended cut-off.
pct_threshold = 200
n_above_threshold = (train_data.comp2_rate_percent_diff > pct_threshold).sum()
print('Percentages greater than %s: %s' % (pct_threshold, n_above_threshold))


Percentages greater than 100: 1429


In [50]:
# Booking rate (bookings / rows) for every (site, visitor country) pair,
# highest first. The same grouping serves both numerator and denominator.
by_site_and_country = train_data.groupby(['site_id', 'visitor_location_country_id'])
booking_rate = by_site_and_country.booking_bool.sum() / by_site_and_country.srch_id.count()
booking_rate.sort_values(ascending=False)

site_id  visitor_location_country_id
14       115                            0.200000
30       164                            0.200000
23       81                             0.200000
18       224                            0.200000
24       53                             0.200000
7        202                            0.166667
27       70                             0.166667
25       128                            0.166667
18       70                             0.166667
32       12                             0.166667
15       156                            0.166667
10       106                            0.166667
29       50                             0.166667
32       16                             0.166667
29       70                             0.166667
15       163                            0.142857
24       71                             0.142857
12       194                            0.142857
32       21                             0.142857
10       200                    

In [52]:
# Number of distinct site ids (dropna=False mirrors len(unique())).
train_data['site_id'].nunique(dropna=False)

34

In [55]:
# Does it matter if visitor has booked @ expedia before?
# Flag the rows where visitor_hist_starrating is missing, then compare the
# click rate (clicks / rows) between the two groups.
train_data['visitor_hist_starrating_isnull'] = train_data['visitor_hist_starrating'].isna()
by_missing_starrating = train_data.groupby('visitor_hist_starrating_isnull')
by_missing_starrating.click_bool.sum() / by_missing_starrating.srch_id.count()

visitor_hist_starrating_isnull
False    0.044238
True     0.044776
dtype: float64

In [58]:
# Booking rate (bookings / rows) split by whether visitor_hist_starrating is missing.
# The flag column is (re)computed here so the cell does not depend on another cell.
train_data['visitor_hist_starrating_isnull'] = train_data['visitor_hist_starrating'].isna()
by_missing_starrating = train_data.groupby('visitor_hist_starrating_isnull')
by_missing_starrating.booking_bool.sum() / by_missing_starrating.srch_id.count()

visitor_hist_starrating_isnull
False    0.036083
True     0.027473
dtype: float64

In [56]:
# Click rate (clicks / rows) split by whether visitor_hist_adr_usd is missing.
train_data['visitor_hist_adr_usd_isnull'] = train_data['visitor_hist_adr_usd'].isna()
by_missing_adr = train_data.groupby('visitor_hist_adr_usd_isnull')
by_missing_adr.click_bool.sum() / by_missing_adr.srch_id.count()

visitor_hist_adr_usd_isnull
False    0.044279
True     0.044774
dtype: float64

In [57]:
# Booking rate (bookings / rows) split by whether visitor_hist_adr_usd is missing.
# The flag column is (re)computed here so the cell does not depend on another cell.
train_data['visitor_hist_adr_usd_isnull'] = train_data['visitor_hist_adr_usd'].isna()
by_missing_adr = train_data.groupby('visitor_hist_adr_usd_isnull')
by_missing_adr.booking_bool.sum() / by_missing_adr.srch_id.count()

visitor_hist_adr_usd_isnull
False    0.036104
True     0.027470
dtype: float64

In [59]:
# Summary statistics of visitor_hist_starrating (describe() ignores NaN values).
train_data['visitor_hist_starrating'].describe()

count    251866.000000
mean          3.374334
std           0.692519
min           1.410000
25%           2.920000
50%           3.450000
75%           3.930000
max           5.000000
Name: visitor_hist_starrating, dtype: float64