In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# set the number of diplayed columns in jupyter notebook to 100
pd.set_option('display.max_columns', 100)

In [3]:
df_train = pd.read_csv('data/training_set_VU_DM.csv')
df_train['is_test'] = False
df_test = pd.read_csv('data/test_set_VU_DM.csv')
#df_test = pd.read_csv('data/test_revealed.csv')
df_test['is_test'] = True

In [4]:
# There is no correspondence between the srch_id's of the train and test sets, so we need to make them unique for further aggregation purposes
TEST_SRCH_ID_OFFSET = df_train.srch_id.max()
df_test['srch_id'] += TEST_SRCH_ID_OFFSET
assert len(set(df_test.srch_id.unique()).intersection(set(df_train.srch_id.unique()))) == 0

In [5]:
df_joined = pd.concat([df_train, df_test], axis=0, ignore_index=True)
df_joined.shape

(9917530, 55)

In [6]:

#del df_train, df_test
# call the garbage collector to free up memory
import gc
gc.collect()

0

In [7]:
df = df_joined.copy()
gc.collect()

0

# NAs

In [8]:
def print_missing(df, col):
    miss_cnt = df[col].isna().sum()
    total_cnt = len(df[col])
    perc_miss = 100 * miss_cnt / total_cnt
    print(f'{col} missing count {miss_cnt} out of {total_cnt} => {round(perc_miss)}% missing')

for col in [c for c in df.columns if df[c].isna().any() and not c.startswith('comp')]:
    print_missing(df, col)

visitor_hist_starrating missing count 9412233 out of 9917530 => 95% missing
visitor_hist_adr_usd missing count 9409918 out of 9917530 => 95% missing
prop_review_score missing count 14630 out of 9917530 => 0% missing
prop_location_score2 missing count 2178380 out of 9917530 => 22% missing
position missing count 4959183 out of 9917530 => 50% missing
srch_query_affinity_score missing count 9281966 out of 9917530 => 94% missing
orig_destination_distance missing count 3216461 out of 9917530 => 32% missing
click_bool missing count 4959183 out of 9917530 => 50% missing
gross_bookings_usd missing count 9779140 out of 9917530 => 99% missing
booking_bool missing count 4959183 out of 9917530 => 50% missing


In [9]:
# zero have special meaning, we don't want this value to interfere with various calculations, we will deal with NAs in derived values later
df['prop_starrating_w0'] = df['prop_starrating'].copy()
df.loc[df.prop_starrating_w0==0, 'prop_starrating_w0'] = pd.NA
df.loc[df.prop_review_score==0, 'prop_review_score'] = pd.NA

In [10]:
for col in [c for c in df.columns if df[c].isna().any() and not c.startswith('comp')]:
    print_missing(df, col)

# prop_starrating missing count 337794 out of 9917530 => 3% missing
# prop_review_score missing count 482116 out of 9917530 => 5% missing

visitor_hist_starrating missing count 9412233 out of 9917530 => 95% missing
visitor_hist_adr_usd missing count 9409918 out of 9917530 => 95% missing
prop_review_score missing count 482116 out of 9917530 => 5% missing
prop_location_score2 missing count 2178380 out of 9917530 => 22% missing
position missing count 4959183 out of 9917530 => 50% missing
srch_query_affinity_score missing count 9281966 out of 9917530 => 94% missing
orig_destination_distance missing count 3216461 out of 9917530 => 32% missing
click_bool missing count 4959183 out of 9917530 => 50% missing
gross_bookings_usd missing count 9779140 out of 9917530 => 99% missing
booking_bool missing count 4959183 out of 9917530 => 50% missing
prop_starrating_w0 missing count 337794 out of 9917530 => 3% missing


# Reducing number of columns

In [11]:
# Aggregate competitor information - perhaps not

# # add flag to indicate if any competitor has availability at a better rate
# for i in range(1, 9):
#     df[f'comp{i}_known'] = ~(df[f'comp{i}_rate'].isna() | df[f'comp{i}_inv'].isna())
#     df[f'comp{i}_better'] = df[f'comp{i}_known'] & (df[f'comp{i}_rate']==-1) & (df[f'comp{i}_inv']<=0)
#     df[f'comp{i}_worse'] = df[f'comp{i}_known'] & (df[f'comp{i}_rate']==1) & (df[f'comp{i}_inv']>=0)
#
# df['comp_known_cnt'] = sum([df[f'comp{i}_known'].astype(int) for i in range(1, 9)])
# df['comp_better_worse'] = \
#     (sum([df[f'comp{i}_better'].astype(int) for i in range(1, 9)])
#      -sum([df[f'comp{i}_worse'].astype(int) for i in range(1, 9)]))

In [12]:
#comp_rate_cols = [f'comp{i}_rate' for i in range(1, 9)]
#df['comp_rate_sum'] = df[comp_rate_cols].fillna(0).sum(axis=1)

In [13]:
# TODO: we could get rid of 'comp_better_cnt' > X since their effect on booking and click probability seems similar
# df['comp_better_cnt'] = np.miminum(df['comp_better_cnt'], 4)

# TODO: we could get rid of 'comp_known_cnt' > X
# df['comp_known_cnt'] = np.miminum(df['comp_known_cnt'], 4)

In [14]:
# TODO: add best discount
# for i in range(1, 9):
#     df[f'comp{i}_discount'] = \
#         (df[f'comp{i}_known']).astype(int) \
#         * (df[f'comp{i}_inv']<=0).astype(int) \
#         * (df[f'comp{i}_rate']!=0).astype(int) \
#         * (df[f'comp{i}_rate_percent_diff']<55).astype(int) \
#         * -1 * df[f'comp{i}_rate'] \
#         * df[f'comp{i}_rate_percent_diff']
#
# df['comp_best_discount'] = df[[f'comp{i}_discount' for i in range(1, 9)]].max(axis=1)
# df['comp_best_discount'].fillna(0, inplace=True)

In [15]:
#comps = [f'comp{i}_' for i in range(1, 9)]
#df.drop(columns=[c for c in df.columns if c[:6] in comps], inplace=True)

In [16]:
df.drop(columns=['gross_bookings_usd'], inplace=True)

In [17]:
# TODO: some aggregation might work?
df.drop(columns=['orig_destination_distance'], inplace=True)

In [18]:
df[df.prop_log_historical_price == 0]['prop_log_historical_price'] = pd.NA

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df.prop_log_historical_price == 0]['prop_log_historical_price'] = pd.NA


In [19]:
df

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,booking_bool,is_test,prop_starrating_w0
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,1,2.83,0.0438,4.95,27.0,104.77,0,23246,1,0,4,0,1,1,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0.0,0.0,False,3.0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,1,2.20,0.0149,5.03,26.0,170.74,0,23246,1,0,4,0,1,1,,1,,,,,,,0.0,0.0,,,,,0.0,1.0,,,,,,,,0.0,0.0,,0.0,0.0,False,4.0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,1,2.20,0.0245,4.92,21.0,179.80,0,23246,1,0,4,0,1,1,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0.0,0.0,False,3.0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,1,2.83,0.0125,4.39,34.0,602.77,0,23246,1,0,4,0,1,1,,1,,,,-1.0,0.0,5.0,-1.0,0.0,5.0,,,,0.0,1.0,,,,,,,,-1.0,0.0,5.0,0.0,0.0,False,2.0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,1,2.64,0.1241,4.93,4.0,143.58,0,23246,1,0,4,0,1,1,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0.0,0.0,False,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9917525,665572,2013-05-21 11:06:37,24,216,,,117,32019,4,3.5,0,2.48,0.0551,4.53,,66.07,0,19246,2,7,1,0,1,0,,0,,,,1.0,0.0,22.0,1.0,0.0,127.0,-1.0,0.0,27.0,1.0,0.0,22.0,,,,,,,,,,,,True,4.0
9917526,665572,2013-05-21 11:06:37,24,216,,,117,33959,4,3.0,1,2.20,0.3344,4.39,,67.10,0,19246,2,7,1,0,1,0,,0,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,16.0,1.0,0.0,22.0,,,,,,,,,,,,True,4.0
9917527,665572,2013-05-21 11:06:37,24,216,,,117,35240,4,,0,1.79,,4.64,,73.91,0,19246,2,7,1,0,1,0,,0,,,,1.0,0.0,55.0,0.0,0.0,,0.0,0.0,16.0,0.0,0.0,3.0,,,,,,,,,,,,True,4.0
9917528,665572,2013-05-21 11:06:37,24,216,,,117,94437,4,,0,2.94,0.0928,4.64,,66.07,0,19246,2,7,1,0,1,0,,0,,,,1.0,0.0,43.0,1.0,0.0,43.0,-1.0,0.0,12.0,-1.0,0.0,12.0,,,,,,,,,,,,True,4.0


# date_time

In [20]:
def convert_date(df):
    df['date_time'] = pd.to_datetime(df['date_time'])
    df['booking_week'] = df['date_time'].dt.isocalendar().week
    df['booking_month'] = df['date_time'].dt.month
    df['booking_dayofyear'] = df['date_time'].dt.dayofyear
    df['booking_dayofweek'] = df['date_time'].dt.dayofweek
    midstay = df['date_time'] + pd.to_timedelta(df.srch_booking_window, unit='days') + pd.to_timedelta(df.srch_length_of_stay//2, unit='days')
    df['midstay_week'] = midstay.dt.isocalendar().week
    df['midstay_month'] = midstay.dt.month
    df['midstay_dayofyear'] = midstay.dt.dayofyear
    df['midstay_dayofweek'] = midstay.dt.dayofweek
    df.drop(columns='date_time', inplace=True)

convert_date(df)

In [21]:
df

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,booking_bool,is_test,prop_starrating_w0,booking_week,booking_month,booking_dayofyear,booking_dayofweek,midstay_week,midstay_month,midstay_dayofyear,midstay_dayofweek
0,1,12,187,,,219,893,3,3.5,1,2.83,0.0438,4.95,27.0,104.77,0,23246,1,0,4,0,1,1,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0.0,0.0,False,3.0,14,4,94,3,14,4,94,3
1,1,12,187,,,219,10404,4,4.0,1,2.20,0.0149,5.03,26.0,170.74,0,23246,1,0,4,0,1,1,,1,,,,,,,0.0,0.0,,,,,0.0,1.0,,,,,,,,0.0,0.0,,0.0,0.0,False,4.0,14,4,94,3,14,4,94,3
2,1,12,187,,,219,21315,3,4.5,1,2.20,0.0245,4.92,21.0,179.80,0,23246,1,0,4,0,1,1,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0.0,0.0,False,3.0,14,4,94,3,14,4,94,3
3,1,12,187,,,219,27348,2,4.0,1,2.83,0.0125,4.39,34.0,602.77,0,23246,1,0,4,0,1,1,,1,,,,-1.0,0.0,5.0,-1.0,0.0,5.0,,,,0.0,1.0,,,,,,,,-1.0,0.0,5.0,0.0,0.0,False,2.0,14,4,94,3,14,4,94,3
4,1,12,187,,,219,29604,4,3.5,1,2.64,0.1241,4.93,4.0,143.58,0,23246,1,0,4,0,1,1,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0.0,0.0,False,4.0,14,4,94,3,14,4,94,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9917525,665572,24,216,,,117,32019,4,3.5,0,2.48,0.0551,4.53,,66.07,0,19246,2,7,1,0,1,0,,0,,,,1.0,0.0,22.0,1.0,0.0,127.0,-1.0,0.0,27.0,1.0,0.0,22.0,,,,,,,,,,,,True,4.0,21,5,141,1,22,5,149,2
9917526,665572,24,216,,,117,33959,4,3.0,1,2.20,0.3344,4.39,,67.10,0,19246,2,7,1,0,1,0,,0,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,16.0,1.0,0.0,22.0,,,,,,,,,,,,True,4.0,21,5,141,1,22,5,149,2
9917527,665572,24,216,,,117,35240,4,,0,1.79,,4.64,,73.91,0,19246,2,7,1,0,1,0,,0,,,,1.0,0.0,55.0,0.0,0.0,,0.0,0.0,16.0,0.0,0.0,3.0,,,,,,,,,,,,True,4.0,21,5,141,1,22,5,149,2
9917528,665572,24,216,,,117,94437,4,,0,2.94,0.0928,4.64,,66.07,0,19246,2,7,1,0,1,0,,0,,,,1.0,0.0,43.0,1.0,0.0,43.0,-1.0,0.0,12.0,-1.0,0.0,12.0,,,,,,,,,,,,True,4.0,21,5,141,1,22,5,149,2


In [22]:
# TODO:
# get a smart booking_period_congestion (per prop, dest)
# for each property and destination get a day-of-year load schedule
# for each search period do a sum over the booked days, this is the congestion of the booking

# Numerical column normalization

In [23]:
# get average price per property - not a good idea, price can be total or per-room, with or without taxes
# get average price per destination, prop_starrating - not a good idea, price can be total or per-room, with or without taxes
# get average price per destination, prop_starrating, prop_review_score - same
# get average price per destination - same

## prop_starrating

In [24]:
def to_list(v):
    if hasattr(v, '__iter__') and type(v) != str:
        return v
    else:
        return [v]

In [25]:
def make_aggregate_feature(groupby, col, transform='z', use_median=False, filter_query=None):
    gl = to_list(groupby)
    l = list(set(gl + ['srch_id', 'prop_id', col]))
    tmp = df.query(filter_query) if filter_query else df
    tmp = tmp[l].groupby(groupby).agg(
        col_avg = (col, 'mean'),
        col_med = (col, 'median'),
        col_std = (col, 'std'),
    )
    print(f'{tmp.col_std.isna().sum() / len(tmp):.2} NAs')

    tmp1 = df[l].set_index(gl)
    tmp1 = tmp1.join(tmp, how='left')
    if transform == 'z':
        res_col = f'{col}_z_{"__".join(gl)}'
        tmp1[res_col] = (tmp1[col] - (tmp1.col_med if use_median else tmp1.col_avg)) / tmp1.col_std
    elif transform == 'log_diff':
        res_col = f'{col}_ld_{"__".join(gl)}'
        tmp1[res_col] = np.log(tmp1[col]) - np.log(tmp1.col_med if use_median else tmp1.col_avg)
    elif transform == 'diff':
        res_col = f'{col}_d_{"__".join(gl)}'
        tmp1[res_col] = tmp1[col] - (tmp1.col_med if use_median else tmp1.col_avg)
    else:
        assert False, f'unsupported transform "{transform}"'
    tmp1 = tmp1.reset_index().set_index(['srch_id', 'prop_id']).sort_index()
    return tmp1.drop(columns=[c for c in tmp1.columns if c != res_col])

In [26]:
# the average property star rating per destination
tosub_prop_starrating_per_dest = make_aggregate_feature(groupby ='srch_destination_id', col ='prop_starrating_w0', transform='diff', use_median=True)
tosub_prop_starrating_per_dest

0.0023 NAs


Unnamed: 0_level_0,Unnamed: 1_level_0,prop_starrating_w0_d_srch_destination_id
srch_id,prop_id,Unnamed: 2_level_1
1,893,0.0
1,10404,1.0
1,21315,0.0
1,27348,-1.0
1,29604,1.0
...,...,...
665572,32019,0.0
665572,33959,0.0
665572,35240,0.0
665572,94437,0.0


In [27]:
norm_prop_starrating_per_srch = make_aggregate_feature(groupby ='srch_id', col ='prop_starrating_w0', transform='diff', use_median=True)
norm_prop_starrating_per_srch

0.00096 NAs


Unnamed: 0_level_0,Unnamed: 1_level_0,prop_starrating_w0_d_srch_id
srch_id,prop_id,Unnamed: 2_level_1
1,893,0.0
1,10404,1.0
1,21315,0.0
1,27348,-1.0
1,29604,1.0
...,...,...
665572,32019,0.0
665572,33959,0.0
665572,35240,0.0
665572,94437,0.0


In [28]:
df['hist_starrating_diff'] = df['visitor_hist_starrating'] - df['prop_starrating']

## prop_review_score

In [29]:
tosub_prop_review_score_per_dest = make_aggregate_feature(groupby = ['srch_destination_id'], col ='prop_review_score', transform='diff', use_median=True)
tosub_prop_review_score_per_dest

0.0051 NAs


Unnamed: 0_level_0,Unnamed: 1_level_0,prop_review_score_d_srch_destination_id
srch_id,prop_id,Unnamed: 2_level_1
1,893,-0.5
1,10404,0.0
1,21315,0.5
1,27348,0.0
1,29604,-0.5
...,...,...
665572,32019,-0.5
665572,33959,-1.0
665572,35240,
665572,94437,


In [30]:
tosub_prop_review_score_per_dest_prop_starrating = make_aggregate_feature(groupby = ['srch_destination_id', 'prop_starrating'], col ='prop_review_score', transform='diff', use_median=True)
tosub_prop_review_score_per_dest_prop_starrating

0.16 NAs


Unnamed: 0_level_0,Unnamed: 1_level_0,prop_review_score_d_srch_destination_id__prop_starrating
srch_id,prop_id,Unnamed: 2_level_1
1,893,-0.5
1,10404,0.0
1,21315,0.5
1,27348,0.5
1,29604,-0.5
...,...,...
665572,32019,-0.5
665572,33959,-1.0
665572,35240,
665572,94437,


In [31]:
norm_prop_review_score_per_srch = make_aggregate_feature(groupby ='srch_id', col ='prop_review_score', transform='diff', use_median=True)
norm_prop_review_score_per_srch

0.00086 NAs


Unnamed: 0_level_0,Unnamed: 1_level_0,prop_review_score_d_srch_id
srch_id,prop_id,Unnamed: 2_level_1
1,893,-0.25
1,10404,0.25
1,21315,0.75
1,27348,0.25
1,29604,-0.25
...,...,...
665572,32019,-1.00
665572,33959,-1.50
665572,35240,
665572,94437,


## prop_location_score1

In [32]:
tosub_prop_location_score1_per_dest = make_aggregate_feature(groupby = ['srch_destination_id'], col ='prop_location_score1', transform='diff', use_median=True)
tosub_prop_location_score1_per_dest

0.0 NAs


Unnamed: 0_level_0,Unnamed: 1_level_0,prop_location_score1_d_srch_destination_id
srch_id,prop_id,Unnamed: 2_level_1
1,893,0.53
1,10404,-0.10
1,21315,-0.10
1,27348,0.53
1,29604,0.34
...,...,...
665572,32019,0.18
665572,33959,-0.10
665572,35240,-0.51
665572,94437,0.64


In [33]:
tosub_prop_location_score1_per_dest_prop_starrating = make_aggregate_feature(groupby = ['srch_destination_id', 'prop_starrating'], col ='prop_location_score1', transform='diff', use_median=True)
tosub_prop_location_score1_per_dest_prop_starrating

0.1 NAs


Unnamed: 0_level_0,Unnamed: 1_level_0,prop_location_score1_d_srch_destination_id__prop_starrating
srch_id,prop_id,Unnamed: 2_level_1
1,893,0.63
1,10404,-0.44
1,21315,0.00
1,27348,0.63
1,29604,0.00
...,...,...
665572,32019,0.00
665572,33959,-0.28
665572,35240,-0.69
665572,94437,0.46


In [34]:
# do multiple srch_ids correspond to one srch_destination_id?
df.groupby('srch_destination_id').srch_id.nunique() # => yes

srch_destination_id
2        3
3        1
5        2
6        2
7        1
        ..
28412    1
28413    1
28414    1
28415    1
28416    5
Name: srch_id, Length: 23715, dtype: int64

In [35]:
norm_prop_location_score1_per_srch = make_aggregate_feature(groupby=['srch_id'], col='prop_location_score1', transform='diff', use_median=True)
norm_prop_location_score1_per_srch

0.0 NAs


Unnamed: 0_level_0,Unnamed: 1_level_0,prop_location_score1_d_srch_id
srch_id,prop_id,Unnamed: 2_level_1
1,893,0.53
1,10404,-0.10
1,21315,-0.10
1,27348,0.53
1,29604,0.34
...,...,...
665572,32019,0.18
665572,33959,-0.10
665572,35240,-0.51
665572,94437,0.64


In [36]:
#make_aggregate_feature(groupby=['srch_id', 'prop_starrating'], col='prop_location_score1')

## prop_location_score2

In [37]:
tosub_prop_location_score2_per_dest_id = make_aggregate_feature(groupby = ['srch_destination_id'], col ='prop_location_score2', transform='diff', use_median=True)
tosub_prop_location_score2_per_dest_id

0.26 NAs


Unnamed: 0_level_0,Unnamed: 1_level_0,prop_location_score2_d_srch_destination_id
srch_id,prop_id,Unnamed: 2_level_1
1,893,0.0274
1,10404,-0.0015
1,21315,0.0081
1,27348,-0.0039
1,29604,0.1077
...,...,...
665572,32019,-0.0328
665572,33959,0.2465
665572,35240,
665572,94437,0.0049


In [38]:
tosub_prop_location_score2_per_dest_id_prop_starrating = make_aggregate_feature(groupby=['srch_destination_id', 'prop_starrating'], col='prop_location_score2', transform='diff', use_median=True)
tosub_prop_location_score2_per_dest_id_prop_starrating

0.52 NAs


Unnamed: 0_level_0,Unnamed: 1_level_0,prop_location_score2_d_srch_destination_id__prop_starrating
srch_id,prop_id,Unnamed: 2_level_1
1,893,0.0274
1,10404,-0.0511
1,21315,0.0081
1,27348,0.0000
1,29604,0.0581
...,...,...
665572,32019,0.0000
665572,33959,0.2793
665572,35240,
665572,94437,0.0377


In [39]:
norm_prop_location_score2_per_srch = make_aggregate_feature(groupby=['srch_id'], col='prop_location_score2', transform='diff', use_median=True)
norm_prop_location_score2_per_srch

0.05 NAs


Unnamed: 0_level_0,Unnamed: 1_level_0,prop_location_score2_d_srch_id
srch_id,prop_id,Unnamed: 2_level_1
1,893,0.01375
1,10404,-0.01515
1,21315,-0.00555
1,27348,-0.01755
1,29604,0.09405
...,...,...
665572,32019,-0.01885
665572,33959,0.26045
665572,35240,
665572,94437,0.01885


## prop_log_historical_price

In [40]:
(df.prop_log_historical_price==0).sum() / len(df)

0.14423218281164765

In [41]:
# number of booked properties
df_train.query('booking_bool==1').prop_id.nunique() / df_train.prop_id.nunique()

0.336356524904541

In [42]:
prop_with_0hist = df_train[df_train.prop_log_historical_price==0].prop_id.unique()
df_train[df_train.prop_id.isin(prop_with_0hist)].groupby(['prop_id', 'prop_log_historical_price', 'price_usd']).booking_bool.sum()

prop_id  prop_log_historical_price  price_usd
1        0.00                       90.00        0
                                    95.00        0
                                    99.00        0
                                    99.24        0
                                    108.00       0
                                                ..
140820   5.29                       108.62       0
                                    151.89       0
                                    169.00       0
         5.35                       206.14       0
         5.37                       228.88       0
Name: booking_bool, Length: 3459455, dtype: int64

In [43]:
# prop_log_historical_price has multiple value (changing over time). 0 has a special meaning so we set to to NA
df.loc[df.prop_log_historical_price==0, 'prop_log_historical_price'] = pd.NA

## price_usd

The price_usd is the displayed price of the hotel for the given search. Note that different countries have different conventions regarding displaying taxes and fees and the value may be per night or for the whole stay

Note: this implies that it's not really correct to normalize price across anything other than search_id without first transforming it to price per night (even so, the taxes are not necessarily accounted for). As such, it's not really comparable with the historical price either.

In [44]:
# A zero price is nonsensical
df.loc[df.price_usd==0, 'price_usd'] = pd.NA

In [45]:
# relative to the historical price (difference of logs is the same price_usd/exp(prop_log_historical_price) - negative bad, positive good
price_hist_logdiff = np.log(df['price_usd']) - df['prop_log_historical_price']
df['price_hist_logdiff'] = price_hist_logdiff

In [46]:
# how good of a deal this is relative to others in the search query
norm_prop_price_hist_advantage_per_srch_id = make_aggregate_feature(['srch_id'], 'price_hist_logdiff', transform='diff', use_median=True)
norm_prop_price_hist_advantage_per_srch_id

0.14 NAs


Unnamed: 0_level_0,Unnamed: 1_level_0,price_hist_logdiff_d_srch_id
srch_id,prop_id,Unnamed: 2_level_1
1,893,-0.386156
1,10404,0.022218
1,21315,0.183921
1,27348,1.923612
1,29604,-0.051031
...,...,...
665572,32019,-0.002133
665572,33959,0.153336
665572,35240,0.000000
665572,94437,-0.112133


In [47]:
norm_price_per_srch = make_aggregate_feature(['srch_id'], 'price_usd', transform='log_diff', use_median=True)
norm_price_per_srch

0.0 NAs


Unnamed: 0_level_0,Unnamed: 1_level_0,price_usd_ld_srch_id
srch_id,prop_id,Unnamed: 2_level_1
1,893,-0.283066
1,10404,0.205308
1,21315,0.257012
1,27348,1.466702
1,29604,0.032059
...,...,...
665572,32019,-0.058494
665572,33959,-0.043025
665572,35240,0.053639
665572,94437,-0.058494


In [48]:
norm_price_per_search_prop_starrating = make_aggregate_feature(['srch_id', 'prop_starrating'], 'price_usd', transform='log_diff', use_median=True)
norm_price_per_search_prop_starrating

0.16 NAs


Unnamed: 0_level_0,Unnamed: 1_level_0,price_usd_ld_srch_id__prop_starrating
srch_id,prop_id,Unnamed: 2_level_1
1,893,-0.278381
1,10404,0.112125
1,21315,0.261697
1,27348,1.749768
1,29604,-0.061124
...,...,...
665572,32019,-0.037213
665572,33959,-0.021744
665572,35240,0.074920
665572,94437,-0.037213


In [49]:
df.loc[df.visitor_hist_adr_usd==0, 'visitor_hist_adr_usd'] = pd.NA
df['visitor_hist_adr_usd_logdiff'] = np.log(df['price_usd']) - np.log(df['visitor_hist_adr_usd'])

In [50]:
norm_visitor_price_hist_advantage_per_srch_id = make_aggregate_feature(['srch_id'], 'visitor_hist_adr_usd_logdiff', transform='diff', use_median=True)
norm_visitor_price_hist_advantage_per_srch_id

0.95 NAs


Unnamed: 0_level_0,Unnamed: 1_level_0,visitor_hist_adr_usd_logdiff_d_srch_id
srch_id,prop_id,Unnamed: 2_level_1
1,893,
1,10404,
1,21315,
1,27348,
1,29604,
...,...,...
665572,32019,
665572,33959,
665572,35240,
665572,94437,


# country

In [51]:
df['same_country'] = (df['prop_country_id'] == df['visitor_location_country_id'])

# srch_query_affinity_score

In [52]:
norm_srch_query_affinity_score = make_aggregate_feature(['srch_id'], 'srch_query_affinity_score', transform='diff', use_median=True)

0.93 NAs


# Comments on aggregation over all ids (not just the training set)

When calculating averages over srch_destination_id or prop_id for instance, there is a significant number of them only encountered in the test set. Pedantically speaking we shouldn't use data from the test set in our model. However to get the best possible result in the competition we are going to leverage it.

If we would calculate means over ids from the training set only, then we would have NAs when predicting on the test set (there would be no NAs on the training set). At the very least, the validation set should have the same NA characteristics as the test set, but we can do better. We can remove aggregated values for part of the ids from the training set, thus the model learns not to rely on them too much (not more than what will be available in the test set) - make sure to discard only from the ids which don't appear in the test set or add the aggregated values back to the test set before prediction. This is expected to perform worse than using the statistics from the test set.

# using booking and click probabilities per property

## booking_prob_per_srch_res_count - not really interesting, more of an exercise

In [53]:
df_train.groupby('srch_id').agg(impression_count = ('prop_id' , 'count')).impression_count

srch_id
1         28
4         32
6          5
8         21
11        33
          ..
332777    32
332781    15
332782    24
332784    28
332785     6
Name: impression_count, Length: 199795, dtype: int64

In [54]:
# booking probability per number of impressions
# get the number of impressions per search_id
imp_cnt_per_srch_id = df_train.groupby('srch_id').agg(impression_count = ('prop_id' , 'count'), bookings = ('booking_bool', 'sum'), clicks=('click_bool', 'sum'))
imp_cnt_per_srch_id['total_impressions'] = imp_cnt_per_srch_id.impression_count.copy()
imp_cnt_per_srch_id

Unnamed: 0_level_0,impression_count,bookings,clicks,total_impressions
srch_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,28,1,1,28
4,32,0,1,32
6,5,1,1,5
8,21,1,1,21
11,33,0,1,33
...,...,...,...,...
332777,32,0,1,32
332781,15,1,1,15
332782,24,1,2,24
332784,28,1,1,28


In [55]:
stats_per_impression_cnt = imp_cnt_per_srch_id.groupby('impression_count').sum()
stats_per_impression_cnt['bookings'] /= stats_per_impression_cnt.total_impressions
stats_per_impression_cnt['clicks'] /= stats_per_impression_cnt.total_impressions
stats_per_impression_cnt
# we can see that the prob of booking and clicks decreases with the number of results per srch_id
# the longer the list, the less likely it will lead to a booking (maybe the search was more general)

Unnamed: 0_level_0,bookings,clicks,total_impressions
impression_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,0.131256,0.210762,22300
6,0.112081,0.175663,26454
7,0.095847,0.154487,28462
8,0.083125,0.134899,32024
9,0.074773,0.119891,35307
10,0.068663,0.107481,37560
11,0.061995,0.099487,40568
12,0.05705,0.092278,43488
13,0.05189,0.08369,47138
14,0.048917,0.077815,47754


In [56]:
# now we calculate the impression count for the entire dataset
df_with_impression_count = df[['srch_id', 'prop_id']].groupby('srch_id').agg(impression_count = ('prop_id' , 'count'))
df_with_impression_count = df_with_impression_count.reset_index().set_index('impression_count').join(stats_per_impression_cnt[['clicks', 'bookings']]).reset_index().set_index('srch_id')
assert set(df_with_impression_count.index.unique()) == set(df.srch_id.unique())
df_with_impression_count

Unnamed: 0_level_0,impression_count,clicks,bookings
srch_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,5,0.210762,0.131256
67,5,0.210762,0.131256
218,5,0.210762,0.131256
221,5,0.210762,0.131256
275,5,0.210762,0.131256
...,...,...,...
394059,38,0.026316,0.026316
483779,38,0.026316,0.026316
524632,38,0.026316,0.026316
561498,38,0.026316,0.026316


In [57]:
# bc probability per prop_id
tmp = df_train.groupby('prop_id').agg(impression_count = ('site_id' , 'count'), bookings = ('booking_bool', 'sum'), clicks=('click_bool', 'sum'))
tmp['booking_prob_per_prop_id'] = tmp.bookings / tmp.impression_count
tmp['click_prob_per_prop_id'] = tmp.clicks / tmp.impression_count
tmp

Unnamed: 0_level_0,impression_count,bookings,clicks,booking_prob_per_prop_id,click_prob_per_prop_id
prop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,61,0,1,0.000000,0.016393
2,11,1,1,0.090909,0.090909
3,82,2,2,0.024390,0.024390
4,22,1,1,0.045455,0.045455
5,30,0,2,0.000000,0.066667
...,...,...,...,...,...
140817,3,0,0,0.000000,0.000000
140818,3,0,0,0.000000,0.000000
140819,2,0,0,0.000000,0.000000
140820,18,0,0,0.000000,0.000000


In [58]:
# standardize this per search_id
#df.drop(columns=['prop_booking_prob', 'prop_click_prob'], inplace=True)
df = df.set_index('prop_id').join(tmp[['booking_prob_per_prop_id', 'click_prob_per_prop_id']], how='left').reset_index()

In [59]:
norm_booking_prob_per_prop_id = make_aggregate_feature('srch_id', col='booking_prob_per_prop_id', transform='diff', use_median=True)
norm_booking_prob_per_prop_id

0.00069 NAs


Unnamed: 0_level_0,Unnamed: 1_level_0,booking_prob_per_prop_id_d_srch_id
srch_id,prop_id,Unnamed: 2_level_1
1,893,-0.001445
1,10404,-0.002348
1,21315,-0.014156
1,27348,-0.004742
1,29604,0.018305
...,...,...
665572,32019,0.000000
665572,33959,0.100000
665572,35240,0.000000
665572,94437,0.000000


In [60]:
norm_click_prob_per_prop_id = make_aggregate_feature('srch_id', col='click_prob_per_prop_id', transform='diff', use_median=True)
norm_click_prob_per_prop_id

0.00069 NAs


Unnamed: 0_level_0,Unnamed: 1_level_0,click_prob_per_prop_id_d_srch_id
srch_id,prop_id,Unnamed: 2_level_1
1,893,-0.000321
1,10404,-0.000736
1,21315,-0.017390
1,27348,-0.002552
1,29604,0.024663
...,...,...
665572,32019,0.000000
665572,33959,0.100000
665572,35240,0.000000
665572,94437,0.166667


# Join all dataframes together

In [61]:
joined = df.set_index(['srch_id', 'prop_id'])
from tqdm import tqdm
for to_join in tqdm([
    norm_price_per_srch,
    tosub_prop_starrating_per_dest,# prop_starrating_w0_d_srch_destination_id
    norm_prop_starrating_per_srch, # prop_starrating_w0_d_srch_id

    tosub_prop_review_score_per_dest, # prop_review_score_d_srch_destination_id
    tosub_prop_review_score_per_dest_prop_starrating, # prop_review_score_d_srch_destination_id__prop_starrating
    norm_prop_review_score_per_srch, # prop_review_score_d_srch_id

    tosub_prop_location_score1_per_dest, # prop_location_score1_d_srch_destination_id
    tosub_prop_location_score1_per_dest_prop_starrating, # prop_location_score1_d_srch_destination_id__prop_starrating
    norm_prop_location_score1_per_srch, # prop_location_score1_d_srch_id

    tosub_prop_location_score2_per_dest_id, # prop_location_score2_d_srch_destination_id
    tosub_prop_location_score2_per_dest_id_prop_starrating, # prop_location_score2_d_srch_destination_id__prop_starrating
    norm_prop_location_score2_per_srch, # prop_location_score2_d_srch_id

    norm_price_per_search_prop_starrating,
    norm_booking_prob_per_prop_id, norm_click_prob_per_prop_id,

    norm_prop_price_hist_advantage_per_srch_id,
    norm_visitor_price_hist_advantage_per_srch_id,
    norm_srch_query_affinity_score,
    ]):
    joined = joined.join(to_join, how='left')



100%|██████████| 18/18 [00:43<00:00,  2.42s/it]


In [62]:
# -1 to keep the sign
# we want to have the differences between the values relative to the search mean, and relative to the destination mean (someone might be looking at the 4 star hotels in an area predominantly with 3 star hotels, this might say something about the booking behaviour that cannot be seened simply by looking at the relative search orderings).

joined.prop_starrating_w0_d_srch_destination_id -= joined.prop_starrating_w0_d_srch_id * -1

joined.prop_review_score_d_srch_destination_id -= joined.prop_review_score_d_srch_id * -1
joined.prop_review_score_d_srch_destination_id__prop_starrating -= joined.prop_review_score_d_srch_id * -1

joined.prop_location_score1_d_srch_destination_id -= joined.prop_location_score1_d_srch_id * -1
joined.prop_location_score1_d_srch_destination_id__prop_starrating -= joined.prop_location_score1_d_srch_id * -1

joined.prop_location_score2_d_srch_destination_id -= joined.prop_location_score2_d_srch_id * -1
joined.prop_location_score2_d_srch_destination_id__prop_starrating -= joined.prop_location_score2_d_srch_id * -1

joined

Unnamed: 0_level_0,Unnamed: 1_level_0,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,booking_bool,is_test,prop_starrating_w0,booking_week,booking_month,booking_dayofyear,booking_dayofweek,midstay_week,midstay_month,midstay_dayofyear,midstay_dayofweek,hist_starrating_diff,price_hist_logdiff,visitor_hist_adr_usd_logdiff,same_country,booking_prob_per_prop_id,click_prob_per_prop_id,price_usd_ld_srch_id,prop_starrating_w0_d_srch_destination_id,prop_starrating_w0_d_srch_id,prop_review_score_d_srch_destination_id,prop_review_score_d_srch_destination_id__prop_starrating,prop_review_score_d_srch_id,prop_location_score1_d_srch_destination_id,prop_location_score1_d_srch_destination_id__prop_starrating,prop_location_score1_d_srch_id,prop_location_score2_d_srch_destination_id,prop_location_score2_d_srch_destination_id__prop_starrating,prop_location_score2_d_srch_id,price_usd_ld_srch_id__prop_starrating,booking_prob_per_prop_id_d_srch_id,click_prob_per_prop_id_d_srch_id,price_hist_logdiff_d_srch_id,visitor_hist_adr_usd_logdiff_d_srch_id,srch_query_affinity_score_d_srch_id
srch_id,prop_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1
427,1,5,219,,,219,2,,1,3.04,,4.28,21.0,72.00,0,6475,1,10,2,0,1,1,,1,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,0.0,0.0,,0.0,0.0,False,2.0,6,2,37,2,7,2,47,5,,-0.003334,,True,0.000000,0.016393,-0.067139,-1.5,-0.5,,,,1.20,1.040,0.560,,,,0.125626,-0.015505,-0.011613,0.218860,,
5762,1,5,219,3.5,147.02,219,2,,1,3.04,,,33.0,99.00,0,6475,1,72,2,0,1,1,,0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,False,2.0,24,6,164,3,34,8,236,5,1.5,,-0.395449,True,0.000000,0.016393,-0.114410,-2.0,-1.0,,,,1.12,0.960,0.480,,,,0.129212,-0.017857,-0.014857,,-0.11441,
8178,1,5,219,,,219,2,,1,3.04,,4.30,34.0,89.00,0,6475,1,0,2,0,1,1,,0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,False,2.0,46,11,322,5,46,11,322,5,,0.188636,,True,0.000000,0.016393,-0.065241,-2.0,-1.0,,,,0.85,0.690,0.210,,,,0.198177,-0.015625,-0.010634,0.283944,,
8465,1,5,219,,,219,2,,1,3.04,,4.28,30.0,81.00,0,6475,2,4,2,1,1,1,,0,,,,0.0,0.0,,,,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0.0,0.0,False,2.0,15,4,98,0,15,4,103,5,,0.114449,,True,0.000000,0.016393,0.000000,-1.5,-0.5,,,,1.28,1.120,0.640,,,,0.044171,-0.015625,-0.010634,0.269329,,
10771,1,5,219,,,219,2,,1,3.04,,4.57,37.0,109.00,0,6475,1,4,1,0,1,0,,0,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0.0,0.0,False,2.0,19,5,128,2,19,5,132,6,,0.121348,,True,0.000000,0.016393,0.116637,-2.0,-1.0,,,,1.28,1.120,0.640,,,,0.450021,-0.016393,-0.014857,0.247003,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167454,140821,16,31,,,31,4,5.0,0,0.69,,4.55,22.0,60.80,0,18417,7,2,1,0,1,0,,1,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,False,4.0,6,2,35,0,6,2,40,5,,-0.442410,,True,0.142857,0.142857,-0.095385,0.0,0.0,1.0,1.00,0.50,-0.82,-1.075,-0.410,,,,-0.312395,0.142857,0.142857,-0.247723,,
232724,140821,15,55,,,31,4,5.0,0,0.69,0.0095,4.38,8.0,76.35,0,13509,1,64,1,0,1,1,,0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,False,4.0,17,4,114,2,26,6,178,3,,-0.044672,,False,0.142857,0.142857,0.040904,0.0,0.0,1.0,1.50,0.50,-2.02,-2.020,-0.920,-0.0246,-0.04265,-0.0123,0.000000,0.142857,0.142857,0.213410,,
284828,140821,16,31,,,31,4,5.0,0,0.69,,4.39,12.0,83.58,0,10432,4,5,2,0,1,0,,0,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,0.0,0.0,False,4.0,9,2,59,3,10,3,66,3,,0.035804,,True,0.142857,0.142857,0.047974,0.0,0.0,1.5,1.75,0.75,-1.33,-1.765,-0.665,,,,0.015737,0.142857,0.087302,0.105863,,
473794,140821,16,31,,,31,4,5.0,0,0.69,0.0659,4.38,,70.26,0,23788,1,2,2,0,1,1,,0,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,True,4.0,17,4,115,3,17,4,117,5,,-0.127797,,True,0.142857,0.142857,-0.245048,0.0,0.0,1.0,1.00,0.50,-0.82,-0.820,-0.410,0.0000,-0.03080,0.0000,-0.273708,0.142857,0.142857,-0.042829,,


# Save the new dataframe

In [63]:
#joined.reset_index().to_csv('data/joined_all_features_revealed.csv.zip')
joined.reset_index().to_csv('data/joined_all_features.csv.zip')