In [2]:
import os
# Switch the working directory to the project root so that the relative
# './data/...' paths used by the rest of the notebook resolve.
# The root can be overridden through the LEARN_TO_RANK_ROOT environment variable,
# which lets the notebook run on machines other than the original author's; when
# the variable is unset the original hard-coded checkout path is used.
os.chdir(os.environ.get('LEARN_TO_RANK_ROOT', '/Users/francopiccolo/GitHub/learn-to-rank'))

In [3]:
import pandas as pd

# Read the four raw tables. Each name is bound to the DataFrame parsed from the
# CSV of the same name under ./data (relative to the current working directory).
properties, reviews, queries, interactions = map(pd.read_csv, (
    './data/properties.csv',
    './data/reviews.csv',
    './data/queries.csv',
    './data/interactions.csv',
))

In [5]:
# Distinct values found in the 'interaction' column.
interactions['interaction'].unique()

array(['CLICK', 'VIEW', 'BOOKING'], dtype=object)

In [56]:
# Parse the query timestamp and store its day-of-month component as 'day'.
query_timestamps = pd.to_datetime(queries['date_time'])
queries['day'] = query_timestamps.dt.day

In [9]:
# Numeric grade assigned to each interaction type.
map_interaction_value = {'VIEW': 0, 'CLICK': 1, 'BOOKING': 2}

# Series.map produces an integer column directly. Series.replace on an object
# column depends on implicit downcasting of the result, which recent pandas
# versions deprecate.
interaction_value = interactions['interaction'].map(map_interaction_value)

# map() turns every value that is absent from the dict into NaN; fail loudly
# here instead of carrying an unusable grade forward.
unmapped = interactions.loc[interaction_value.isna(), 'interaction'].unique()
if len(unmapped) > 0:
    raise ValueError('Unmapped interaction types: {}'.format(list(unmapped)))

interactions['interaction_value'] = interaction_value.astype('int64')

In [20]:
# Average the numeric review columns per property, one row per property_id.
# numeric_only=True is needed on pandas >= 2.0, where mean() raises TypeError on
# non-numeric columns instead of silently dropping them.
# NOTE(review): the feature-list comment mentions date_stayed, age_group and
# trip_type as review fields; presumably those are the non-numeric ones - confirm.
reviews_agg = reviews.groupby('property_id').mean(numeric_only=True).reset_index()

In [57]:
# Keep an unambiguously named copy of each table's city_id: both tables have a
# column called city_id, so the merge suffixes the originals as city_id_x / city_id_y.
properties['city_id_property'] = properties['city_id']
queries['city_id_query'] = queries['city_id']

# Default (inner) joins: interactions -> property attributes -> averaged review
# scores -> query context. Rows without a match in any table are dropped.
df = (
    interactions
    .merge(properties, on='property_id')
    .merge(reviews_agg, on='property_id')
    .merge(queries, on='query_id')
)

In [55]:
# Order rows by query, then by position within each query.
sort_keys = ['query_id', 'position']
df = df.sort_values(sort_keys)

In [71]:
# Row count per query_id as a two-column frame: 'query_id' and 'size'.
query_sizes = df.groupby('query_id').size().reset_index(name='size')

In [35]:
# Display the column names of the merged frame.
df.columns

Index(['query_id', 'property_id', 'position', 'interaction', 'dorm_price',
       'private_price', 'interaction_value', 'city_id_x', 'country_id',
       'property_type', 'distance_to_center', 'city_id_property',
       'security_score', 'location_score', 'staff_score', 'fun_score',
       'cleanliness_score', 'value_for_money_score', 'facilities_score',
       'device_type', 'language', 'checkin_date', 'checkout_date', 'nr_guests',
       'city_id_y', 'date_time', 'city_id_query'],
      dtype='object')

In [63]:
# Columns fed to the model, annotated with the table each group comes from.
features = ['dorm_price', 'private_price',                                           # interactions
            'security_score', 'location_score', 'staff_score', 'fun_score',          # reviews (ignoring date_stayed, age_group, trip_type)
            'cleanliness_score', 'value_for_money_score', 'facilities_score',
            'device_type', 'language', 'nr_guests', 'city_id_query',                 # query (ignoring checkin/out, time)
            'country_id', 'city_id_property', 'property_type', 'distance_to_center'] # property

# Subset of `features` that is integer-encoded below.
categorical_features = ['device_type', 'language', 'city_id_query',
                        'country_id', 'city_id_property', 'property_type']

# Name of the target column.
label = 'interaction_value'

# Replace each categorical column with its pandas category codes and keep the raw
# values in '<feature>_name'. The codes are always derived from the preserved
# '<feature>_name' column, so re-running this cell no longer overwrites the raw
# values with already-encoded integers. Missing values receive the code -1.
for feature in categorical_features:
    name_column = feature + '_name'
    if name_column not in df.columns:
        df[name_column] = df[feature]
    df[feature] = df[name_column].astype('category').cat.codes

In [77]:
# Split on the 'day' column: day < 25 -> train, 25-27 -> validation, 28-30 -> test.
# NOTE(review): rows with day == 31 land in none of the three splits - confirm
# the data contains none.
day_of_month = df['day']
train_mask = day_of_month.lt(25)
val_mask = day_of_month.ge(25) & day_of_month.lt(28)
test_mask = day_of_month.ge(28) & day_of_month.lt(31)
train_df, val_df, test_df = df[train_mask], df[val_mask], df[test_mask]


def _sizes_for_split(split_df):
    """Return the 'size' entries of query_sizes whose query_id occurs in split_df."""
    in_split = query_sizes['query_id'].isin(split_df['query_id'])
    return query_sizes['size'][in_split]


# Sizes come out in query_sizes order.
# NOTE(review): this presumably matches the row order of each split only when df
# is sorted by query_id - verify before using these as group sizes.
train_query_sizes = _sizes_for_split(train_df)
val_query_sizes = _sizes_for_split(val_df)
test_query_sizes = _sizes_for_split(test_df)

In [83]:
import lightgbm as lgb
# Training dataset: feature columns, label column, and per-query group sizes.
train_ds = lgb.Dataset(
    train_df[features],
    label=train_df[label],
    group=train_query_sizes,
    categorical_feature=categorical_features,
)

# Validation dataset, built the same way and tied to train_ds via `reference`.
val_ds = lgb.Dataset(
    val_df[features],
    label=val_df[label],
    group=val_query_sizes,
    categorical_feature=categorical_features,
    reference=train_ds,
)


In [85]:
# Write the constructed datasets to LightGBM binary files under ./data.
train_ds.save_binary('./data/train_ds.bin')
val_ds.save_binary('./data/val_ds.bin')

# The recorded output of this cell shows ./data/test_ds.bin being written, yet the
# visible notebook neither builds nor saves a test dataset. Build it here from the
# test split, the same way val_ds is built (tied to train_ds via `reference`),
# and save it so the code matches its output.
test_ds = lgb.Dataset(
    test_df[features],
    label=test_df[label],
    group=test_query_sizes,
    categorical_feature=categorical_features,
    reference=train_ds,
)
test_ds.save_binary('./data/test_ds.bin')


[LightGBM] [Info] Saving data to binary file ./data/train_ds.bin
[LightGBM] [Info] Saving data to binary file ./data/val_ds.bin
[LightGBM] [Info] Saving data to binary file ./data/test_ds.bin


<lightgbm.basic.Dataset at 0x1158f9e48>