In [1]:
import os
# Run from the repository root so the relative './data/...' paths used below resolve.
# The root can be overridden through the LEARN_TO_RANK_ROOT environment variable so the
# notebook also runs on other machines; the original hard-coded checkout path is kept
# as the fallback so existing behavior is unchanged when the variable is not set.
os.chdir(os.environ.get('LEARN_TO_RANK_ROOT', '/Users/francopiccolo/GitHub/learn-to-rank'))

In [2]:
import pandas as pd

# Load the four raw CSV tables (paths are relative to the current working directory).
properties, reviews, queries, interactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/properties.csv',
        './data/reviews.csv',
        './data/queries.csv',
        './data/interactions.csv',
    )
)

In [3]:
# Parse the query timestamp and keep only its day-of-month; the train/val/test
# split later in the notebook thresholds on this column.
# NOTE(review): .dt.day drops month and year, so this is only a valid time ordering
# if all queries fall within a single calendar month -- confirm against the data.
query_timestamps = pd.to_datetime(queries['date_time'])
queries['day'] = query_timestamps.dt.day

In [4]:
# Graded relevance label: a higher value means a stronger interaction.
map_interaction_value = {'VIEW': 0, 'CLICK': 1, 'BOOKING': 2}

# Fail loudly on labels the mapping does not know about. Series.replace would leave
# such strings in place and silently produce a mixed object column for the label.
# Missing values are not treated as unknown: they stay NaN, as they did before.
unmapped_interactions = (
    interactions['interaction'].notna()
    & ~interactions['interaction'].isin(list(map_interaction_value))
)
if unmapped_interactions.any():
    raise ValueError(
        'Unknown interaction types: %s'
        % sorted(interactions.loc[unmapped_interactions, 'interaction'].unique())
    )

# .map yields a numeric column directly instead of relying on replace()'s
# object-to-int downcasting, which is deprecated in recent pandas releases.
interactions['interaction_value'] = interactions['interaction'].map(map_interaction_value)

In [5]:
# Average every numeric review column per property (one row per property_id).
# numeric_only=True is required on pandas >= 2.0, where mean() raises TypeError if the
# frame still holds non-numeric columns; on older pandas those columns were dropped
# silently, so the resulting columns are the same as before.
reviews_agg = reviews.groupby('property_id').mean(numeric_only=True).reset_index()

In [6]:
# Build the modelling table: one row per interaction, enriched with property
# attributes, per-property review averages and the originating query.
# All three merges use the default inner join, so interactions without a matching
# property, review aggregate or query are dropped.
# 'city_id' exists on both the property side and the query side, which is why the
# result carries 'city_id_x' (property) and 'city_id_y' (query).
df = (
    interactions
    .merge(properties, on='property_id')
    .merge(reviews_agg, on='property_id')
    .merge(queries, on='query_id')
)

In [7]:
# Keep the rows of each query contiguous and ordered by their 'position' value;
# the per-query group sizes computed afterwards rely on this row order.
sort_keys = ['query_id', 'position']
df = df.sort_values(by=sort_keys)

In [8]:
# Number of rows per query, as a two-column frame: 'query_id' and 'size'
# (rows come out ordered by query_id, groupby's default).
query_sizes = df.groupby('query_id').size().reset_index(name='size')

In [9]:
# Display the column labels of the merged frame. 'city_id_x' / 'city_id_y' in the
# output are the suffixed duplicates created by merging two tables that both
# contain 'city_id'.
df.columns

Index(['query_id', 'property_id', 'position', 'interaction', 'dorm_price',
       'private_price', 'interaction_value', 'city_id_x', 'country_id',
       'property_type', 'distance_to_center', 'security_score',
       'location_score', 'staff_score', 'fun_score', 'cleanliness_score',
       'value_for_money_score', 'facilities_score', 'device_type', 'language',
       'checkin_date', 'checkout_date', 'nr_guests', 'city_id_y', 'date_time',
       'day'],
      dtype='object')

In [10]:
# Model inputs, grouped by the table they originate from.
features = ['device_type', 'language', 'nr_guests',             # query (ignoring checkin/out, time)
            'property_type', 'distance_to_center',              # property
            'dorm_price', 'private_price',                      # interaction
            'security_score', 'location_score', 'staff_score',  # reviews (ignoring date_stayed, age_group, trip_type)
            'fun_score', 'cleanliness_score',
            'value_for_money_score', 'facilities_score']

# Features that are integer-encoded below.
categorical_features = ['device_type', 'language', 'property_type']

for feature in categorical_features:
    name_column = feature + '_name'
    # Keep the original values in '<feature>_name' exactly once. Without this guard,
    # re-running the cell would overwrite the names with the integer codes, and
    # re-encoding those codes turns the -1 "missing" code into a real category.
    if name_column not in df.columns:
        df[name_column] = df[feature]
    # Always encode from the preserved names so the cell is idempotent.
    # Codes are assigned over the full frame (before the split), so they are
    # consistent across train/val/test; missing values are encoded as -1.
    df[feature] = df[name_column].astype('category').cat.codes

# Target column: the graded interaction value.
label = 'interaction_value'

In [11]:
# Time-based split on the query's day-of-month:
#   train: day < 25, validation: 25 <= day < 28, test: 28 <= day < 31.
# NOTE(review): rows with day == 31 fall into none of the splits -- confirm this is
# intended (or that the data has no such rows).
train_mask = df['day'] < 25
val_mask = (df['day'] >= 25) & (df['day'] < 28)
test_mask = (df['day'] >= 28) & (df['day'] < 31)
train_df = df[train_mask]
val_df = df[val_mask]
test_df = df[test_mask]

# Per-query group sizes for each split, taken from the sizes computed on the full frame.
train_query_sizes = query_sizes['size'][query_sizes['query_id'].isin(train_df['query_id'])]
val_query_sizes = query_sizes['size'][query_sizes['query_id'].isin(val_df['query_id'])]
test_query_sizes = query_sizes['size'][query_sizes['query_id'].isin(test_df['query_id'])]

# Sanity checks: the group sizes are only usable as ranking groups if they describe
# exactly the rows of their split, in the same query order. This breaks silently if a
# query's rows straddle two splits or if the frame was not sorted by query_id first.
for split_name, split_df, split_sizes in (
    ('train', train_df, train_query_sizes),
    ('val', val_df, val_query_sizes),
    ('test', test_df, test_query_sizes),
):
    assert split_sizes.sum() == len(split_df), (
        split_name + ': group sizes do not sum to the number of rows in the split'
    )
    assert split_df['query_id'].is_monotonic_increasing, (
        split_name + ': rows are not ordered by query_id, group sizes would be misaligned'
    )

In [12]:
# Persist feature matrices, labels and per-query group sizes for every split.
# Insertion order of the mapping is the write order.
pickle_targets = {
    './data/X_train.pkl': train_df[features],
    './data/X_val.pkl': val_df[features],
    './data/X_test.pkl': test_df[features],
    './data/y_train.pkl': train_df[label],
    './data/y_val.pkl': val_df[label],
    './data/y_test.pkl': test_df[label],
    './data/train_query_sizes.pkl': train_query_sizes,
    './data/val_query_sizes.pkl': val_query_sizes,
    './data/test_query_sizes.pkl': test_query_sizes,
}
for pickle_path, payload in pickle_targets.items():
    payload.to_pickle(pickle_path)

In [13]:
# Backup (disabled): alternative export using the LightGBM Dataset API instead of pickles.
# import lightgbm as lgb
# train_ds = lgb.Dataset(train_df[features], label=train_df[label], categorical_feature=categorical_features, group=train_query_sizes)
# val_ds = lgb.Dataset(val_df[features], label=val_df[label], categorical_feature=categorical_features, group=val_query_sizes, reference=train_ds)
# train_ds.save_binary('./data/train_ds.bin')
# val_ds.save_binary('./data/val_ds.bin')