In [144]:
import os
# Work from the project directory so the relative './data' and './test'
# paths used below resolve. The location can be overridden with the
# LEARN_TO_RANK_ROOT environment variable; the original checkout path is
# kept as the default so existing behaviour is unchanged.
os.chdir(os.environ.get('LEARN_TO_RANK_ROOT', '/Users/francopiccolo/GitHub/learn-to-rank'))

In [145]:
import pandas as pd

# Load the four raw tables in one pass; the order of the paths matches the
# order of the names being unpacked.
properties, reviews, queries, interactions = map(
    pd.read_csv,
    (
        './data/properties.csv',
        './data/reviews.csv',
        './data/queries.csv',
        './data/interactions.csv',
    ),
)

In [146]:
# Day-of-month of each query timestamp; used further down to split the data.
queries['day'] = pd.to_datetime(queries['date_time']).dt.day
# Move 'city_id' to 'property_city_id' (appended as the last column) so it
# does not collide with the 'city_id' column carried by the queries table.
properties['property_city_id'] = properties.pop('city_id')

In [147]:
# Encode interaction types as ordinal values (VIEW < CLICK < BOOKING).
map_interaction_value = {'VIEW': 0, 'CLICK': 1, 'BOOKING': 2}
# Series.map is a plain dictionary lookup: unlike replace(), it never leaves an
# unmapped string behind in the numeric column (unknown values become NaN).
interactions['interaction_value'] = interactions['interaction'].map(map_interaction_value)

# Fail loudly if the data contains an interaction type missing from the mapping.
unmapped_interactions = interactions['interaction'].notna() & interactions['interaction_value'].isna()
if unmapped_interactions.any():
    raise ValueError(
        'Unknown interaction types: '
        + repr(sorted(interactions.loc[unmapped_interactions, 'interaction'].astype(str).unique()))
    )

In [148]:
# Average every numeric review column per property.
# numeric_only=True is required on pandas >= 2.0, where mean() raises a
# TypeError on non-numeric columns instead of silently dropping them.
reviews_agg = reviews.groupby('property_id').mean(numeric_only=True).reset_index()

In [149]:
# Property-level frame: properties joined with their averaged review scores.
# merge() defaults to an inner join, so properties without reviews are dropped.
df_properties = properties.merge(reviews_agg, on='property_id')

# One row per (query, property) interaction, enriched with the query attributes.
df = (
    df_properties
    .merge(interactions, on='property_id')
    .merge(queries, on='query_id')
)

In [150]:
# Keep each query's rows contiguous, ordered by position within the query.
df = df.sort_values(['query_id', 'position'], ascending=True)

In [151]:
# Number of rows per query, as a two-column frame: 'query_id', 'size'.
query_sizes = df.groupby('query_id').size().reset_index(name='size')

In [141]:
# List the columns currently present in df.
df.columns

Index(['query_id', 'property_id', 'position', 'interaction', 'dorm_price',
       'private_price', 'interaction_value', 'country_id', 'property_type',
       'distance_to_center', 'property_city_id', 'security_score',
       'location_score', 'staff_score', 'fun_score', 'cleanliness_score',
       'value_for_money_score', 'facilities_score', 'device_type', 'language',
       'checkin_date', 'checkout_date', 'nr_guests', 'city_id', 'date_time',
       'day'],
      dtype='object')

In [152]:
# Model inputs: query-level features followed by property-level features.
query_features = ['city_id', 'device_type', 'language', 'nr_guests'] # query (ignoring checkin/out, time)
property_features = [
    'property_type', 'distance_to_center',              # property
    'dorm_price', 'private_price',                      # interaction
    'security_score', 'location_score', 'staff_score',  # reviews (ignoring date_stayed, age_group, trip_type)
    'fun_score', 'cleanliness_score',
    'value_for_money_score', 'facilities_score'
]
features = query_features + property_features

# Replace each categorical column with integer category codes, keeping the raw
# values in '<feature>_name' and the categorical view in '<feature>_categorical'.
# map_category_code records the raw-value -> code mapping per feature.
categorical_features = ['device_type', 'language']
map_category_code = {}
for feature in categorical_features:
    name_column = feature + '_name'
    # Preserve the raw values only the first time the cell runs. On a re-run
    # df[feature] already holds codes, and copying it again would overwrite the
    # raw values and turn the saved mapping into a useless code -> code table.
    if name_column not in df.columns:
        df[name_column] = df[feature]
    df[feature + '_categorical'] = df[name_column].astype('category')
    df[feature] = df[feature + '_categorical'].cat.codes
    map_category_code[feature] = {
        category: code
        for code, category in enumerate(df[feature + '_categorical'].cat.categories)
    }

# Target column for the ranking model.
label = 'interaction_value'

In [155]:
# Move 'property_city_id' back to 'city_id' on the property-level frame
# (the column ends up last).
df_properties['city_id'] = df_properties.pop('property_city_id')

In [160]:
# Display the property-level frame (properties joined with averaged review scores).
df_properties

Unnamed: 0,property_id,country_id,property_type,distance_to_center,security_score,location_score,staff_score,fun_score,cleanliness_score,value_for_money_score,facilities_score,city_id
0,2696,148,3,0.1,5.000000,5.000000,5.000000,5.000000,5.000000,5.000000,5.000000,2559
1,5489,108,1,2.3,4.642857,4.357143,4.809524,4.285714,4.571429,4.285714,4.404762,5199
2,21081,154,1,0.4,4.541844,4.747518,4.604255,4.069504,4.000000,4.062411,3.973050,290
3,4108,148,1,1.3,4.827160,4.820988,4.876543,4.802469,4.722222,4.808642,4.734568,2559
4,21578,83,1,1.1,4.658854,4.727865,4.765625,4.481771,4.582031,4.449219,4.377604,5492
...,...,...,...,...,...,...,...,...,...,...,...,...
15444,21427,201,4,13.1,5.000000,5.000000,5.000000,5.000000,5.000000,5.000000,5.000000,1054
15445,4990,146,1,2.6,4.140625,3.968750,4.765625,4.750000,4.296875,4.640625,4.312500,736
15446,18836,114,3,2.4,4.600000,4.800000,5.000000,4.600000,4.800000,5.000000,4.800000,4387
15447,15159,45,1,1.5,5.000000,4.750000,4.500000,4.000000,5.000000,4.500000,4.500000,10101


In [159]:
# Save the properties of two selected cities (ids 709 and 7050) under ./test/data;
# presumably a small fixture for the test suite.
in_selected_cities = df_properties['city_id'].isin([709, 7050])
df_properties.loc[in_selected_cities].to_pickle('./test/data/properties.pkl')

In [161]:
import pickle

# Bundle the feature lists and the category -> code mapping built above.
config = dict(
    map_category_code=map_category_code,
    query_features=query_features,
    property_features=property_features,
    categorical_features=categorical_features,
)

# Persist the bundle to ./data/config.pkl.
file_name = './data/config.pkl'
with open(file_name, mode='wb') as file:
    pickle.dump(config, file)

In [108]:
# Split on the query's day-of-month: days < 25 -> train, 25-27 -> validation,
# 28-30 -> test.
# NOTE(review): rows with day == 31 (or a missing day) fall into no split, and
# day-of-month only orders the data chronologically if all queries come from a
# single calendar month. Confirm both are intended.
day_of_month = df['day']
train_mask = day_of_month < 25
val_mask = (25 <= day_of_month) & (day_of_month < 28)
test_mask = (28 <= day_of_month) & (day_of_month < 31)

train_df, val_df, test_df = (df[mask] for mask in (train_mask, val_mask, test_mask))


def _sizes_for_split(split_df):
    """Return the per-query row counts of the queries that occur in ``split_df``."""
    in_split = query_sizes['query_id'].isin(split_df['query_id'])
    return query_sizes.loc[in_split, 'size']


# Group sizes for each split, in query_sizes order.
train_query_sizes = _sizes_for_split(train_df)
val_query_sizes = _sizes_for_split(val_df)
test_query_sizes = _sizes_for_split(test_df)

In [109]:
# Write the feature matrices, labels and per-query group sizes of every split.
# Each entry pairs the object to save with its destination path.
_artifacts = [
    (train_df[features], './data/X_train.pkl'),
    (val_df[features], './data/X_val.pkl'),
    (test_df[features], './data/X_test.pkl'),
    (train_df[label], './data/y_train.pkl'),
    (val_df[label], './data/y_val.pkl'),
    (test_df[label], './data/y_test.pkl'),
    (train_query_sizes, './data/train_query_sizes.pkl'),
    (val_query_sizes, './data/val_query_sizes.pkl'),
    (test_query_sizes, './data/test_query_sizes.pkl'),
]
for _artifact, _path in _artifacts:
    _artifact.to_pickle(_path)

In [110]:
# Backup: alternative export through the LightGBM Dataset API (left commented out for reference)
# import lightgbm as lgb
# train_ds = lgb.Dataset(train_df[features], label=train_df[label], categorical_feature=categorical_features, group=train_query_sizes)
# val_ds = lgb.Dataset(val_df[features], label=val_df[label], categorical_feature=categorical_features, group=val_query_sizes, reference=train_ds)
# train_ds.save_binary('./data/train_ds.bin')
# val_ds.save_binary('./data/val_ds.bin')