In [64]:
# Import libraries
from collections import Counter, OrderedDict
from itertools import chain
from keras.layers import Input, Embedding, Dot, Reshape, Dense
from keras.models import Model
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 500)
pd.options.display.max_colwidth = 1000
import random
import seaborn as sns
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

In [3]:
# Relative paths of the three input CSV files (nothing is read from disk in this cell)
properties_path = "../data/properties_and_cities.csv"
metadata_path = "../data/item_metadata.csv"
train_path = "../data/train.csv"

In [4]:
# Load the training CSV into a DataFrame
trivago_df = pd.read_csv(train_path)

In [5]:
# Load only the item_id, properties and city columns of the properties/cities CSV
properties_df = pd.read_csv(properties_path, usecols = ["item_id", "properties", "city"])

In [21]:
# Load the item metadata CSV; its item_id and properties columns are used in the cells below
metadata_df = pd.read_csv(metadata_path)

In [7]:
# Item ids in metadata row order, as a plain Python list
item_list = metadata_df["item_id"].tolist()

In [42]:
# Two-way lookup between item ids and their positions in item_list
item_index = dict(zip(item_list, range(len(item_list))))
index_item = {position: item_id for item_id, position in item_index.items()}

In [43]:
# Preview the first ten item ids
item_list[:10]

[5101, 5416, 5834, 5910, 6066, 6094, 6288, 6358, 6456, 6561]

In [22]:
# Split the pipe-delimited properties string of every row into a list of property names.
# NOTE(review): the column is overwritten in place, so this cell is presumably not safe to
# re-run without reloading metadata_df first — confirm.
metadata_df["properties"] = metadata_df["properties"].str.split("|")

In [9]:
# Inspect the first rows after splitting the properties column
metadata_df.head()

Unnamed: 0,item_id,properties
0,5101,"[Satellite TV, Golf Course, Airport Shuttle, Cosmetic Mirror, Safe (Hotel), Telephone, Hotel, Sitting Area (Rooms), Reception (24/7), Air Conditioning, Hypoallergenic Rooms, Cable TV, Hotel Bar, Pool Table, Bathtub, Satisfactory Rating, Room Service, Luxury Hotel, Terrace (Hotel), Television, Minigolf, Business Hotel, Shower, Cot, Gym, Hairdryer, Hypoallergenic Bedding, Accessible Parking, From 3 Stars, Good Rating, Radio, 4 Star, From 4 Stars, Family Friendly, Desk, Tennis Court (Indoor), Balcony, WiFi (Public Areas), Openable Windows, Express Check-In / Check-Out, Restaurant, Laundry Service, Ironing Board, Tennis Court, From 2 Stars, Business Centre, Bowling, Conference Rooms, Electric Kettle, Accessible Hotel, Porter, Bike Rental, Non-Smoking Rooms, Car Park, Safe (Rooms), Fitness, Fan, Flatscreen TV, Computer with Internet, WiFi (Rooms), Lift, Central Heating]"
1,5416,"[Satellite TV, Cosmetic Mirror, Safe (Hotel), Telephone, Hotel, Sitting Area (Rooms), Reception (24/7), Wheelchair Accessible, Hypoallergenic Rooms, Hotel Bar, Bathtub, Satisfactory Rating, Luxury Hotel, Terrace (Hotel), Very Good Rating, Television, Business Hotel, Shower, Cot, Hairdryer, From 3 Stars, Good Rating, Radio, 4 Star, From 4 Stars, Family Friendly, Desk, WiFi (Public Areas), Openable Windows, Spa (Wellness Facility), Laundry Service, Free WiFi (Combined), From 2 Stars, Conference Rooms, Sauna, Bike Rental, Free WiFi (Rooms), Non-Smoking Rooms, Car Park, Flatscreen TV, Excellent Rating, Computer with Internet, Pet Friendly, WiFi (Rooms), Free WiFi (Public Areas), Lift]"
2,5834,"[Satellite TV, Cosmetic Mirror, Safe (Hotel), Telephone, Hotel, Reception (24/7), Satisfactory Rating, Hiking Trail, Luxury Hotel, Terrace (Hotel), Very Good Rating, Minigolf, Business Hotel, Shower, Cot, Hairdryer, Beach, From 3 Stars, Good Rating, Family Friendly, Desk, WiFi (Public Areas), Openable Windows, Free WiFi (Combined), Boat Rental, Gay-friendly, From 2 Stars, Bowling, 3 Star, Free WiFi (Rooms), Non-Smoking Rooms, Car Park, Safe (Rooms), Flatscreen TV, Singles, Computer with Internet, WiFi (Rooms), Free WiFi (Public Areas), Lift, Central Heating]"
3,5910,"[Satellite TV, Sailing, Cosmetic Mirror, Telephone, Hotel, Cable TV, Hotel Bar, Bathtub, Satisfactory Rating, Room Service, Luxury Hotel, Terrace (Hotel), Television, Business Hotel, Shower, From 3 Stars, Good Rating, Radio, 4 Star, From 4 Stars, Family Friendly, Tennis Court (Indoor), WiFi (Public Areas), Openable Windows, Restaurant, Laundry Service, Free WiFi (Combined), Tennis Court, From 2 Stars, Solarium, Conference Rooms, Bike Rental, Non-Smoking Rooms, Car Park, Concierge, Safe (Rooms), Computer with Internet, Pet Friendly, Free WiFi (Public Areas), Lift, Central Heating]"
4,6066,"[Satellite TV, Sailing, Diving, Cosmetic Mirror, Safe (Hotel), Telephone, Hotel, Sitting Area (Rooms), Reception (24/7), Wheelchair Accessible, Hypoallergenic Rooms, Cable TV, Massage, Hotel Bar, Pool Table, Bathtub, Satisfactory Rating, Room Service, Luxury Hotel, Terrace (Hotel), Towels, Television, Business Hotel, Shower, Steam Room, Spa Hotel, Swimming Pool (Outdoor), Cot, Gym, Hairdryer, Beach, Hypoallergenic Bedding, Beauty Salon, Accessible Parking, From 3 Stars, Convention Hotel, Good Rating, Radio, 4 Star, From 4 Stars, Table Tennis, Family Friendly, Desk, Tennis Court (Indoor), Balcony, WiFi (Public Areas), Surfing, Hot Stone Massage, Openable Windows, Spa (Wellness Facility), Restaurant, Laundry Service, Ironing Board, Free WiFi (Combined), Tennis Court, Romantic, Boat Rental, From 2 Stars, Business Centre, Solarium, Bowling, Conference Rooms, Sauna, Hammam, Accessible Hotel, Bike Rental, Free WiFi (Rooms), Non-Smoking Rooms, Playground, Car Park, Safe (Rooms), Fitness, ..."


In [23]:
# Normalize every property name of every item to lowercase
metadata_df["properties"] = metadata_df["properties"].apply(lambda tags: [tag.lower() for tag in tags])

In [24]:
# Inspect the first rows after lowercasing the property names
metadata_df.head()

Unnamed: 0,item_id,properties
0,5101,"[satellite tv, golf course, airport shuttle, cosmetic mirror, safe (hotel), telephone, hotel, sitting area (rooms), reception (24/7), air conditioning, hypoallergenic rooms, cable tv, hotel bar, pool table, bathtub, satisfactory rating, room service, luxury hotel, terrace (hotel), television, minigolf, business hotel, shower, cot, gym, hairdryer, hypoallergenic bedding, accessible parking, from 3 stars, good rating, radio, 4 star, from 4 stars, family friendly, desk, tennis court (indoor), balcony, wifi (public areas), openable windows, express check-in / check-out, restaurant, laundry service, ironing board, tennis court, from 2 stars, business centre, bowling, conference rooms, electric kettle, accessible hotel, porter, bike rental, non-smoking rooms, car park, safe (rooms), fitness, fan, flatscreen tv, computer with internet, wifi (rooms), lift, central heating]"
1,5416,"[satellite tv, cosmetic mirror, safe (hotel), telephone, hotel, sitting area (rooms), reception (24/7), wheelchair accessible, hypoallergenic rooms, hotel bar, bathtub, satisfactory rating, luxury hotel, terrace (hotel), very good rating, television, business hotel, shower, cot, hairdryer, from 3 stars, good rating, radio, 4 star, from 4 stars, family friendly, desk, wifi (public areas), openable windows, spa (wellness facility), laundry service, free wifi (combined), from 2 stars, conference rooms, sauna, bike rental, free wifi (rooms), non-smoking rooms, car park, flatscreen tv, excellent rating, computer with internet, pet friendly, wifi (rooms), free wifi (public areas), lift]"
2,5834,"[satellite tv, cosmetic mirror, safe (hotel), telephone, hotel, reception (24/7), satisfactory rating, hiking trail, luxury hotel, terrace (hotel), very good rating, minigolf, business hotel, shower, cot, hairdryer, beach, from 3 stars, good rating, family friendly, desk, wifi (public areas), openable windows, free wifi (combined), boat rental, gay-friendly, from 2 stars, bowling, 3 star, free wifi (rooms), non-smoking rooms, car park, safe (rooms), flatscreen tv, singles, computer with internet, wifi (rooms), free wifi (public areas), lift, central heating]"
3,5910,"[satellite tv, sailing, cosmetic mirror, telephone, hotel, cable tv, hotel bar, bathtub, satisfactory rating, room service, luxury hotel, terrace (hotel), television, business hotel, shower, from 3 stars, good rating, radio, 4 star, from 4 stars, family friendly, tennis court (indoor), wifi (public areas), openable windows, restaurant, laundry service, free wifi (combined), tennis court, from 2 stars, solarium, conference rooms, bike rental, non-smoking rooms, car park, concierge, safe (rooms), computer with internet, pet friendly, free wifi (public areas), lift, central heating]"
4,6066,"[satellite tv, sailing, diving, cosmetic mirror, safe (hotel), telephone, hotel, sitting area (rooms), reception (24/7), wheelchair accessible, hypoallergenic rooms, cable tv, massage, hotel bar, pool table, bathtub, satisfactory rating, room service, luxury hotel, terrace (hotel), towels, television, business hotel, shower, steam room, spa hotel, swimming pool (outdoor), cot, gym, hairdryer, beach, hypoallergenic bedding, beauty salon, accessible parking, from 3 stars, convention hotel, good rating, radio, 4 star, from 4 stars, table tennis, family friendly, desk, tennis court (indoor), balcony, wifi (public areas), surfing, hot stone massage, openable windows, spa (wellness facility), restaurant, laundry service, ironing board, free wifi (combined), tennis court, romantic, boat rental, from 2 stars, business centre, solarium, bowling, conference rooms, sauna, hammam, accessible hotel, bike rental, free wifi (rooms), non-smoking rooms, playground, car park, safe (rooms), fitness, ..."


In [25]:
# One entry per metadata row, taken from the properties column, as a plain Python list
properties_list = metadata_df["properties"].tolist()

In [44]:
# Preview the property lists of the first ten items
properties_list[:10]

[['satellite tv',
  'golf course',
  'airport shuttle',
  'cosmetic mirror',
  'safe (hotel)',
  'telephone',
  'hotel',
  'sitting area (rooms)',
  'reception (24/7)',
  'air conditioning',
  'hypoallergenic rooms',
  'cable tv',
  'hotel bar',
  'pool table',
  'bathtub',
  'satisfactory rating',
  'room service',
  'luxury hotel',
  'terrace (hotel)',
  'television',
  'minigolf',
  'business hotel',
  'shower',
  'cot',
  'gym',
  'hairdryer',
  'hypoallergenic bedding',
  'accessible parking',
  'from 3 stars',
  'good rating',
  'radio',
  '4 star',
  'from 4 stars',
  'family friendly',
  'desk',
  'tennis court (indoor)',
  'balcony',
  'wifi (public areas)',
  'openable windows',
  'express check-in / check-out',
  'restaurant',
  'laundry service',
  'ironing board',
  'tennis court',
  'from 2 stars',
  'business centre',
  'bowling',
  'conference rooms',
  'electric kettle',
  'accessible hotel',
  'porter',
  'bike rental',
  'non-smoking rooms',
  'car park',
  'safe (ro

In [30]:
# Flatten the per-item property lists into one list. Each item's properties are de-duplicated
# first, so a property appears once for every item that has it; the resulting list is NOT
# globally unique.
unique_properties = list(chain.from_iterable(set(tags) for tags in properties_list))

In [32]:
# Count unique properties
def count_items(l):
    """Tally the elements of `l` and return the tallies, most frequent first.

    Parameters
    ----------
    l : iterable of hashable
        Elements to count.

    Returns
    -------
    OrderedDict
        Maps each distinct element to its number of occurrences, ordered by
        descending count; elements with equal counts keep first-seen order.
    """
    # most_common() without an argument returns every (element, count) pair,
    # sorted by count from highest to lowest
    return OrderedDict(Counter(l).most_common())

# Occurrences of each property in unique_properties, ordered from most to least frequent
properties_count = count_items(unique_properties)

In [35]:
# Sanity check: the ten most frequent properties with their counts
list(properties_count.items())[:10]

[('satisfactory rating', 533286),
 ('car park', 487879),
 ('good rating', 481910),
 ('wifi (rooms)', 467027),
 ('shower', 426875),
 ('television', 425953),
 ('wifi (public areas)', 399547),
 ('hotel', 379321),
 ('very good rating', 376666),
 ('air conditioning', 353296)]

In [37]:
# Create indexes for each property.
# unique_properties is a flattened list in which a property appears once per item that has it,
# so it is de-duplicated before enumerating. Enumerating the raw list would map every property
# to the position of its LAST occurrence, and those positions are far larger than
# len(property_index), which hotel_embeddings uses as the size of the property embedding table.
# OrderedDict.fromkeys keeps first-seen order, so the indexes are contiguous: 0 .. n_properties - 1.
property_index = {tag: idx for idx, tag in enumerate(OrderedDict.fromkeys(unique_properties))}
index_properties = {idx: tag for tag, idx in property_index.items()}

In [45]:
# Build (item index, property index) pairs to train the embedding neural network.
# One pair is produced per property listed on each metadata row, in row order.
# Membership is tested against the property_index dict (constant-time lookup) rather than the
# unique_properties list, which would be scanned linearly for every single property.
item_property_pairs = [
    (item_index[item_id], property_index[tag.lower()])
    for item_id, tags in zip(metadata_df["item_id"], metadata_df["properties"])
    for tag in tags
    if tag.lower() in property_index
]

In [46]:
# Sanity check: total number of (item index, property index) pairs
len(item_property_pairs)

18260819

In [58]:
# Sanity check: map the first pair back to its item id and property name
index_item[item_property_pairs[0][0]], index_properties[item_property_pairs[0][1]]

(5101, 'satellite tv')

In [60]:
# Generator for training samples
def generate_batch(pairs, n_positive = 50, negative_ratio = 1.0, n_items = None, n_properties = None):
    """Endlessly yield training batches of labelled (item, property) index pairs.

    Every batch contains `n_positive` pairs sampled without replacement from
    `pairs` (label 1) and is filled up with randomly drawn pairs that do not
    occur in `pairs` (label 0). Rows are shuffled before the batch is yielded.

    Parameters
    ----------
    pairs : list of (int, int)
        Positive (item index, property index) pairs.
    n_positive : int
        Number of positive pairs per batch; must not exceed len(pairs).
    negative_ratio : float
        Negative pairs per positive pair. The batch size is
        int(n_positive * (1 + negative_ratio)).
    n_items : int, optional
        Exclusive upper bound for random item indexes. Defaults to
        len(item_list) from the notebook namespace.
    n_properties : int, optional
        Exclusive upper bound for random property indexes. Defaults to
        len(property_index) from the notebook namespace, the same value the
        model uses as the size of its property embedding table.

    Yields
    ------
    tuple
        ({"item": ndarray, "property": ndarray}, labels), each array of
        length batch size.
    """
    pairs_set = set(pairs)

    # Resolve the index ranges for negative sampling
    if n_items is None:
        n_items = len(item_list)
    if n_properties is None:
        n_properties = len(property_index)

    # negative_ratio may be a float (it is by default); array shapes must be integers
    batch_size = int(n_positive * (1 + negative_ratio))

    # Label for negative examples
    neg_label = 0

    # This creates a generator
    while True:
        # Allocate a fresh array per batch: the yielded columns are views into it, so reusing
        # one buffer would overwrite batches that the consumer is still holding
        batch = np.zeros((batch_size, 3))

        # Randomly choose positive examples
        for idx, (item_id, property_id) in enumerate(random.sample(pairs, n_positive)):
            batch[idx, :] = (item_id, property_id, 1)

        # Negatives start right after the positives (also well-defined when n_positive is 0)
        idx = n_positive

        # Add negative examples until the batch is full
        while idx < batch_size:

            # Random selection
            random_item = random.randrange(n_items)
            random_property = random.randrange(n_properties)

            # Check to make sure this is not a positive example
            if (random_item, random_property) not in pairs_set:

                # Add to batch and increment index
                batch[idx, :] = (random_item, random_property, neg_label)
                idx += 1

        # Make sure to shuffle order
        np.random.shuffle(batch)
        yield {"item": batch[:, 0], "property": batch[:, 1]}, batch[:, 2]

In [63]:
# Properties embedding model
def hotel_embeddings(embedding_size = 50):
    """Build and compile the item/property embedding network.

    The network embeds an item index and a property index into vectors of
    length `embedding_size`, takes their normalized dot product, and passes
    that single number through a one-unit sigmoid layer.

    Parameters
    ----------
    embedding_size : int
        Length of each embedding vector.

    Returns
    -------
    Model
        Model taking the inputs named "item" and "property", compiled with the
        Adam optimizer, binary cross-entropy loss and the accuracy metric.
    """
    # One index per sample for each of the two inputs
    item_input = Input(shape = [1], name = "item")
    property_input = Input(shape = [1], name = "property")

    # One embedding row per known item
    item_vectors = Embedding(input_dim = len(item_index), output_dim = embedding_size,
                             name = "item_embedding")(item_input)

    # One embedding row per known property
    property_vectors = Embedding(input_dim = len(property_index), output_dim = embedding_size,
                                 name = "property_embedding")(property_input)

    # Normalized dot product of the two embeddings along the embedding axis
    similarity = Dot(axes = 2, normalize = True, name = "dot_product")([item_vectors, property_vectors])

    # Collapse to a single number per sample
    similarity = Reshape(target_shape = [1])(similarity)

    # Squash the similarity into a probability-like output
    probability = Dense(1, activation = "sigmoid")(similarity)

    network = Model(inputs = [item_input, property_input], outputs = probability)
    network.compile(loss = "binary_crossentropy", optimizer = "Adam", metrics = ["accuracy"])

    return network

# Instantiate the model with the default embedding size and print its layer summary
model = hotel_embeddings()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
item (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
property (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
item_embedding (Embedding)      (None, 1, 50)        46357100    item[0][0]                       
__________________________________________________________________________________________________
property_embedding (Embedding)  (None, 1, 50)        7850        property[0][0]                   
__________________________________________________________________________________________________
dot_produc

In [None]:
# Train Model

# Number of positive pairs drawn per batch
n_positive = 2000

# Create the batch generator; negative_ratio = 1 requests one negative pair per positive pair
generator = generate_batch(item_property_pairs, n_positive, negative_ratio = 1)

# Train: steps_per_epoch is the number of positive pairs divided by the positives per batch.
# The value returned by fit_generator is kept in item_property.

item_property = model.fit_generator(generator, epochs = 100, 
                                    steps_per_epoch = len(item_property_pairs) // n_positive, verbose = 2)

In [None]:
# Save the full model and, in a separate file, only its weights
model.save("../models/embeddings_first_attempt.h5")
model.save_weights("../models/embeddings_first_attempt_weights.h5")

In [None]:
# Pull the weight matrix out of the layer named "item_embedding"
hotel_layer = model.get_layer("item_embedding")
hotel_weights = hotel_layer.get_weights()[0]

# Divide every row by its own Euclidean norm so that a plain dot product between two rows
# equals their cosine similarity; keepdims keeps the norms as a column for broadcasting
hotel_weights = hotel_weights / np.linalg.norm(hotel_weights, axis = 1, keepdims = True)