In [446]:
# Import libraries
from collections import Counter, OrderedDict
# import datawig
from itertools import chain
from keras.layers import Input, Embedding, Add, Reshape, Dense, Multiply, GlobalMaxPool1D
from keras.models import Model
import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 500)
pd.options.display.max_colwidth = 1000
import random

In [474]:
# Relative path to the scraped hotels CSV
hotels_path = "../data/hotels_com_scrape_v3.csv"

# Read the CSV without a header row, so the columns are labelled 0..N by position
hotels_df = pd.read_csv(hotels_path, header = None)

# Sanity check: preview the first rows
hotels_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,"Mutterstadt, Germany",RheinCity Hotel,4.0,4,"Zollhofstraße 11, Ludwigshafen, RP, 67059, Germany",Ludwigshafen,"5.4 miles to Mutterstadt, 3.3 miles to Mannheim (MHG)",107.0,https://www.hotels.com/ho804141952/?q-check-out=2020-08-29&FPQ=3&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=1&tab=description&JHR=3&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
1,"Mutterstadt, Germany",NYX Hotel Mannheim by Leonardo Hotels,4.0,14,"F4, 4-11, Mannheim, BW, 68159, Germany",Mannheim,"6.2 miles to Mutterstadt, 2.8 miles to Mannheim (MHG)",78.0,https://www.hotels.com/ho408334/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=2&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
2,"Mutterstadt, Germany","Radisson Blu Hotel, Mannheim",4.5,387,"Quadrant Q7, 27, Mannheim, 68161, Germany",Mannheim,"6.4 miles to Mutterstadt, 2.3 miles to Mannheim (MHG)",na,https://www.hotels.com/ho626280/?pa=3&tab=description&q-room-0-adults=2&intlid=SoldOutListing&ZSX=0&SYE=3&q-room-0-children=0
3,"Mutterstadt, Germany",ACHAT Comfort Frankenthal/Pfalz,3.5,64,"Mahlastrasse 18, Frankenthal, RP, 67227, Germany",Frankenthal,"5.9 miles to Mutterstadt, 8.3 miles to Mannheim (MHG)",83.0,https://www.hotels.com/ho180949/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=4&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
4,"Mutterstadt, Germany",Relax Apartment,na,na,"F4 14-15, Mannheim, 68159, Germany",Mannheim,"6.2 miles to Mutterstadt, 2.7 miles to Mannheim (MHG)",na,https://www.hotels.com/ho685840/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=5&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3


In [475]:
# Give the positional columns (0..8) descriptive names
hotels_df = hotels_df.rename(columns = dict(enumerate([
    "location", "hotel_name", "rating", "popularity_rating",
    "address", "locality", "landmark", "price", "URL",
])))

In [476]:
# Count missing (NaN) values per column
hotels_df.isna().sum()

location              0
hotel_name            0
rating                0
popularity_rating     0
address               0
locality              0
landmark             69
price                 0
URL                   0
dtype: int64

In [477]:
# Split the "location" string on commas into separate columns (one column per part)
# NOTE(review): splitting on a bare "," presumably leaves a leading space on the
# second part (e.g. " Germany") — confirm before comparing country values elsewhere.
location_df = hotels_df["location"].str.split(",", expand = True)

In [478]:
# Name the two location parts: first is the city, second is the country
location_df = location_df.rename(columns = {0: "city", 1: "country"})

In [479]:
# Sanity check: preview the split city / country columns
location_df.head()

Unnamed: 0,city,country
0,Mutterstadt,Germany
1,Mutterstadt,Germany
2,Mutterstadt,Germany
3,Mutterstadt,Germany
4,Mutterstadt,Germany


In [480]:
# Attach the city / country columns from location_df to the remaining hotel
# columns, aligning rows on the index and keeping every hotels_df row
hotels_df = pd.merge(
    location_df,
    hotels_df.loc[:, ["hotel_name", "rating", "address", "popularity_rating",
                      "locality", "price", "landmark", "URL"]],
    how = "right",
    left_index = True,
    right_index = True,
)

In [481]:
# Show the rows whose landmark is missing
print(hotels_df.loc[hotels_df["landmark"].isna()])

                    city     country  \
7045              Brezno    Slovakia   
7046              Brezno    Slovakia   
7047              Brezno    Slovakia   
7048              Brezno    Slovakia   
7049              Brezno    Slovakia   
7050              Brezno    Slovakia   
7051              Brezno    Slovakia   
7052              Brezno    Slovakia   
7053              Brezno    Slovakia   
9123         Leitariegos       Spain   
9124         Leitariegos       Spain   
9125         Leitariegos       Spain   
9126         Leitariegos       Spain   
9127         Leitariegos       Spain   
9128         Leitariegos       Spain   
9129         Leitariegos       Spain   
10682             Araras      Brazil   
10683             Araras      Brazil   
10684             Araras      Brazil   
10685             Araras      Brazil   
10686             Araras      Brazil   
10687             Araras      Brazil   
10688             Araras      Brazil   
10689             Araras      Brazil   


In [482]:
# Fill in landmarks by hand for rows where the scrape left them empty.
# Keys are row POSITIONS in hotels_df (translated to index labels below);
# values follow the scraped "<distance> miles to <place>" format.
# Entries are normalised so that equal landmarks map to equal strings:
# no trailing whitespace, and "miles to" is always present.
manual_landmarks = {
    # Slovakia
    7045: "5.6 miles to Ethnographic Museum",
    7046: "4.4 miles to Liptovsky Mara",
    7047: "4.1 miles to Chopok",
    7048: "5.8 miles to Gothal",
    7049: "3.9 miles to Museum of Nature Protection and Speleology",
    7050: "3.1 miles to Museum of Nature Protection and Speleology",
    7051: "4.9 miles to Namestie Osloboditelov",
    7052: "9.0 miles to Wooden church Hronsek UNESCO",
    7053: "6.0 miles to Archaeological Museum Havránok",
    # Spain
    9123: "3.5 miles to Las Rozas Dam",
    9124: "13.2 miles to Poza Julia Museum",
    9125: "3.2 miles to Muniellos Nature Reserve",
    9126: "6.7 miles to Muniellos Nature Reserve",
    9127: "7.4 miles to Dominio de Tares",
    9128: "7.3 miles to Dominio de Tares",
    9129: "9.1 miles to Saliencia Lakes",
    # Brazil
    10682: "3.4 miles to Itapaiva Castle",
    10683: "6.8 miles to Itapaiva Castle",
    10684: "1.0 miles to Imperial Museum",
    10685: "7.2 miles to Itapaiva Castle",
    10686: "1.2 miles to Imperial Museum",
    10687: "4.7 miles to Itapaiva Castle",
    10688: "8.2 miles to Ferreira da Cunha Museum of Arms",
    10689: "2.3 miles to Judith Fountain",
    # Germany
    13272: "27.6 miles to Ore Mountain Museum",
    13273: "5.7 miles to Gottfried Silbermann Museum",
    13274: "2.4 miles to Altenberg Bobsleigh",
    13275: "3.9 miles to Pferdegöpel auf dem Rudolphschacht",
    13276: "6.6 miles to Ore Mountain Museum",
    13277: "4.5 miles to Saidenbach Dam",
    13278: "3.5 miles to Saidenbach Dam",
    13279: "9.1 miles to Wolkenstein Castle",
    13280: "4.2 miles to Wolkenstein Castle",
    13283: "21.2 miles to Ore Mountain Museum",
    # Brazil
    15161: "2.8 miles to Santana do Riacho Waterfall",
    15162: "13.2 miles to Andorinhas Waterfall",
    15163: "4.0 miles to Santana do Riacho Waterfall",
    15164: "13.2 miles to Andorinhas Waterfall",
    15165: "14.8 miles to Peter Lund Museum",
    15166: "2.4 miles to Santana do Riacho Waterfall",
    # Sri Lanka
    15214: "2.8 miles to Kushtarajagala Statue",
    15215: "5.9 miles to Mirissa Beach",
    15216: "2.0 miles to Mirissa Beach",
    15217: "5.9 miles to Mirissa Beach",
    15218: "2.8 miles to Kushtarajagala Statue",
    15219: "4.8 miles to Mirissa Beach",
    # Russia
    18137: "1.2 miles to City Center",
    18138: "6.8 miles to City Center",
    18139: "4.8 miles to City Center",
    # Spain
    18545: "0.2 miles to Les Platgetes",
    18546: "0.5 miles to Arenal-Bol Beach",
    18547: "8.3 miles to Denia Marina",
    18548: "6.9 miles to Denia Marina",
    18549: "2.4 miles to Denia Marina",
    18550: "2.1 miles to Mirador Cronistas de Espana",
    18551: "2.1 miles to Cova de L'Aigua",
    18552: "3.0 miles to Cova de L'Aigua",
    18553: "2.7 miles to Albir Beach",
    18554: "0.5 miles to Mirador Cronistas de Espana",
    18555: "2.0 miles to Albir Beach",
}

# Translate each row position to its index label, then assign by label
for row_position, landmark_text in manual_landmarks.items():
    hotels_df.loc[hotels_df.index[row_position], "landmark"] = landmark_text

In [483]:
# Check countries against addresses for discrepancies
def func(x):
    """Return True when the row's country text does not occur in its address.

    x: any object exposing ``country`` and ``address`` attributes
       (used below with DataFrame rows via ``apply(..., axis=1)``).
    """
    country_found = x.country in x.address
    return not country_found
# Evaluate func on every row (axis=1) and store the boolean result per hotel
hotels_df["discrepancy"] = hotels_df.apply(func, axis=1)

In [484]:
# Drop discrepancies: keep only rows whose country appears in the address.
# .copy() makes the result an independent frame; later cells assign new
# column values to it, which on a bare boolean-mask slice raises
# SettingWithCopyWarning and may not write through reliably.
hotels_df = hotels_df.loc[~hotels_df["discrepancy"]].copy()

In [485]:
# Sanity check: row count, per-column non-null counts and dtypes after filtering
hotels_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15592 entries, 0 to 19819
Data columns (total 11 columns):
city                 15592 non-null object
country              15592 non-null object
hotel_name           15592 non-null object
rating               15592 non-null object
address              15592 non-null object
popularity_rating    15592 non-null object
locality             15592 non-null object
price                15592 non-null object
landmark             15592 non-null object
URL                  15592 non-null object
discrepancy          15592 non-null bool
dtypes: bool(1), object(10)
memory usage: 1.3+ MB


In [486]:
# The scrape marks missing prices with the literal string "na"; turn it into NaN
hotels_df["price"] = hotels_df["price"].replace({"na": np.nan})

In [487]:
# Convert a scraped value to float (not int), keeping missing values as NaN
def f(x):
    """Return ``float(x)``, or NaN when ``x`` is a missing value.

    The missing-value test must be ``pd.isna``: a comparison such as
    ``x != np.nan`` is always True because NaN never compares equal to anything,
    so it cannot detect missing entries (and lets ``None`` reach ``float``).
    """
    return np.nan if pd.isna(x) else float(x)
# Apply f element-wise to the price column
hotels_df["price"] = hotels_df["price"].map(f)

In [488]:
# Rating: swap the "na" placeholder for NaN, then run every value through f
hotels_df["rating"] = hotels_df["rating"].replace("na", np.nan).map(f)

In [489]:
# Popularity rating: swap the "na" placeholder for NaN, then run every value through f
hotels_df["popularity_rating"] = hotels_df["popularity_rating"].replace("na", np.nan).map(f)

In [490]:
# Count missing values per column again, now that "na" placeholders are NaN
hotels_df.isna().sum()

city                    0
country                 0
hotel_name              0
rating               4316
address                 0
popularity_rating    4316
locality                0
price                9152
landmark                0
URL                     0
discrepancy             0
dtype: int64

In [491]:
# Make sure every landmark entry is a str
hotels_df["landmark"] = hotels_df["landmark"].map(str)

In [492]:
# Lowercase every landmark string
hotels_df["landmark"] = hotels_df["landmark"].str.lower()

In [493]:
# Break each landmark string at newlines, giving a list of landmarks per hotel
hotels_df["landmark"] = hotels_df["landmark"].map(lambda text: text.split("\n"))

In [494]:
# Summary statistics of the numeric columns
hotels_df.describe()

Unnamed: 0,rating,popularity_rating,price
count,11276.0,11276.0,6440.0
mean,4.162159,210.298865,131.239752
std,0.592656,237.486363,106.724037
min,1.0,1.0,3.0
25%,4.0,32.0,71.0
50%,4.0,114.0,103.0
75%,4.5,311.0,154.0
max,5.0,998.0,974.0


In [495]:
# Fill missing numeric values with fixed constants.
# NOTE(review): 115 and 100 are not figures from the .describe() table
# (its medians are 103 for price and 114 for popularity_rating) — presumably
# hand-rounded; confirm they are intended.
hotels_df["price"] = hotels_df["price"].fillna(115)
hotels_df["rating"] = hotels_df["rating"].fillna(4.0)
hotels_df["popularity_rating"] = hotels_df["popularity_rating"].fillna(100)

In [496]:
# Count missing values per column after filling the numeric columns
hotels_df.isna().sum()

city                 0
country              0
hotel_name           0
rating               0
address              0
popularity_rating    0
locality             0
price                0
landmark             0
URL                  0
discrepancy          0
dtype: int64

In [497]:
# Prepare city column

# Convert all cities to lowercase
hotels_df["city"] = hotels_df["city"].str.lower()

city_list = hotels_df["city"].tolist()

# Unique cities in sorted order. Iterating a set of strings yields a different
# order in each interpreter session (string hashes are randomised), which would
# give every city a different index after a kernel restart; sorting keeps the
# city <-> index mapping reproducible.
unique_cities = sorted(set(city_list))

# Two-way lookup between a city and its integer index
city_index = {city: idx for idx, city in enumerate(unique_cities)}
index_city = dict(enumerate(unique_cities))

In [498]:
# Prepare country column

# Convert all countries to lowercase
hotels_df["country"] = hotels_df["country"].str.lower()

country_list = hotels_df["country"].tolist()

# Unique countries in sorted order. Iterating a set of strings yields a different
# order in each interpreter session (string hashes are randomised); sorting keeps
# the country <-> index mapping reproducible across kernel restarts.
unique_countries = sorted(set(country_list))

# Two-way lookup between a country and its integer index
country_index = {country: idx for idx, country in enumerate(unique_countries)}
index_country = dict(enumerate(unique_countries))

In [499]:
# Prepare hotel_name column

# Convert all hotel names to lowercase
hotels_df["hotel_name"] = hotels_df["hotel_name"].str.lower()

# Create hotel names list
hotels_list = hotels_df["hotel_name"].tolist()

# Unique hotel names in sorted order. Iterating a set of strings yields a
# different order in each interpreter session (string hashes are randomised);
# sorting keeps the hotel <-> index mapping reproducible across kernel restarts.
unique_hotels = sorted(set(hotels_list))

# Two-way lookup between a hotel name and its integer index
hotel_index = {hotel: idx for idx, hotel in enumerate(unique_hotels)}
index_hotel = dict(enumerate(unique_hotels))

In [500]:
# All rating values, one per hotel row
rating_list = hotels_df["rating"].tolist()

# Distinct ratings, as a list
unique_ratings = [*set(rating_list)]

# Two-way lookup between a rating value and its integer index
rating_index = dict(zip(unique_ratings, range(len(unique_ratings))))
index_rating = dict(enumerate(unique_ratings))

In [501]:
# All popularity rating values, one per hotel row
popularity_list = hotels_df["popularity_rating"].tolist()

# Distinct popularity ratings, as a list
unique_popularity = [*set(popularity_list)]

# Two-way lookup between a popularity rating and its integer index
popularity_index = dict(zip(unique_popularity, range(len(unique_popularity))))
index_popularity = dict(enumerate(unique_popularity))

In [502]:
# Prepare locality column

# Convert all localities to lowercase
hotels_df["locality"] = hotels_df["locality"].str.lower()

# Create localities list
locality_list = hotels_df["locality"].tolist()

# Unique localities in sorted order. Iterating a set of strings yields a
# different order in each interpreter session (string hashes are randomised);
# sorting keeps the locality <-> index mapping reproducible across kernel restarts.
unique_localities = sorted(set(locality_list))

# Two-way lookup between a locality and its integer index
locality_index = {locality: idx for idx, locality in enumerate(unique_localities)}
index_locality = dict(enumerate(unique_localities))

In [503]:
# All price values, one per hotel row
price_list = hotels_df["price"].tolist()

# Distinct prices, as a list
unique_prices = [*set(price_list)]

# Two-way lookup between a price value and its integer index
price_index = dict(zip(unique_prices, range(len(unique_prices))))
index_price = dict(enumerate(unique_prices))

In [504]:
# Prepare landmark column

# One list of landmark strings per hotel row
landmark_list = hotels_df["landmark"].tolist()

# Flatten the per-hotel lists and keep each landmark once, in sorted order.
# Iterating a set of strings yields a different order in each interpreter
# session (string hashes are randomised); sorting keeps the landmark <-> index
# mapping reproducible across kernel restarts.
unique_landmarks = sorted(set(chain.from_iterable(landmark_list)))

# Two-way lookup between a landmark and its integer index
landmark_index = {landmark: idx for idx, landmark in enumerate(unique_landmarks)}
index_landmark = dict(enumerate(unique_landmarks))

In [505]:
# Build tuples to train embedding neural network.
# Each hotel row contributes one 8-tuple of vocabulary indices per landmark:
# (city, country, hotel, rating, popularity, locality, price, landmark).
feature_columns = ["city", "country", "hotel_name", "rating",
                   "popularity_rating", "locality", "price", "landmark"]

hotel_tuples = [
    (city_index[city], country_index[country], hotel_index[hotel],
     rating_index[rating], popularity_index[popularity],
     locality_index[locality], price_index[price], landmark_index[landmark])
    for city, country, hotel, rating, popularity, locality, price, landmarks
    in hotels_df[feature_columns].itertuples(index = False, name = None)
    for landmark in landmarks
    # Membership is tested against the dict (hash lookup) rather than the
    # unique_landmarks list (linear scan per landmark), and on the same key
    # that is used for the lookup above.
    if landmark in landmark_index
]


In [506]:
# Generator for training samples
def generate_batch(tuples, n_positive = 75, negative_ratio = 2.0):
    """Yield (features, labels) training batches forever.

    Each batch holds ``n_positive`` rows sampled without replacement from
    ``tuples`` (label 1), plus randomly generated index combinations that do
    not occur in ``tuples`` (label 0), shuffled together.

    Parameters
    ----------
    tuples : list of 8-tuples
        (city, country, hotel, rating, popularity, locality, price, landmark)
        indices of observed combinations. Must contain at least ``n_positive``
        items, otherwise ``random.sample`` raises ValueError.
    n_positive : int
        Number of positive rows per batch.
    negative_ratio : int or float
        Negatives per positive; the batch size is
        ``int(n_positive * (1 + negative_ratio))``.

    Yields
    ------
    (dict, numpy.ndarray)
        A dict mapping each input name to a 1-D array with one entry per batch
        row, and the matching 1-D label array.

    Negative indices are drawn from the module-level ``unique_*`` vocabularies.
    """
    pairs_set = set(tuples)

    # np.zeros needs an integer shape; the default negative_ratio is a float
    batch_size = int(n_positive * (1 + negative_ratio))

    # Label for negative examples
    neg_label = 0

    # This creates a generator
    while True:
        # A fresh array per batch: arrays already yielded (and possibly still
        # queued by the consumer) must not be overwritten by the next batch.
        batch = np.zeros((batch_size, 9))

        # Randomly choose positive examples; last column is the label
        for idx, positive in enumerate(random.sample(tuples, n_positive)):
            batch[idx, :8] = positive
            batch[idx, 8] = 1

        # Negatives start right after the positives (also valid for n_positive == 0)
        idx = n_positive

        # Add negative examples until reach batch size
        while idx < batch_size:

            # Random selection, one index per vocabulary
            candidate = (random.randrange(len(unique_cities)),
                         random.randrange(len(unique_countries)),
                         random.randrange(len(unique_hotels)),
                         random.randrange(len(unique_ratings)),
                         random.randrange(len(unique_popularity)),
                         random.randrange(len(unique_localities)),
                         random.randrange(len(unique_prices)),
                         random.randrange(len(unique_landmarks)))

            # Keep it only if it is not an observed (positive) combination
            if candidate not in pairs_set:
                batch[idx, :8] = candidate
                batch[idx, 8] = neg_label
                idx += 1

        # Make sure to shuffle order
        np.random.shuffle(batch)
        yield {"city": batch[:, 0], "country": batch[:, 1], "hotel": batch[:, 2],
               "rating": batch[:, 3], "popularity": batch[:, 4], "locality": batch[:, 5],
               "price": batch[:, 6], "landmark": batch[:, 7]}, batch[:, 8]

In [507]:
# Properties embedding model
def hotel_embeddings(embedding_size = 100):
    """Build and compile the hotel feature-interaction embedding model.

    One single-value input and one Embedding (``embedding_size`` wide, sized by
    the matching module-level ``*_index`` vocabulary) are created per feature.
    Each listed feature pair is multiplied element-wise and max-pooled; the
    pooled interactions are summed and passed to a one-unit sigmoid Dense layer.
    The model is compiled with Adadelta and binary cross-entropy.

    Returns the compiled Keras ``Model``.
    """
    # (input name, vocabulary) in the order the model expects its inputs
    vocabularies = [
        ("city", city_index),
        ("country", country_index),
        ("hotel", hotel_index),
        ("rating", rating_index),
        ("popularity", popularity_index),
        ("locality", locality_index),
        ("price", price_index),
        ("landmark", landmark_index),
    ]

    # Inputs are one-dimensional
    inputs = {feature: Input(name = feature, shape = [1]) for feature, _ in vocabularies}

    # One embedding per feature, named "<feature>_embedding"
    embedded = {
        feature: Embedding(name = feature + "_embedding", input_dim = len(vocabulary),
                           output_dim = embedding_size)(inputs[feature])
        for feature, vocabulary in vocabularies
    }

    # (layer-name suffix, left feature, right feature) for every interaction
    interactions = [
        ("one", "city", "country"),
        ("two", "city", "hotel"),
        ("three", "city", "rating"),
        ("four", "city", "locality"),
        ("five", "city", "price"),
        ("six", "city", "landmark"),
        ("seven", "country", "hotel"),
        ("eight", "country", "rating"),
        ("nine", "country", "locality"),
        ("ten", "country", "price"),
        ("eleven", "country", "landmark"),
        ("twelve", "hotel", "rating"),
        ("thirteen", "hotel", "locality"),
        ("fourteen", "hotel", "price"),
        ("fifteen", "hotel", "landmark"),
        ("sixteen", "rating", "locality"),
        ("seventeen", "rating", "price"),
        ("eighteen", "rating", "landmark"),
        ("nineteen", "locality", "price"),
        ("twenty", "locality", "landmark"),
        ("twentyone", "price", "landmark"),
        ("twentytwo", "popularity", "city"),
        ("twentythree", "popularity", "country"),
        ("twentyfour", "popularity", "hotel"),
        ("twentyfive", "popularity", "rating"),
        ("twentysix", "popularity", "locality"),
        ("twentyseven", "popularity", "price"),
        ("twentyeight", "popularity", "landmark"),
    ]

    # Merge the embeddings with multiplication
    products = [
        Multiply(name = "interaction_" + suffix)([embedded[left], embedded[right]])
        for suffix, left, right in interactions
    ]

    # GlobalMaxPool over each interaction
    pooled = [
        GlobalMaxPool1D(name = "pooling_" + suffix)(product)
        for (suffix, _, _), product in zip(interactions, products)
    ]

    # Interaction gate: sum of all pooled interactions
    sum_interaction = Add(name = "interaction_gate")(pooled)

    # Fully connected layer
    merged = Dense(1, activation = "sigmoid")(sum_interaction)
    model = Model(inputs = [inputs[feature] for feature, _ in vocabularies], outputs = merged)
    model.compile(optimizer = "Adadelta", loss = "binary_crossentropy", metrics = ["accuracy"])

    return model

# Instantiate the model with the default embedding size and print its layer summary
model = hotel_embeddings()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
city (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
country (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
hotel (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
rating (InputLayer)             (None, 1)            0                                            
__________________________________________________________________________________________________
locality (

In [508]:
# Train model

# Positive examples per batch
n_positive = 2000

# Batch generator with one negative example per positive
generator = generate_batch(hotel_tuples, n_positive = n_positive, negative_ratio = 1)

# Fit for 100 epochs; one epoch is as many batches as n_positive fits into hotel_tuples
item_property = model.fit_generator(
    generator,
    steps_per_epoch = len(hotel_tuples) // n_positive,
    epochs = 100,
    verbose = 2,
)

Epoch 1/100
 - 7s - loss: 0.6922 - acc: 0.5885
Epoch 2/100
 - 1s - loss: 0.6917 - acc: 0.5838
Epoch 3/100
 - 2s - loss: 0.6909 - acc: 0.5883
Epoch 4/100
 - 1s - loss: 0.6893 - acc: 0.6105
Epoch 5/100
 - 1s - loss: 0.6881 - acc: 0.6060
Epoch 6/100
 - 1s - loss: 0.6883 - acc: 0.5734
Epoch 7/100
 - 1s - loss: 0.6857 - acc: 0.5942
Epoch 8/100
 - 1s - loss: 0.6846 - acc: 0.5958
Epoch 9/100
 - 1s - loss: 0.6851 - acc: 0.5457
Epoch 10/100
 - 1s - loss: 0.6792 - acc: 0.6023
Epoch 11/100
 - 1s - loss: 0.6729 - acc: 0.6303
Epoch 12/100
 - 1s - loss: 0.6628 - acc: 0.6573
Epoch 13/100
 - 1s - loss: 0.6516 - acc: 0.6421
Epoch 14/100
 - 1s - loss: 0.6386 - acc: 0.6013
Epoch 15/100
 - 1s - loss: 0.6161 - acc: 0.6077
Epoch 16/100
 - 1s - loss: 0.5999 - acc: 0.6266
Epoch 17/100
 - 1s - loss: 0.5910 - acc: 0.6512
Epoch 18/100
 - 2s - loss: 0.5820 - acc: 0.5824
Epoch 19/100
 - 1s - loss: 0.5420 - acc: 0.6348
Epoch 20/100
 - 1s - loss: 0.5167 - acc: 0.6627
Epoch 21/100
 - 1s - loss: 0.5480 - acc: 0.6737
E

In [None]:
# Save the full model, and separately its weights only, under ../models/
model.save("../models/embeddings_fourth_attempt.h5")
model.save_weights("../models/embeddings_fourth_attempt_weights.h5")