In [105]:
# Import libraries
from collections import Counter, OrderedDict
import datawig
from itertools import chain
from keras.layers import Input, Embedding, Dot, Reshape, Dense, Concatenate, Multiply
from keras.models import Model
import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 500)
pd.options.display.max_colwidth = 1000
import random

ModuleNotFoundError: No module named 'datawig'

In [82]:
# Path to file
hotels_path = "../data/hotels_com_scrape_v2.csv"

# Dataframe
hotels_df = pd.read_csv(hotels_path, header = None)

# Sanity check
hotels_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,"Masserberg, Germany",Hotel Rennsteig,3.5,"Am Badehaus 1, Masserberg, TH, 98666, Germany",Masserberg,"1.1 miles to City center, 0.4 miles to Skilift-Masserberg",83.0,https://www.hotels.com/ho343089/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=1&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
1,"Masserberg, Germany",Hotel Kammweg,3.0,"Ehringshäuser Str. 4, Neustadt am Rennsteig, TH, 98701, Germany",Neustadt am Rennsteig,"5.0 miles to Masserberg, 4.4 miles to Skilift-Masserberg",63.0,https://www.hotels.com/ho265175/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=2&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
2,"Masserberg, Germany",Landferienhaus Linde,5.0,"Strasse am Berg 10, Schleusegrund, Thüringen, 98667, Germany",Schleusegrund,"4.2 miles to Masserberg, 5.2 miles to Skilift-Masserberg",na,https://www.hotels.com/ho1067943424/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=3&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
3,"Masserberg, Germany",Hotel und Landgasthaus Zur grünen Erle,na,"Erlauer Hauptstraße 69, Sankt Kilian, TH, 98553, Germany",Sankt Kilian,"8.7 miles to Masserberg, 9.6 miles to Skilift-Masserberg",74.0,https://www.hotels.com/ho1157676896/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=4&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
4,"Masserberg, Germany",Berghotel Stutenhaus,3.5,"Stutenhausstraße 1, Suhl, TH, 98711, Germany",Suhl,"8.9 miles to Masserberg, 9.3 miles to Skilift-Masserberg",na,https://www.hotels.com/ho923498720/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=5&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3


In [83]:
# Rename columns
hotels_df.rename({0: "location", 1: "hotel_name", 2: "rating", 
                      3: "address", 4: "locality", 5: "landmark", 
                      6: "price", 7: "URL"}, inplace = True, axis = 1)

In [84]:
# Replace na values with np.nan
hotels_df["price"] = hotels_df["price"].replace("na", np.nan)

In [85]:
# Convert price to int
f = lambda x: float(x) if x != np.nan else np.nan
hotels_df["price"] = hotels_df["price"].map(f)

In [86]:
# Replace na values with np.nan
hotels_df["rating"] = hotels_df["rating"].replace("na", np.nan)

# Convert rating to float
hotels_df["rating"] = hotels_df["rating"].map(f)

In [88]:
# Convert all landmarks to strings
hotels_df["landmark"] = hotels_df["landmark"].apply(lambda x: str(x))

In [89]:
# Convert all landmarks to lowercase
hotels_df["landmark"] = hotels_df["landmark"].apply(lambda x: x.lower())

In [90]:
# Split landmark
hotels_df["landmark"] = hotels_df["landmark"].str.split("\n")

In [91]:
# Split location
location_df = hotels_df["location"].str.split(",", expand = True)

In [92]:
# Rename landmark columns
location_df.rename({0: "city", 1: "country"}, inplace = True, axis = 1)

In [93]:
location_df.head()

Unnamed: 0,city,country
0,Masserberg,Germany
1,Masserberg,Germany
2,Masserberg,Germany
3,Masserberg,Germany
4,Masserberg,Germany


In [94]:
# Merge landmarks_df with hotels_df
hotels_df = pd.merge(location_df, hotels_df[["hotel_name", "rating", "address", 
                                             "locality", "price", "landmark", "URL"]], left_index = True, 
                          right_index = True, how = "right")

In [95]:
# Sanity Check
hotels_df.head(100)

Unnamed: 0,city,country,hotel_name,rating,address,locality,price,landmark,URL
0,Masserberg,Germany,Hotel Rennsteig,3.5,"Am Badehaus 1, Masserberg, TH, 98666, Germany",Masserberg,83.0,"[1.1 miles to city center, 0.4 miles to skilift-masserberg]",https://www.hotels.com/ho343089/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=1&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
1,Masserberg,Germany,Hotel Kammweg,3.0,"Ehringshäuser Str. 4, Neustadt am Rennsteig, TH, 98701, Germany",Neustadt am Rennsteig,63.0,"[5.0 miles to masserberg, 4.4 miles to skilift-masserberg]",https://www.hotels.com/ho265175/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=2&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
2,Masserberg,Germany,Landferienhaus Linde,5.0,"Strasse am Berg 10, Schleusegrund, Thüringen, 98667, Germany",Schleusegrund,,"[4.2 miles to masserberg, 5.2 miles to skilift-masserberg]",https://www.hotels.com/ho1067943424/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=3&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
3,Masserberg,Germany,Hotel und Landgasthaus Zur grünen Erle,,"Erlauer Hauptstraße 69, Sankt Kilian, TH, 98553, Germany",Sankt Kilian,74.0,"[8.7 miles to masserberg, 9.6 miles to skilift-masserberg]",https://www.hotels.com/ho1157676896/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=4&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
4,Masserberg,Germany,Berghotel Stutenhaus,3.5,"Stutenhausstraße 1, Suhl, TH, 98711, Germany",Suhl,,"[8.9 miles to masserberg, 9.3 miles to skilift-masserberg]",https://www.hotels.com/ho923498720/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=5&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
5,Masserberg,Germany,BERG & SPA HOTEL GABELBACH,4.0,"Am Gabelbach 1, Ilmenau, 98693, Germany",Ilmenau,123.0,"[10 miles to masserberg, 10 miles to skilift-masserberg]",https://www.hotels.com/ho82495104/?q-check-out=2020-08-29&FPQ=3&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=6&tab=description&JHR=3&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
6,Masserberg,Germany,Hotel Schöne Aussicht,4.0,"Steinbächlein 45, Steinach, 96523, Germany",Steinach,80.0,"[11 miles to masserberg, 10 miles to skilift-masserberg]",https://www.hotels.com/ho806571904/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=7&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
7,Masserberg,Germany,Gasthaus Goldener Hirsch,4.5,"An der Hasel 91-93, Suhl, 98527, Germany",Suhl,106.0,"[13 miles to masserberg, 13 miles to skilift-masserberg]",https://www.hotels.com/ho80825504/?q-check-out=2020-08-29&FPQ=3&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=8&tab=description&JHR=3&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
8,Masserberg,Germany,Michel Hotel Suhl,4.0,"Platz der Deutschen Einheit 2, Suhl, Thüringen, 98527, Germany",Suhl,,"[13 miles to masserberg, 14 miles to skilift-masserberg]",https://www.hotels.com/ho666446/?q-check-out=2020-08-29&FPQ=4&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=9&tab=description&JHR=5&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
9,Masserberg,Germany,Pension Zum Glasmacher,5.0,"Glasmacherstrasse 8, Gehlberg, Thüringen, 98559, Germany",Gehlberg,61.0,"[13 miles to masserberg, 13 miles to skilift-masserberg]",https://www.hotels.com/ho600906528/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=10&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3


In [96]:
# Prepare city column

# Convert all cities to lowercase
hotels_df["city"] = hotels_df["city"].apply(lambda x: x.lower())

city_list = hotels_df["city"].tolist()

# Find set of unique cities and convert to a list
unique_cities = list(set(city_list))

# Create indexes for each city
city_index = {city: idx for idx, city in enumerate(unique_cities)}
index_city = {idx: city for city, idx in city_index.items()}

In [97]:
# Prepare country column

# Convert all countries to lowercase
hotels_df["country"] = hotels_df["country"].apply(lambda x: x.lower())

country_list = hotels_df["country"].tolist()

# Find set of unique countries and convert to a list
unique_countries = list(set(country_list))

# Create indexes for each property
country_index = {country: idx for idx, country in enumerate(unique_countries)}
index_country = {idx: country for country, idx in country_index.items()}

In [98]:
# Prepare hotel_name column

# Convert all hotels to lowercase
hotels_df["hotel_name"] = hotels_df["hotel_name"].apply(lambda x: x.lower())

# Create hotel names list
hotels_list = hotels_df["hotel_name"].tolist()

# Create indexes for each hotel
hotel_index = {hotel: idx for idx, hotel in enumerate(hotels_list)}
index_hotel = {idx: hotel for hotel, idx in hotel_index.items()}

In [99]:
# Create ratings list
rating_list = hotels_df["rating"].tolist()

# Find set of unique ratings and convert to a list
unique_ratings = list(set(rating_list))

# Create indexes for each rating
rating_index = {rating: idx for idx, rating in enumerate(rating_list)}
index_rating = {idx: rating for rating, idx in price_index.items()}

In [100]:
# Create ratings list
rating_list = hotels_df["rating"].tolist()

# Find set of unique ratings and convert to a list
unique_ratings = list(set(rating_list))

# Create indexes for each rating
rating_index = {rating: idx for idx, rating in enumerate(rating_list)}
index_rating = {idx: rating for rating, idx in rating_index.items()}

In [101]:
# Prepare locality column

# Convert all hotels to lowercase
hotels_df["locality"] = hotels_df["locality"].apply(lambda x: x.lower())

# Create hotel names list
locality_list = hotels_df["locality"].tolist()
unique_localities = list(set(locality_list))

# Create indexes for each hotel
locality_index = {locality: idx for idx, locality in enumerate(unique_localities)}
index_locality = {idx: locality for locality, idx in locality_index.items()}

In [102]:
# Create price list
price_list = hotels_df["price"].tolist()

# Create indexes for each price
price_index = {price: idx for idx, price in enumerate(price_list)}
index_price = {idx: price for price, idx in price_index.items()}

In [104]:
# Prepare locality column

# Create hotel names list
landmark_list = hotels_df["landmark"].tolist()

# Find set of unique properties and convert to a list
unique_landmarks = list(chain(*[list(set(landmarks)) for landmarks in landmark_list]))
unique_landmarks = list(set(unique_landmarks))

# Create indexes for each hotel
landmark_index = {landmark: idx for idx, landmark in enumerate(unique_landmarks)}
index_landmark = {idx: landmark for landmark, idx in landmark_index.items()}

In [109]:
# Build tuples to train embedding neural network
hotel_tuples = []

# Iterate through each row of dataframe
for index, row in hotels_df.iterrows():
    # Iterate through the properties in the item
    hotel_tuples.extend((city_index[hotels_df.at[index, "city"]], country_index[hotels_df.at[index, "country"]],
                         hotel_index[hotels_df.at[index, "hotel_name"]], rating_index[hotels_df.at[index, "rating"]],
                         locality_index[hotels_df.at[index, "locality"]], price_index[hotels_df.at[index, "price"]],
                         landmark_index[landmark]) for landmark in hotels_df.at[index, "landmark"] 
                         if landmark.lower() in unique_landmarks)

KeyError: nan

In [None]:
# Generator for training samples
def generate_batch(tuples, n_positive = 75, negative_ratio = 2.0):
    
    pairs_set = set(tuples)
    
    batch_size = n_positive * (1 + negative_ratio)
    batch = np.zeros((batch_size, 5))
    
    # Label for negative examples
    neg_label = 0
    
    # This creates a generator
    while True:
        # randomly choose positive examples
        for idx, (item_id, property_id, price_id, location_id) in enumerate(random.sample(tuples, n_positive)):
            batch[idx, :] = (item_id, property_id, price_id, location_id, 1)

        # Increment idx by 1
        idx += 1
        
        # Add negative examples until reach batch size
        while idx < batch_size:
            
            # random selection
            random_item = random.randrange(len(item_list))
            random_property = random.randrange(len(unique_properties))
            random_price = random.randrange(len(price_list))
            random_location = random.randrange(len(unique_locations))
            
            # Check to make sure this is not a positive example
            if (random_item, random_property, random_price, random_location) not in pairs_set:
                
                # Add to batch and increment index
                batch[idx, :] = (random_item, random_property, random_price, random_location, neg_label)
                idx += 1
                
        # Make sure to shuffle order
        np.random.shuffle(batch)
        yield {"item": batch[:, 0], "property": batch[:, 1], "price": batch[:, 2], 
               "location": batch[:, 3]}, batch[:, 4]

In [None]:
# Create embeddings

# TSNE

# Calculate missing values for price, and ratings

# Re create embeddings

# Regression and classification?
