In [1]:
# Load hotels

In [2]:
# For loop goign through each hotel, checking if it has a match in traffic dataset,
# then creates a hash of the url, then checks if hash and hotel id are already in downloaded
# file, then calls download image function, then saves images the to downloaded file

In [3]:
# NEED FUNCTIONS FOR: 
# 1. checking if it has a match in traffic dataset
# 2. creating a hash of the url
# 3. checking if the hash and hotel id are already downloaded in file
# 4. downloading the images
# 5. saves the image to the download file

In [54]:
import numpy as np
import pandas as pd
import os
import hashlib
from scipy.spatial.distance import cdist
from rapidfuzz import fuzz, utils
import json
import requests
from urllib.parse import urlparse
import random

In [55]:
# Splits the gps column in the booking.com data
def split_gps_column(sharded_data):
    if 'gps' in sharded_data.columns:
        # Split the 'gps' column on the delimiter (assuming comma, adjust if necessary)
        sharded_data[['latitude', 'longitude']] = sharded_data['gps'].str.split(',', expand=True)
        
        # Convert latitude and longitude to numeric types
        sharded_data['latitude'] = pd.to_numeric(sharded_data['latitude'], errors='coerce')
        sharded_data['longitude'] = pd.to_numeric(sharded_data['longitude'], errors='coerce')

        # Drop the 'gps' column after splitting
        sharded_data.drop('gps', axis=1, inplace=True)
    
    return sharded_data

In [57]:
def remove_after_near(s):
    index = s.lower().find("near")
    return s[:index].strip() if index != -1 else s

In [58]:
# Load the TraffickCam data
def loadTraffickCam(path):
    with open(path, 'r', encoding='utf-8') as file:  # Specify utf-8 encoding
        txt_data = file.readlines()
    
    txt_data = [line.strip().split('\t') for line in txt_data]

    columns = ['number', 'hotel_name', 'latitude', 'longitude', 'places_id']
    txt_data_df = pd.DataFrame(txt_data, columns=columns)

    txt_data_df['longitude'] = pd.to_numeric(txt_data_df['longitude'], errors='coerce')
    txt_data_df['latitude'] = pd.to_numeric(txt_data_df['latitude'], errors='coerce')
    txt_data_df.dropna(subset=['longitude', 'latitude'], inplace=True)

    return txt_data_df

In [None]:
# Load the sharded booking.com data
def load_sharded_dataset(root_dir):
    all_data = []
    print("after making list")
    try:
        print("after try")
        for root, dirs, files in os.walk(root_dir):
            #print("in for loop")
            #print(f"Scanning directory: {root}")  # Log current directory being scanned
            
            if not files:
                print(f"No files found in directory: {root}")
                continue  # Skip directories with no files
            
            for file in files:
                file_path = os.path.join(root, file)
                #print(f"Attempting to load file: {file_path}")  # Log which file is being loaded
                
                # Check file extension and load accordingly
                if file.endswith('.csv'):
                    try:
                        data = pd.read_csv(file_path)
                        all_data.append(data)
                        #print(f"Successfully loaded {file_path} with {len(data)} records.")
                    except Exception as e:
                        print(f"Error loading {file_path}: {e}")
                elif file.endswith('.json'):
                    try:
                        data = pd.read_json(file_path)
                        all_data.append(data)
                        #print(f"Successfully loaded {file_path} with {len(data)} records.")
                    except Exception as e:
                        print(f"Error loading {file_path}: {e}")
                else:
                    print(f"Skipping unsupported file format: {file_path}")
    
    except Exception as e:
        print(f"Exception occurred during directory traversal: {e}")
    
    # Ensure we're returning a single combined DataFrame
    if all_data:
        combined_data = pd.concat(all_data, ignore_index=True)

        if 'images' not in combined_data.columns:
            combined_data['images'] = [[] for _ in range(len(combined_data))]

        combined_data = combined_data.groupby(
            ["hotel_name", "address", "gps", "rating", "url"], dropna=False
        )["images"].agg(lambda x: sum(x, []) if isinstance(x.iloc[0], list) else list(x)).reset_index()

        combined_data["hotel_name"] = combined_data["hotel_name"].str.replace('-', ' ')

        print(f"Loaded {len(all_data)} files, combined into one DataFrame.")

        split_gps_column(combined_data)
        return combined_data
    else:
        print("No data loaded.")
        return None
    
    

In [60]:
import hashlib

# Creates the has of the booking.com url
def hash_url(url):
    b_url = str.encode(str(url)) # turns the url into bytes in order to be hashed
    hash = hashlib.md5(b_url).hexdigest() # Hashes the byte-form url using md5 algorithm
    
    return hash

In [61]:
# Clean hotel names
def clean_hotel_names(booking_df, traffic_df):
    bookinglatlng = np.vstack([booking_df["latitude"].to_numpy(), booking_df["longitude"].to_numpy()]).T
    bookingHotelNames = booking_df["hotel_name"].fillna("").to_numpy().astype(str)
    bookingHotelURL = booking_df["url"].fillna("").to_numpy().astype(str)
    bookingImages = booking_df["images"].to_numpy()
    
    trafficlatlng = np.vstack([traffic_df["latitude"].to_numpy(), traffic_df["longitude"].to_numpy()]).T
    trafficHotelNames = traffic_df["hotel_name"].fillna("").to_numpy().astype(str)
    trafficHotelID = traffic_df["places_id"].fillna("").to_numpy().astype(str)

    # List of phrases to omit
    phrases_to_omit = ["hotel", "motel", "bed and breakfast", "inn ", "&", "suites", " and", " by", " or", 
                    " the", " an", "ihg", " at", " area", " north", " south", " east", "west "]

    # Convert to lowercase
    cleanedBookingHotelNames = np.char.lower(bookingHotelNames)
    cleanedTrafficHotelNames = np.char.lower(trafficHotelNames)

    # Remove punctuation
    unwanted_chars = "()-.,!?"
    translation_table = str.maketrans('', '', unwanted_chars)
    cleanedBookingHotelNames = np.char.translate(cleanedBookingHotelNames, translation_table)
    cleanedTrafficHotelNames = np.char.translate(cleanedTrafficHotelNames, translation_table)

    # Remove phrases
    for phrase in phrases_to_omit:
        cleanedBookingHotelNames = np.char.replace(cleanedBookingHotelNames, phrase, "")
        cleanedTrafficHotelNames = np.char.replace(cleanedTrafficHotelNames, phrase, "")

    # Remove double spaces
    cleanedBookingHotelNames = np.char.replace(cleanedBookingHotelNames, '  ', ' ')
    cleanedTrafficHotelNames = np.char.replace(cleanedTrafficHotelNames, '  ', ' ')

    # Remove words after "near"
    remove_after_near_vec = np.vectorize(remove_after_near)
    cleanedBookingHotelNames = remove_after_near_vec(cleanedBookingHotelNames)
    cleanedTrafficHotelNames = remove_after_near_vec(cleanedTrafficHotelNames)

    # Strip whitespace
    cleanedBookingHotelNames = np.char.strip(cleanedBookingHotelNames)
    cleanedTrafficHotelNames = np.char.strip(cleanedTrafficHotelNames)

    print(type(cleanedBookingHotelNames))

    # Create final DataFrames
    booking_df_final = pd.DataFrame({
        "index": np.arange(len(cleanedBookingHotelNames)),
        "cleanedHotelName": cleanedBookingHotelNames,
        "latitude": bookinglatlng[:, 0],
        "longitude": bookinglatlng[:, 1],
        "url": bookingHotelURL,
        "images": bookingImages
    })

    traffic_df_final = pd.DataFrame({
        "cleanedHotelName": cleanedTrafficHotelNames,
        "latitude": trafficlatlng[:, 0],
        "longitude": trafficlatlng[:, 1],
        "places_id": trafficHotelID
    })

    return booking_df_final, traffic_df_final


In [62]:
def find_distances(booking_df, traffic_df):
    # Initialize arrays to store the 10 closest indices and distances for each point

    latlon_small = np.vstack([np.array(booking_df["latitude"]), np.array(booking_df["longitude"])]).T # Now (3906, 2)
    latlon_large = np.vstack([np.array(traffic_df["latitude"]), np.array(traffic_df["longitude"])]).T  # Now (625138, 2)
    # Compute the pairwise Euclidean distances

    #hotel_name_small = bookingHotelNames.T
    hotel_name_small = np.vstack([np.array(booking_df["cleanedHotelName"])]).T
    hotel_name_large = np.vstack([np.array(traffic_df["cleanedHotelName"])]).T

    booking_urls = np.vstack([np.array(booking_df["url"])]).T
    traffic_places_id = np.vstack([np.array(traffic_df["places_id"])]).T

    unique_latlon_large = np.unique(latlon_large, axis=0)

    num_small = latlon_small.shape[0]

    closest_indices = []
    closest_distances = []
    similarity_scores = []

    distance_threshold = 0.0045

    # Loop through each point in the smaller array
    for i in range(num_small):
        # Compute distances between the ith point in latlon_small and all points in latlon_large
        distances = cdist(latlon_small[i:i+1], latlon_large).flatten()  # distances shape (625138,)

        # Filter indices where the distance is within 500 meters (threshold)
        within_threshold_idx = np.where(distances <= distance_threshold)[0]
        
        # Store the closest indices and corresponding distances
        closest_indices.append(within_threshold_idx)
        closest_distances.append(distances[within_threshold_idx])

        print(f"Points within 500 meters of point {i}: {within_threshold_idx}, Distances: {distances[within_threshold_idx]}")

        ref_hotel_name = hotel_name_small[i].flatten()

        # Compute similarity scores for points within the threshold
        similarity_scores_for_point = []
        for j in range(len(within_threshold_idx)):
            closest_hotel_name = hotel_name_large[within_threshold_idx[j]].flatten()

            # Compute the similarity score using fuzzy string matching
            similarity_score = fuzz.token_set_ratio(str(ref_hotel_name), str(closest_hotel_name), processor=utils.default_process)
            similarity_scores_for_point.append(similarity_score)

            if similarity_score >= 85.1:
                print(f"Similarity score between '{ref_hotel_name}' and '{closest_hotel_name}': {similarity_score}")
        
        similarity_scores.append(similarity_scores_for_point)

        print("-----------")

        if i > 250:
            break

In [63]:
def match_hotels(booking_df, traffic_df):

    # Initialize arrays to store the 10 closest indices and distances for each point
    latlon_small = np.vstack([np.array(booking_df["latitude"]), np.array(booking_df["longitude"])]).T # Now (3906, 2)
    latlon_large = np.vstack([np.array(traffic_df["latitude"]), np.array(traffic_df["longitude"])]).T  # Now (625138, 2)
    # Compute the pairwise Euclidean distances

    #hotel_name_small = bookingHotelNames.T
    hotel_name_small = np.vstack([np.array(booking_df["cleanedHotelName"])]).T
    hotel_name_large = np.vstack([np.array(traffic_df["cleanedHotelName"])]).T

    booking_urls = np.vstack([np.array(booking_df["url"])]).T
    traffic_places_id = np.vstack([np.array(traffic_df["places_id"])]).T

    # Extract image lists and ensure correct format
    booking_images = np.array(booking_df["images"], dtype=object)

    num_small = latlon_small.shape[0]

    closest_indices = []
    closest_distances = []
    similarity_scores = []

    distance_threshold = 0.0045

    # Loop through each point in the smaller array
    for i in range(num_small):
        # Compute distances between the ith point in latlon_small and all points in latlon_large
        distances = cdist(latlon_small[i:i+1], latlon_large).flatten()  # distances shape (625138,)

        # Filter indices where the distance is within 500 meters (threshold)
        within_threshold_idx = np.where(distances <= distance_threshold)[0]
        
        # Store the closest indices and corresponding distances
        closest_indices.append(within_threshold_idx)
        closest_distances.append(distances[within_threshold_idx])

        #print(f"Points within 500 meters of point {i}: {within_threshold_idx}, Distances: {distances[within_threshold_idx]}")

        ref_hotel_name = hotel_name_small[i].flatten()

        # Compute similarity scores for points within the threshold
        similarity_scores_for_point = []
        for j in range(len(within_threshold_idx)):
            closest_hotel_name = hotel_name_large[within_threshold_idx[j]].flatten()
            
            # Compute the similarity score using fuzzy string matching
            similarity_score = fuzz.token_set_ratio(str(ref_hotel_name), str(closest_hotel_name), processor=utils.default_process)
            similarity_scores_for_point.append(similarity_score)

            '''if similarity_score >= 85.1:
                print(f"Similarity score between '{ref_hotel_name}' and '{closest_hotel_name}': {similarity_score}")'''
        
        similarity_scores.append(similarity_scores_for_point)

        #print("-----------")

    # Ensure closest_indices and closest_distances have the same length as num_small
    #num_small = 252
    '''if len(closest_indices) != num_small or len(closest_distances) != num_small:
        print("Error: Length mismatch between closest_indices/closest_distances and num_small.")
        print(f"Expected length: {num_small}, closest_indices length: {len(closest_indices)}, closest_distances length: {len(closest_distances)}")
        raise ValueError("Mismatch in lengths between closest_indices/closest_distances and num_small.")
    '''

    # Initialize the dictionary to store matches
    matches = {}

    # Define the similarity score threshold for a match
    similarity_threshold = 100

    # Loop through each point in the smaller dataset (bookingData)
    for i in range(num_small):
        #print("here2")
        # Safety check in case of indexing issues
        if i >= len(closest_indices) or i >= len(closest_distances):
            print(f"Skipping index {i} due to out-of-range access.")
            continue

        booking_hotel_name = str(hotel_name_small[i][0])  # Get the booking hotel name as a string

        # Check if there are matches within the threshold for this booking hotel
        if len(closest_indices[i]) == 0:
            continue

        # List to hold matches for the current booking hotel
        matched_hotels = []

        # Iterate over each matching traffic hotel within the distance threshold
        for j, idx in enumerate(closest_indices[i]):
            #print("here3")
            similarity_score = similarity_scores[i][j]
          
            # Only add matches that meet or exceed the similarity threshold
            if similarity_score >= similarity_threshold:
                # Prepare match details
                traffic_hotel_name = str(hotel_name_large[idx][0])  # Traffic hotel name as string
                lat, lon = latlon_large[idx]  # Traffic hotel coordinates
                dist = closest_distances[i][j]  # Distance to booking hotel
                b_url = booking_urls[i][0]
                b_index = booking_df["index"][i]
                b_images = booking_images[i]

                # Append the match details to the list
                matched_hotels.append({
                    "index": b_index,
                    "traffic_hotel_name": traffic_hotel_name,
                    "url": b_url,
                    "latitude": lat,
                    "longitude": lon,
                    "similarity_score": similarity_score,
                    "distance": dist,
                    "images": b_images
                })

        # Only add the booking hotel to the dictionary if there are matches
        if matched_hotels:
            matches[booking_hotel_name] = matched_hotels
            
    matched_booking_hotels = []
    # Print out the first few matches for inspection
    for booking_hotel, traffic_matches in list(matches.items())[:100]:
        print(f"Booking Hotel: {booking_hotel}")
        for match in traffic_matches:
            print(f"  Matched Traffic Hotel: {match['traffic_hotel_name']}, "
                f"Latitude: {match['latitude']}, Longitude: {match['longitude']}, "
                f"Score: {match['similarity_score']}, Distance: {match['distance']}")
        matched_booking_hotels.append(booking_hotel)
        print("-----")

    return matches


In [79]:
def hash_url(url):
    hash_value = hashlib.md5(str(url).encode('utf-8')).hexdigest()
    return hash_value

In [80]:
# Checks to see if the url has has already had its images downloaded from it
def predownload_check(hash_to_check):
    # Ensure the file exists
    file_path = "downloaded_hashes.txt"
    if not os.path.exists(file_path):
        with open(file_path, 'w') as f:
            pass  # Create an empty file
    
    # Read the file and check for the hash
    with open(file_path, 'r') as f:
        lines = f.readlines()
    
    check = hash_to_check in (line.strip() for line in lines)
    print(check)
    return check

In [81]:
def get_sharded_path(root_directory, index, levels=3):
    """Generate a sharded directory path based on an index."""
    index_str = f"{index:06d}"  # Zero-padded index (e.g., 000123)
    shards = [index_str[i:i+2] for i in range(0, levels * 2, 2)]  # Create 2-char directory levels
    return os.path.join(root_directory, *shards)

In [86]:
def generate_hashed_directory_name(url):
    """Generate a directory name using a random number and a hashed version of the URL."""
    random_number = random.randint(100000, 999999)  # Generate a random 6-digit number
    url_hash = hashlib.md5(url.encode('utf-8')).hexdigest()  # Hash the URL (first 8 chars for brevity)
    return f"{random_number}_{url_hash}"

In [83]:
def download_single_image(url, save_directory):
    try:
        # Get the file name from the URL
        parsed_url = urlparse(url)
        file_name = os.path.basename(parsed_url.path)
        save_path = os.path.join(save_directory, file_name)

        # Ensure the save directory exists
        os.makedirs(save_directory, exist_ok=True)

        # Download the image
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Raise an error for HTTP errors

        # Save the image
        with open(save_path, 'wb') as image_file:
            for chunk in response.iter_content(1024):
                image_file.write(chunk)
        print(f"Downloaded: {url} -> {save_path}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")

In [None]:
def download_all_images(matches_dict, root_directory, log_file = "downloaded_hashes.txt"):

    with open(log_file, "a") as log:
        for hotel_name, matched_hotels in matches_dict.items():
            for match in matched_hotels:
                hotel_index = match["index"]
                image_urls = match["images"]
                match_url = match["url"]

                if not image_urls:
                    print(f"No images found for hotel: {hotel_name}")
                    continue

                parent_directory = get_sharded_path(root_directory, hotel_index)

                lowest_level_directory = generate_hashed_directory_name(match_url)
                save_directory = os.path.join(parent_directory, lowest_level_directory)

                os.makedirs(save_directory, exist_ok=True)
                hashed_url = hash_url(match_url)

                if predownload_check(hashed_url) == False:
                    for image_url in image_urls:
                        download_single_image(image_url, save_directory)
                    
                    print(f"Finished downloading images for hotel: {hotel_name}")
                    log.write(hashed_url + "\n")
                else:
                    continue
                    


START OF MAIN

In [16]:
#Load TraffickCam Data
traffickCamData = loadTraffickCam("hotel_info_with_places_id.txt")
traffickCamData.head

<bound method NDFrame.head of         number                                         hotel_name   latitude  \
0          390                                  Super 8 Frederick  39.402160   
1          391  Extended Stay America - Fairbanks - Old Airpor...  64.835380   
2          392                Hilton Hangzhou Qiandao Lake Resort  29.608190   
3          393                                      Taj Lands End  19.043910   
4          394        Helzear Montparnasse Rive Gauche Apartments  48.834660   
...        ...                                                ...        ...   
625133  694001   Hyatt Regency Lake Washington at Seattle’s So...  47.503043   
625134  694002                                 The Beverly Hilton  34.066370   
625135  694003               Homewood Suites by Hilton Ronkonkoma  40.812062   
625136  694004                                  Choctaw Durant OK  33.958543   
625137  694005                                            ???????  24.125850   

         

In [33]:
# Load booking.com Data
root_dir = 'C:/Users/lucas/Desktop/VisLab/country/United-States-of-America'
bookingData = load_sharded_dataset(root_dir)
bookingData.info

after making list
after try
No files found in directory: C:/Users/lucas/Desktop/VisLab/country/United-States-of-America
No files found in directory: C:/Users/lucas/Desktop/VisLab/country/United-States-of-America\Alabama
No files found in directory: C:/Users/lucas/Desktop/VisLab/country/United-States-of-America\Alaska
No files found in directory: C:/Users/lucas/Desktop/VisLab/country/United-States-of-America\Arizona
No files found in directory: C:/Users/lucas/Desktop/VisLab/country/United-States-of-America\Arkansas
No files found in directory: C:/Users/lucas/Desktop/VisLab/country/United-States-of-America\California
No files found in directory: C:/Users/lucas/Desktop/VisLab/country/United-States-of-America\Colorado
No files found in directory: C:/Users/lucas/Desktop/VisLab/country/United-States-of-America\Connecticut
No files found in directory: C:/Users/lucas/Desktop/VisLab/country/United-States-of-America\Delaware
No files found in directory: C:/Users/lucas/Desktop/VisLab/country/Unit

<bound method DataFrame.info of                                              hotel_name  \
0               1001 Atelier Studios (Hotel), Sun Valle   
1                       1069 Atelier (Hotel), Sun Valle   
2                             112 Motel (Hotel), Medfor   
3     1886 Crescent Hotel and Spa (Hotel), Eureka Sp...   
4         1905 Basin Park Hotel (Hotel), Eureka Springs   
...                                                 ...   
4938                           motel6 (Hotel), La Cross   
4939                        royal inn (Motel), Lovelock   
4940      stayAPT Suites College Station (Hotel), Bryan   
4941                  stayAPT Suites Montgomery (Hotel)   
4942  stayAPT Suites San Antonio Randolph (Live Oak)...   

                                                address     rating  \
0                      Sun Valley, 83353, United States        7.8   
1                      Sun Valley, 83353, United States          9   
2     2001 Route 112, Medford, NY 11763, United S

In [34]:
bookingData.info

<bound method DataFrame.info of                                              hotel_name  \
0               1001 Atelier Studios (Hotel), Sun Valle   
1                       1069 Atelier (Hotel), Sun Valle   
2                             112 Motel (Hotel), Medfor   
3     1886 Crescent Hotel and Spa (Hotel), Eureka Sp...   
4         1905 Basin Park Hotel (Hotel), Eureka Springs   
...                                                 ...   
4938                           motel6 (Hotel), La Cross   
4939                        royal inn (Motel), Lovelock   
4940      stayAPT Suites College Station (Hotel), Bryan   
4941                  stayAPT Suites Montgomery (Hotel)   
4942  stayAPT Suites San Antonio Randolph (Live Oak)...   

                                                address     rating  \
0                      Sun Valley, 83353, United States        7.8   
1                      Sun Valley, 83353, United States          9   
2     2001 Route 112, Medford, NY 11763, United S

In [44]:
cleanBookingData, cleanTrafficData = clean_hotel_names(bookingData, traffickCamData)

<class 'numpy.ndarray'>


In [45]:
print(cleanBookingData)

      index                    cleanedHotelName   latitude   longitude  \
0         0         1001elier studios sun valle  43.695167 -114.348964   
1         1                 1069elier sun valle  43.695215 -114.347752   
2         2                         112  medfor  40.806998  -73.002407   
3         3    1886 crescent spa eureka springs  36.408375  -93.737454   
4         4     1905 basin park  eureka springs  36.402324  -93.737032   
...     ...                                 ...        ...         ...   
4938   4938                          6 la cross  43.854096  -91.245683   
4939   4939                      royal lovelock  40.185105 -118.471391   
4940   4940       stayapt college station bryan  30.647879  -96.302921   
4941   4941                  stayapt montgomery  32.357944  -86.218679   
4942   4942  stayapt santonio randolph live oak  29.573543  -98.336480   

                                                    url  \
0     https://www.booking.com/hotel/us/1001-atelier-

In [37]:
cleanTrafficData.head

<bound method NDFrame.head of                                        cleanedHotelName   latitude  \
0                                     super 8 frederick  39.402160   
1       extended stay america fairbanks old airport way  64.835380   
2                   hilton hangzhou qiandao lake resort  29.608190   
3                                         taj lands end  19.043910   
4           helzear montparnasse rive gauche apartments  48.834660   
...                                                 ...        ...   
625133      hyatt regency lake washington seattle’sport  47.503043   
625134                               the beverly hilton  34.066370   
625135                       homewood hilton ronkonkoma  40.812062   
625136                                choctaw durant ok  33.958543   
625137                                                   24.125850   

         longitude                    places_id  
0       -77.408420  ChIJCWJjR0HayYkRAfWSLQl85IQ  
1      -147.823300  ChIJqbFJV

In [50]:
matches = match_hotels(cleanBookingData, cleanTrafficData)
print(len(matches))

Booking Hotel: 112  medfor
  Matched Traffic Hotel: 112, Latitude: 40.80717, Longitude: -73.00297, Score: 100.0, Distance: 0.0005886875232237542
-----
Booking Hotel: 1886 crescent spa eureka springs
  Matched Traffic Hotel: 1886 crescent  spa, Latitude: 36.40755, Longitude: -93.73902, Score: 100.0, Distance: 0.0017695296342504104
-----
Booking Hotel: 1905 basin park  eureka springs
  Matched Traffic Hotel: basin park, Latitude: 36.4022, Longitude: -93.73719, Score: 100.0, Distance: 0.0002007286051679119
-----
Booking Hotel: 1912  sumter
  Matched Traffic Hotel: 1912, Latitude: 33.92623, Longitude: -80.3431, Score: 100.0, Distance: 0.0001734156855686754
-----
Booking Hotel: 21c museum cincinnati
  Matched Traffic Hotel: 21c museum cincinnati, Latitude: 39.10313, Longitude: -84.5118, Score: 100.0, Distance: 0.00010397228958961612
-----
Booking Hotel: a cape cod ocean manor hyannis
  Matched Traffic Hotel: cape cod ocean manor, Latitude: 41.63892, Longitude: -70.28002, Score: 100.0, Dista

In [52]:
print(matches["candlewood alabaster"])

[{'index': 554, 'traffic_hotel_name': 'candlewood alabaster', 'url': 'https://www.booking.com/hotel/us/candlewood-suites-alabaster.html?aid=304142&label=gen173nr-1FCAMo7AFCCWFsYWJhc3RlckgzWARonQKIAQGYATG4AQfIAQ_YAQHoAQH4AQKIAgGoAgO4AqOtsrYGwAIB0gIkMjdmOGMwMjMtNzk1Zi00MmRlLTgwYzgtYWE5ZWI1MThiNjE42AIF4AIB&sid=4f648819a52e44b88635f568c13b1d71&dest_id=20000038&dest_type=city&dist=0&group_adults=2&group_children=0&hapos=2&hpos=2&no_rooms=1&req_adults=2&req_children=0&room1=A%2CA&sb_price_type=total&sr_order=popularity&srepoch=1724684074&srpvid=7a6268cf5f820527&type=total&ucfs=1&activeTab=photosGallery#close-lightbox', 'latitude': 33.2329, 'longitude': -86.80002, 'similarity_score': 100.0, 'distance': 0.0007507592756763593, 'images': ['https://cf.bstatic.com/xdata/images/hotel/max1024x768/234835142.jpg?k=57aebee39c82bc465ca7168c7ff403518ce7bd05bbda5801f38a3d3dc9d026d8&o=&hp=1', 'https://cf.bstatic.com/xdata/images/hotel/max1024x768/325325025.jpg?k=622ea9ff3c20e73d073f4c8e41122898c8b8c09c91ac

In [53]:
# Testing to see that the hashing works
hash_value = hashlib.md5("https://www.booking.com/hotel/us/candlewood-suites-alabaster.html?aid=304142&label=gen173nr-1FCAMo7AFCCWFsYWJhc3RlckgzWARonQKIAQGYATG4AQfIAQ_YAQHoAQH4AQKIAgGoAgO4AqOtsrYGwAIB0gIkMjdmOGMwMjMtNzk1Zi00MmRlLTgwYzgtYWE5ZWI1MThiNjE42AIF4AIB&sid=4f648819a52e44b88635f568c13b1d71&dest_id=20000038&dest_type=city&dist=0&group_adults=2&group_children=0&hapos=2&hpos=2&no_rooms=1&req_adults=2&req_children=0&room1=A%2CA&sb_price_type=total&sr_order=popularity&srepoch=1724684074&srpvid=7a6268cf5f820527&type=total&ucfs=1&activeTab=photosGallery#close-lightbox".encode('utf-8')).hexdigest()

print(hash_value)

hash_check_bool = predownload_check(hash_value)

b2ee652b30766d8a59c7f62d8c0e14c5
False


In [89]:
root_directory = "C:/Users/lucas/Desktop/VisLab/booking_images"

download_all_images(matches, root_directory)

True
True
False
Downloaded: https://cf.bstatic.com/xdata/images/hotel/max1024x768/470429656.jpg?k=c3a12d9b20ae9be4fed717b2b25f584c2a2463ca76ee28f53afeda5822687626&o= -> C:/Users/lucas/Desktop/VisLab/booking_images\00\00\04\434412_f92398fd44987d294a222f15bca78fc6\470429656.jpg
Downloaded: https://cf.bstatic.com/xdata/images/hotel/max1024x768/470429656.jpg?k=c3a12d9b20ae9be4fed717b2b25f584c2a2463ca76ee28f53afeda5822687626&o=&hp=1 -> C:/Users/lucas/Desktop/VisLab/booking_images\00\00\04\434412_f92398fd44987d294a222f15bca78fc6\470429656.jpg
Downloaded: https://cf.bstatic.com/xdata/images/hotel/max1024x768/470430284.jpg?k=d74f20619a675f596079d0896cf4f5425ad4b28c84e7d15e9e0dbac4699feefc&o=&hp=1 -> C:/Users/lucas/Desktop/VisLab/booking_images\00\00\04\434412_f92398fd44987d294a222f15bca78fc6\470430284.jpg
Downloaded: https://cf.bstatic.com/xdata/images/hotel/max1024x768/470434044.jpg?k=17ddbad3df261b19492db967756d114daafc5ca51f3950432eb66d903242839b&o=&hp=1 -> C:/Users/lucas/Desktop/VisLab/boo

KeyboardInterrupt: 