In [1]:
import numpy as np
import pandas as pd
import os
import hashlib

In [2]:
def loadTraffickCam(path):
    with open(path, 'r', encoding='utf-8') as file:  # Specify utf-8 encoding
        txt_data = file.readlines()
    
    txt_data = [line.strip().split('\t') for line in txt_data]

    columns = ['number', 'hotel_name', 'latitude', 'longitude', 'places_id']
    txt_data_df = pd.DataFrame(txt_data, columns=columns)

    txt_data_df['longitude'] = pd.to_numeric(txt_data_df['longitude'], errors='coerce')
    txt_data_df['latitude'] = pd.to_numeric(txt_data_df['latitude'], errors='coerce')
    txt_data_df.dropna(subset=['longitude', 'latitude'], inplace=True)

    return txt_data_df

In [3]:
#Load TraffickCam Data
traffickCamData = loadTraffickCam("hotel_info_with_places_id.txt")
traffickCamData.head

<bound method NDFrame.head of         number                                         hotel_name   latitude  \
0          390                                  Super 8 Frederick  39.402160   
1          391  Extended Stay America - Fairbanks - Old Airpor...  64.835380   
2          392                Hilton Hangzhou Qiandao Lake Resort  29.608190   
3          393                                      Taj Lands End  19.043910   
4          394        Helzear Montparnasse Rive Gauche Apartments  48.834660   
...        ...                                                ...        ...   
625133  694001   Hyatt Regency Lake Washington at Seattle’s So...  47.503043   
625134  694002                                 The Beverly Hilton  34.066370   
625135  694003               Homewood Suites by Hilton Ronkonkoma  40.812062   
625136  694004                                  Choctaw Durant OK  33.958543   
625137  694005                                            ???????  24.125850   

         

In [4]:
# Load sharded data
def load_sharded_dataset(root_dir):
    all_data = []
    print("after making list")
    try:
        print("after try")
        for root, dirs, files in os.walk(root_dir):
            #print("in for loop")
            #print(f"Scanning directory: {root}")  # Log current directory being scanned
            
            if not files:
                print(f"No files found in directory: {root}")
                continue  # Skip directories with no files
            
            for file in files:
                file_path = os.path.join(root, file)
                #print(f"Attempting to load file: {file_path}")  # Log which file is being loaded
                
                # Check file extension and load accordingly
                if file.endswith('.csv'):
                    try:
                        data = pd.read_csv(file_path)
                        all_data.append(data)
                        #print(f"Successfully loaded {file_path} with {len(data)} records.")
                    except Exception as e:
                        print(f"Error loading {file_path}: {e}")
                elif file.endswith('.json'):
                    try:
                        data = pd.read_json(file_path)
                        all_data.append(data)
                        #print(f"Successfully loaded {file_path} with {len(data)} records.")
                    except Exception as e:
                        print(f"Error loading {file_path}: {e}")
                else:
                    print(f"Skipping unsupported file format: {file_path}")
    
    except Exception as e:
        print(f"Exception occurred during directory traversal: {e}")
    
    # Ensure we're returning a single combined DataFrame
    if all_data:
        combined_data = pd.concat(all_data, ignore_index=True)
        combined_data.drop_duplicates(inplace=True, keep = 'first')
        combined_data.drop('images', axis=1, inplace=True)
        combined_data['hotel_name'] = combined_data['hotel_name'].str.replace('-', ' ')
        combined_data.drop_duplicates(keep = 'first', inplace=True)
        print(f"Loaded {len(all_data)} files, combined into one DataFrame.")
        return combined_data
    else:
        print("No data loaded.")
        return None
    

In [6]:
# Load booking.com Data
root_dir = 'country/United-States-of-America'
bookingData = load_sharded_dataset(root_dir)

after making list
after try
No files found in directory: country/United-States-of-America
No files found in directory: country/United-States-of-America\Alabama
No files found in directory: country/United-States-of-America\Alaska
No files found in directory: country/United-States-of-America\Arizona
No files found in directory: country/United-States-of-America\Arkansas
No files found in directory: country/United-States-of-America\California
No files found in directory: country/United-States-of-America\Colorado
No files found in directory: country/United-States-of-America\Connecticut
No files found in directory: country/United-States-of-America\Delaware
No files found in directory: country/United-States-of-America\Florida
No files found in directory: country/United-States-of-America\Georgia
No files found in directory: country/United-States-of-America\Idaho
No files found in directory: country/United-States-of-America\Illinois
No files found in directory: country/United-States-of-America\

In [7]:
def split_gps_column(sharded_data):
    if 'gps' in sharded_data.columns:
        # Split the 'gps' column on the delimiter (assuming comma, adjust if necessary)
        sharded_data[['latitude', 'longitude']] = sharded_data['gps'].str.split(',', expand=True)
        
        # Convert latitude and longitude to numeric types
        sharded_data['latitude'] = pd.to_numeric(sharded_data['latitude'], errors='coerce')
        sharded_data['longitude'] = pd.to_numeric(sharded_data['longitude'], errors='coerce')

        # Drop the 'gps' column after splitting
        sharded_data.drop('gps', axis=1, inplace=True)
    
    return sharded_data

In [8]:
split_gps_column(bookingData)

Unnamed: 0,hotel_name,date_accessed,address,rating,url,latitude,longitude
0,"Candlewood Suites Alabaster, an IHG Hotel (Hotel)",2024-08-26,"1004 Balmoral Drive, Alabaster, AL 35007, Unit...",8.1,https://www.booking.com/hotel/us/candlewood-su...,33.232859,-86.799270
67,Fairfield by Marriott Inn & Suites Albertville...,2024-10-15,"328 George Wallace Drive, Albertville, 35951, ...",9.1,https://www.booking.com/hotel/us/fairfield-by-...,34.294313,-86.252364
102,Microtel Inn & Suites by Wyndham Albertville (...,2024-10-27,"220 Alabama Highway 75 North, Albertville, AL ...",8.3,https://www.booking.com/hotel/us/microtel-inn-...,34.276824,-86.203203
135,InTown Suites Extended Stay Birmingham AL Ho...,2024-08-29,"1840 Southpark Drive, Birmingham, AL 35244, Un...",7,https://www.booking.com/hotel/us/intown-suites...,33.349074,-86.794395
169,Econo Lodge Boaz (Hotel),2024-10-27,"761 US Hwy 431, Boaz, 35957, United States of ...",7.5,https://www.booking.com/hotel/us/econo-lodge-b...,34.197369,-86.151096
...,...,...,...,...,...,...,...
255764,"The Hatchet Resort (Resort), Moran",2024-07-01,"19980 East Hwy 287, Moran, WY 83013, United St...",7.8,https://www.booking.com/hotel/us/the-hatchet-r...,43.825107,-110.358680
255834,"Togwotee Mountain Lodge (Lodge), Moran",2024-07-01,27655 Highway 26 & 287 (16.5 miles east of Gra...,7.2,https://www.booking.com/hotel/us/togwotee-moun...,43.818566,-110.199829
255878,Cobblestone Hotel & Suites Torrington (Hotel),2024-09-07,"1306 Main St, Torrington, WY 82240, United Sta...",8.6,https://www.booking.com/hotel/us/cobblestone-a...,42.058363,-104.184830
255971,"Holiday Inn Express Hotel & Suites Torrington,...",2024-09-07,"1700 East Valley Road, Torrington, WY 82240, U...",7.8,https://www.booking.com/hotel/us/holiday-inn-e...,42.053325,-104.162968


In [9]:
latlng = np.vstack([np.array(bookingData["latitude"]), np.array(bookingData["longitude"])])

trafficlatlng = np.vstack([np.array(traffickCamData["latitude"]), np.array(traffickCamData["longitude"])])

bookingHotelNames = np.vstack([np.array(bookingData["hotel_name"])])

trafficHotelNames = np.vstack([np.array(traffickCamData["hotel_name"])])

bookingHotelURL = np.vstack([np.array(bookingData["url"])])

trafficHotelID = np.vstack([np.array(traffickCamData["places_id"])])

latlng.shape,trafficlatlng.shape

bookingHotelNames.shape,trafficHotelNames.shape

print(bookingHotelNames[:10])
print(trafficHotelID[:10])

[['Candlewood Suites Alabaster, an IHG Hotel (Hotel)'
  'Fairfield by Marriott Inn & Suites Albertville (Hotel)'
  'Microtel Inn & Suites by Wyndham Albertville (Hotel)' ...
  'Cobblestone Hotel & Suites   Torrington (Hotel)'
  'Holiday Inn Express Hotel & Suites Torrington, an IHG Hotel (Hotel)'
  "King's Inn (Hotel), Torrington"]]
[['ChIJCWJjR0HayYkRAfWSLQl85IQ' 'ChIJqbFJVhdbMlERYHgGtF-Gv8w'
  'ChIJDZWWXAOLSTQR4JoKtpTbZEE' ... 'ChIJY2Ubpu836IkRqyIH_SEtfo4' None
  'ChIJnbcpvw8oZjQR7Lax5V6ivvY']]


In [10]:
def remove_after_near(s):
    index = s.lower().find("near")
    return s[:index].strip() if index != -1 else s

In [11]:
# List of phrases to omit
phrases_to_omit = ["hotel", "motel", "bed and breakfast", "inn ", "&", "suites", " and", " by", " or", 
                   " the", " an", "ihg", " at", " area", " north", " south", " east", "west "]

# Makes the hotel names into a numpy array
cleaned_hotel_names = np.array([
    np.array([name for name in bookingHotelNames[0]])
])

cleaned_hotel_names_traffic = np.array([
    np.array([name for name in trafficHotelNames[0]])
])

# Takes out all punctuation
unwanted_chars = "()-.,!?"
translation_table = str.maketrans('', '', unwanted_chars)
cleaned_hotel_names = np.char.translate(cleaned_hotel_names, translation_table)
cleaned_hotel_names_traffic = np.char.translate(cleaned_hotel_names_traffic, translation_table)

# Makes all words lower case
cleaned_hotel_names = np.char.lower(cleaned_hotel_names)
cleaned_hotel_names_traffic = np.char.lower(cleaned_hotel_names_traffic)

# Takes out all the phrases ot omit
for phrase in phrases_to_omit:
    cleaned_hotel_names = np.char.replace(cleaned_hotel_names, phrase, "")
    cleaned_hotel_names_traffic = np.char.replace(cleaned_hotel_names_traffic, phrase, "")

cleaned_hotel_names = np.char.replace(cleaned_hotel_names, '  ', ' ')
cleaned_hotel_names_traffic = np.char.replace(cleaned_hotel_names_traffic, '  ', ' ')

# Cleans up all the spaces
while np.any(np.char.find(cleaned_hotel_names, '  ') != -1):
    cleaned_hotel_names = np.char.replace(cleaned_hotel_names, '  ', ' ')
while np.any(np.char.find(cleaned_hotel_names_traffic, '  ') != -1):
    cleaned_hotel_names_traffic = np.char.replace(cleaned_hotel_names_traffic, '  ', ' ')

# Removes all words after the word "near"
remove_after_near_vec = np.vectorize(remove_after_near)
cleaned_hotel_names = remove_after_near_vec(cleaned_hotel_names)
cleaned_hotel_names_traffic = remove_after_near_vec(cleaned_hotel_names_traffic)

# Cleaned hotel names
cleanedBookingHotelNames = np.char.strip(cleaned_hotel_names)  # Remove any leading/trailing spaces
cleanedTrafficHotelNames = np.char.strip(cleaned_hotel_names_traffic)

print(cleanedBookingHotelNames[:10])
print(cleanedTrafficHotelNames[:10])

[['candlewood alabaster' 'fairfield marriott albertville'
  'microtel wyndham albertville' ... 'cobblestone torrington'
  'holiday express torrington' "king's torrington"]]
[['super 8 frederick' 'extended stay america fairbanks old airport way'
  'hilton hangzhou qiandao lake resort' ... 'homewood hilton ronkonkoma'
  'choctaw durant ok' '']]


In [None]:
# Initialize arrays to store the 10 closest indices and distances for each point
from scipy.spatial.distance import cdist
from rapidfuzz import fuzz
from rapidfuzz import utils

latlon_small = latlng.T  # Now (3906, 2)
latlon_large = trafficlatlng.T  # Now (625138, 2)
# Compute the pairwise Euclidean distances

#hotel_name_small = bookingHotelNames.T
hotel_name_small = cleanedBookingHotelNames.T
hotel_name_large = cleanedTrafficHotelNames.T

booking_urls = bookingHotelURL.T
traffic_places_id = trafficHotelID.T

unique_latlon_large = np.unique(latlon_large, axis=0)

num_small = latlon_small.shape[0]

closest_indices = []
closest_distances = []
similarity_scores = []

distance_threshold = 0.0045

# Loop through each point in the smaller array
for i in range(num_small):
    # Compute distances between the ith point in latlon_small and all points in latlon_large
    distances = cdist(latlon_small[i:i+1], latlon_large).flatten()  # distances shape (625138,)

    # Filter indices where the distance is within 500 meters (threshold)
    within_threshold_idx = np.where(distances <= distance_threshold)[0]
    
    # Store the closest indices and corresponding distances
    closest_indices.append(within_threshold_idx)
    closest_distances.append(distances[within_threshold_idx])

    print(f"Points within 500 meters of point {i}: {within_threshold_idx}, Distances: {distances[within_threshold_idx]}")

    ref_hotel_name = hotel_name_small[i].flatten()

    # Compute similarity scores for points within the threshold
    similarity_scores_for_point = []
    for j in range(len(within_threshold_idx)):
        closest_hotel_name = hotel_name_large[within_threshold_idx[j]].flatten()

        # Compute the similarity score using fuzzy string matching
        similarity_score = fuzz.token_set_ratio(str(ref_hotel_name), str(closest_hotel_name), processor=utils.default_process)
        similarity_scores_for_point.append(similarity_score)

        if similarity_score >= 85.1:
            print(f"Similarity score between '{ref_hotel_name}' and '{closest_hotel_name}': {similarity_score}")
    
    similarity_scores.append(similarity_scores_for_point)

    print("-----------")

    #if i > 250:
        #break

Points within 500 meters of point 0: [ 71625 146709 152600], Distances: [0.00075076 0.00013271 0.00099801]
Similarity score between '['candlewood alabaster']' and '['candlewood alabaster']': 100.0
Similarity score between '['candlewood alabaster']' and '['candlewood alabaster alabama']': 100.0
-----------
Points within 500 meters of point 1: [], Distances: []
-----------
Points within 500 meters of point 2: [ 4514 10971 38765 60150], Distances: [0.00143479 0.00134634 0.00345048 0.00334422]
Similarity score between '['microtel wyndham albertville']' and '['microtel wyndham albertville']': 100.0
-----------
Points within 500 meters of point 3: [ 55525 388677], Distances: [0.00182846 0.00196707]
-----------
Points within 500 meters of point 4: [156508], Distances: [0.00109385]
Similarity score between '['econo lodge boaz']' and '['econo lodge']': 100.0
-----------
Points within 500 meters of point 5: [18662], Distances: [0.00038994]
Similarity score between '['key boaz']' and '['key boaz 

In [47]:
# Ensure closest_indices and closest_distances have the same length as num_small
#num_small = 252
'''if len(closest_indices) != num_small or len(closest_distances) != num_small:
    print("Error: Length mismatch between closest_indices/closest_distances and num_small.")
    print(f"Expected length: {num_small}, closest_indices length: {len(closest_indices)}, closest_distances length: {len(closest_distances)}")
    raise ValueError("Mismatch in lengths between closest_indices/closest_distances and num_small.")'''

# Initialize the dictionary to store matches
matches = {}

# Define the similarity score threshold for a match
similarity_threshold = 100


# Loop through each point in the smaller dataset (bookingData)
for i in range(num_small):
    # Safety check in case of indexing issues
    if i >= len(closest_indices) or i >= len(closest_distances):
        print(f"Skipping index {i} due to out-of-range access.")
        continue

    booking_hotel_name = str(hotel_name_small[i][0])  # Get the booking hotel name as a string

    # Check if there are matches within the threshold for this booking hotel
    if len(closest_indices[i]) == 0:
        continue

    # List to hold matches for the current booking hotel
    matched_hotels = []

    # Iterate over each matching traffic hotel within the distance threshold
    for j, idx in enumerate(closest_indices[i]):
        similarity_score = similarity_scores[i][j]
        
        # Only add matches that meet or exceed the similarity threshold
        if similarity_score >= similarity_threshold:
            # Prepare match details
            traffic_hotel_name = str(hotel_name_large[idx][0])  # Traffic hotel name as string
            lat, lon = latlon_large[idx]  # Traffic hotel coordinates
            dist = closest_distances[i][j]  # Distance to booking hotel

            # Append the match details to the list
            matched_hotels.append({
                "traffic_hotel_name": traffic_hotel_name,
                "latitude": lat,
                "longitude": lon,
                "similarity_score": similarity_score,
                "distance": dist
            })

    # Only add the booking hotel to the dictionary if there are matches
    if matched_hotels:
        matches[booking_hotel_name] = matched_hotels

matched_booking_hotels = []
# Print out the first few matches for inspection
for booking_hotel, traffic_matches in list(matches.items())[:100]:
    print(f"Booking Hotel: {booking_hotel}")
    for match in traffic_matches:
        print(f"  Matched Traffic Hotel: {match['traffic_hotel_name']}, "
              f"Latitude: {match['latitude']}, Longitude: {match['longitude']}, "
              f"Score: {match['similarity_score']}, Distance: {match['distance']}")
    matched_booking_hotels.append(booking_hotel)
    print("-----")

print(len(matches))


Booking Hotel: candlewood alabaster
  Matched Traffic Hotel: candlewood alabaster, Latitude: 33.2329, Longitude: -86.80002, Score: 100.0, Distance: 0.0007507592756763593
  Matched Traffic Hotel: candlewood alabaster alabama, Latitude: 33.2329592, Longitude: -86.7993574, Score: 100.0, Distance: 0.00013271018801404001
-----
Booking Hotel: microtel wyndham albertville
  Matched Traffic Hotel: microtel wyndham albertville, Latitude: 34.27784, Longitude: -86.20232, Score: 100.0, Distance: 0.001346336758240169
-----
Booking Hotel: econo lodge boaz
  Matched Traffic Hotel: econo lodge, Latitude: 34.19772, Longitude: -86.15006, Score: 100.0, Distance: 0.001093845053010089
-----
Booking Hotel: key boaz
  Matched Traffic Hotel: key boaz al, Latitude: 34.19852, Longitude: -86.15788, Score: 100.0, Distance: 0.0003899357941533613
-----
Booking Hotel: hampton calera
  Matched Traffic Hotel: hampton calera, Latitude: 33.14882, Longitude: -86.75135, Score: 100.0, Distance: 0.0019757490054410974
  Matc

In [34]:
print(matched_hotels)

[]


In [32]:
import hashlib
hash = hashlib.md5(b'booking.com/url').hexdigest()
print(hash)

9cbd4f9d6788301e6299a794ceba1c95


In [50]:
hashed_booking_hotels = []

hotel_hashes = {hotel : hashlib.md5(hotel.encode('utf-8')).hexdigest() for hotel in matched_booking_hotels}

for hotel, hash_value in hotel_hashes.items():
    print(f"Hotel: {hotel} => Hash: {hash_value}")

Hotel: candlewood alabaster => Hash: aab9e2ec7ce4e73c3627d3162eb29ed0
Hotel: microtel wyndham albertville => Hash: b94c14e52ce35bcb8980c558307784e1
Hotel: econo lodge boaz => Hash: dcafc4ceac28fa01b3ae6c2d047032fd
Hotel: key boaz => Hash: 1a42759e76e64cf4bc98e4ea4140d615
Hotel: hampton calera => Hash: 32323cbe74f8086ca75d7ce9667db767
Hotel: comfort cullman i 65 exit 310 => Hash: 152ffd641d8193ffb1cb7049d8073030
Hotel: hampton cullman => Hash: 4257128f4b6f35aee9d83568cc69ccef
Hotel: la quinta wyndham cullman => Hash: afa9210454540e53f88d4b11ea410d20
Hotel: quality cullman i 65 exit 310 inn => Hash: 27863fa9b0a48c1ce768f87392df1e2a
Hotel: sleep cullman i 65 exit 310 => Hash: 71b50daf64f9d27e711fe6201403073f
Hotel: econo lodge fort payne => Hash: 4967069777848225242fc971a770fb79
Hotel: hampton guntersville => Hash: 0fe914d54be7917558e6d63fd2094aa4
Hotel: wyndham garden lake guntersville => Hash: 0a06a8cb81f22b80004654ba8c8ce44d
Hotel: courtyard birmingham hoover => Hash: 2022af76373461e4a