In [1]:
import pandas as pd
import ast
from geopy.geocoders import Nominatim
from geopy.distance import geodesic

In [2]:
def geocode_location(location_str):
    geolocator = Nominatim(user_agent="attraction_finder")
    location = geolocator.geocode(location_str)
    return (location.latitude, location.longitude) if location else None


In [3]:
def find_nearby_restaurants(user_location_str, df, radius_km=10):
    user_coords = geocode_location(user_location_str)
    if not user_coords:
        return pd.DataFrame()
    
    def compute_distance(row):
        if pd.notnull(row['latitude']) and pd.notnull(row['longitude']):
            return geodesic(user_coords, (row['latitude'], row['longitude'])).km
        return float('inf')

    df['distance_km'] = df.apply(compute_distance, axis=1)
    nearby_df = df[df['distance_km'] <= radius_km].sort_values(by='distance_km')
    return nearby_df[['Name', 'Address', 'Duration', 'Rating', 'Ranking', 'distance_km']]


In [4]:
def generate_fallbacks(address):
    """Tr·∫£ v·ªÅ danh s√°ch c√°c ƒë·ªãa ch·ªâ fallback theo m·ª©c ƒë·ªô ƒë∆°n gi·∫£n d·∫ßn"""
    parts = [p.strip() for p in address.split(',') if p.strip()]
    fallbacks = []

    if len(parts) >= 4:
        fallbacks.append(', '.join(parts[-4:]))
    if len(parts) >= 3:
        fallbacks.append(', '.join(parts[-3:]))
    if len(parts) >= 2:
        fallbacks.append(', '.join(parts[-2:]))
    if len(parts) >= 1:
        fallbacks.append(parts[-1])

    return fallbacks


# H√†m geocode c√≥ fallback t·ª± ƒë·ªông cho SEA
def add_lat_lon_with_fallback(df, sample_size=None):
    geolocator = Nominatim(user_agent="attraction_finder_sea", timeout=10)
    latitudes = []
    longitudes = []

    df_sample = df.head(sample_size).copy()

    for i, row in df_sample.iterrows():
        raw_address = row['Address']
        try:
            print(f"[{i+1}/{len(df_sample)}] Geocoding: {raw_address}")
            location = geolocator.geocode(raw_address)

            if location:
                latitudes.append(location.latitude)
                longitudes.append(location.longitude)
                print(f"   ‚úÖ Found: ({location.latitude}, {location.longitude})")
            else:
                # fallback nhi·ªÅu c·∫•p ƒë·ªô
                fallbacks = generate_fallbacks(raw_address)
                found = False

                for fb in fallbacks:
                    print(f"   ‚Ü™Ô∏è Trying fallback: {fb}")
                    fb_loc = geolocator.geocode(fb)
                    if fb_loc:
                        latitudes.append(fb_loc.latitude)
                        longitudes.append(fb_loc.longitude)
                        print(f"   ‚úÖ Fallback Success: ({fb_loc.latitude}, {fb_loc.longitude})")
                        found = True
                        break

                if not found:
                    latitudes.append(None)
                    longitudes.append(None)
                    print(f"   ‚ùå All Fallbacks Failed for: {raw_address}")
        except Exception as e:
            latitudes.append(None)
            longitudes.append(None)
            print(f"   ‚ùå Error for '{raw_address}': {e}")

    df_sample['latitude'] = latitudes
    df_sample['longitude'] = longitudes
    return df_sample



In [5]:
import os

def process_in_chunks_resumable(df, chunk_size=5000, start_index=0, output_folder="geo_batches_attractions"):
    os.makedirs(output_folder, exist_ok=True)

    total = len(df)
    num_chunks = (total - start_index) // chunk_size + 1

    for i in range(num_chunks):
        chunk_start = start_index + i * chunk_size
        chunk_end = min(chunk_start + chunk_size, total)
        chunk_filename = f"{output_folder}/geo_results_batch_{i+1}.csv"

        # B·ªè qua n·∫øu file ƒë√£ t·ªìn t·∫°i
        if os.path.exists(chunk_filename):
            print(f"‚úÖ ƒê√£ c√≥: {chunk_filename}, b·ªè qua...")
            continue

        df_chunk = df.iloc[chunk_start:chunk_end]
        print(f"\nüì¶ ƒêang x·ª≠ l√Ω d√≤ng {chunk_start} ƒë·∫øn {chunk_end}")

        df_processed = add_lat_lon_with_fallback(df_chunk, sample_size=None)
        df_processed.to_csv(chunk_filename, index=False)
        print(f"üíæ ƒê√£ l∆∞u: {chunk_filename}")


In [6]:
df_all = pd.read_csv("Attraction_Data\Attraction_Tripadvisor_Data_SEA_cleanedforweb.csv")


In [7]:
import re

def clean_thai_address(address):
    """
    X·ª≠ l√Ω 'Moo' trong ƒë·ªãa ch·ªâ Th√°i Lan ƒë·ªÉ c·∫£i thi·ªán kh·∫£ nƒÉng geocode.
    """
    if pd.isnull(address):
        return address
    # X√≥a c·ª•m 'Moo xx' (c√≥ th·ªÉ vi·∫øt 'Moo.10', 'Moo 10',...)
    address = re.sub(r'\bMoo\.?\s*\d+\b', '', address, flags=re.IGNORECASE)
    # X√≥a d·∫•u ',' ho·∫∑c kho·∫£ng tr·∫Øng d∆∞ th·ª´a
    address = re.sub(r'\s*,\s*', ', ', address)
    address = re.sub(r'\s{2,}', ' ', address)
    return address.strip(", ")

df_all['Address'] = df_all['Address'].apply(clean_thai_address)

In [8]:
# Ch·∫°y theo batch 5.000 d√≤ng ‚Üí b·ªè qua nh·ªØng batch ƒë√£ x·ª≠ l√Ω
process_in_chunks_resumable(df_all, chunk_size=5000)


‚úÖ ƒê√£ c√≥: geo_batches_attractions/geo_results_batch_1.csv, b·ªè qua...
‚úÖ ƒê√£ c√≥: geo_batches_attractions/geo_results_batch_2.csv, b·ªè qua...
‚úÖ ƒê√£ c√≥: geo_batches_attractions/geo_results_batch_3.csv, b·ªè qua...
‚úÖ ƒê√£ c√≥: geo_batches_attractions/geo_results_batch_4.csv, b·ªè qua...
‚úÖ ƒê√£ c√≥: geo_batches_attractions/geo_results_batch_5.csv, b·ªè qua...
‚úÖ ƒê√£ c√≥: geo_batches_attractions/geo_results_batch_6.csv, b·ªè qua...
‚úÖ ƒê√£ c√≥: geo_batches_attractions/geo_results_batch_7.csv, b·ªè qua...
‚úÖ ƒê√£ c√≥: geo_batches_attractions/geo_results_batch_8.csv, b·ªè qua...
‚úÖ ƒê√£ c√≥: geo_batches_attractions/geo_results_batch_9.csv, b·ªè qua...
‚úÖ ƒê√£ c√≥: geo_batches_attractions/geo_results_batch_10.csv, b·ªè qua...

üì¶ ƒêang x·ª≠ l√Ω d√≤ng 50000 ƒë·∫øn 55000
[50001/5000] Geocoding: Bali, Indonesia
   ‚úÖ Found: (-8.2271303, 115.1919203)
[50002/5000] Geocoding: Bali, Indonesia
   ‚úÖ Found: (-8.2271303, 115.1919203)
[50003/5000] Geocoding: Bali, Indonesia
   

In [10]:
import glob

all_batches = sorted(glob.glob("geo_batches_attractions/geo_results_batch_*.csv"))
df_full = pd.concat((pd.read_csv(f) for f in all_batches), ignore_index=True)

df_full.to_csv("D:\\20242\\Web_GProject\\data\\attractions.csv", index=False)


In [12]:
from geopy.distance import geodesic

# Geocode ƒë·ªãa ƒëi·ªÉm ng∆∞·ªùi d√πng nh·∫≠p
def get_coordinates_from_address(address):
    geolocator = Nominatim(user_agent="user_location_finder", timeout=10)
    location = geolocator.geocode(address)
    if location:
        return (location.latitude, location.longitude)
    else:
        print(f"‚ùå Kh√¥ng t√¨m th·∫•y t·ªça ƒë·ªô cho ƒë·ªãa ch·ªâ: {address}")
        return None

# T√¨m kh√°ch s·∫°n g·∫ßn ƒë·ªãa ƒëi·ªÉm ƒë√≥ nh·∫•t
def find_nearby_hotels(user_input_address, df, radius_km=10):
    user_coords = get_coordinates_from_address(user_input_address)
    if not user_coords:
        return pd.DataFrame()
    
    # Ch·ªâ l·∫•y nh·ªØng kh√°ch s·∫°n ƒë√£ c√≥ t·ªça ƒë·ªô
    df_valid = df.dropna(subset=['latitude', 'longitude']).copy()

    # T√≠nh kho·∫£ng c√°ch t·ª´ng kh√°ch s·∫°n ƒë·∫øn v·ªã tr√≠ ng∆∞·ªùi d√πng
    df_valid['distance_km'] = df_valid.apply(
        lambda row: geodesic(user_coords, (row['latitude'], row['longitude'])).km,
        axis=1
    )

    # L·ªçc theo b√°n k√≠nh v√† s·∫Øp x·∫øp
    nearby = df_valid[df_valid['distance_km'] <= radius_km]
    return nearby.sort_values('distance_km')[['Name', 'Address', 'Price', 'Overall Rating Value', 'distance_km']]


In [13]:
# V√≠ d·ª• ƒë·ªãa ch·ªâ ng∆∞·ªùi d√πng nh·∫≠p:
user_location = "Chonburi"

# T√¨m kh√°ch s·∫°n g·∫ßn trong b√°n k√≠nh 10 km
nearby_hotels = find_nearby_hotels(user_location, df_full, radius_km=10)

# Hi·ªÉn th·ªã k·∫øt qu·∫£
nearby_hotels.head(10)


Unnamed: 0,Name,Address,Price,Overall Rating Value,distance_km
43822,Hotel Nikko Amata City Chonburi,"700/333 Moo 1 Klongtamru Muang, Chonburi, Thai...",80.0,4.9,0.0
44660,Baan Lanna Resort,"114/188 Samet, Chonburi, Thailand",26.0,-1.0,0.0
44661,Homer Resort Hotel,"58/53 Moo 3 Ban Bueng Sub-district, Chonburi, ...",11.0,-1.0,0.0
44662,Tassana Place Boutique Hotel Bansuan,"62/333 Kutichi M.2, Chonburi, Thailand",24.0,-1.0,0.0
44663,Bangsean Royal Beach by Preyaluk,"148 Moo 13, Bangsaen Road Line 1, Saensuk Subd...",Unknown price,-1.0,0.0
44665,Narasiri,145/14 Moo.4 Prasoet Rat Phatthana 5 Bannongch...,Unknown price,-1.0,0.0
44666,Don Khun Wang Mansion 2,"59/99 Moo 2 T.Don Hua Roh, Chonburi, Thailand",14.0,-1.0,0.0
44667,Premiercondo Chonburi,Phraya Satcha Rd. 250 Soi Ban Suan-Phraya Satc...,48.0,-1.0,0.0
44668,Nanglen Kan Him Risxrth,"128/29 M.3 Th. Phraya Sac Ca, Chonburi, Thailand",27.0,-1.0,0.0
44670,TPR51 Room Service,"51 Bang Saen Sai 2 Road Soi 12, Chonburi, Thai...",26.0,-1.0,0.0
