In [1]:
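#
#  Fetches hotel data from Travel Weekly (travelweekly.com) to use as extra features when training the model.
#
# >>> travelweekly.get_hotel(hotel) ->
# {'travelweekly_year_built': __, 'travelweekly_num_floors': __, 'travelweekly_num_rooms': __, 'travelweekly_chain': __, 'travelweekly_events_num_rooms': __, 'travelweekly_events_total_sqft': __, 'travelweekly_min_rate': __, 'travelweekly_max_rate': __}
#
# Each key is present only when that value was found on the hotel's page;
# an empty dict is returned when no hotel matches or the lookup fails.
#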

from bs4 import BeautifulSoup
import requests
import json
import re

def get_hotel(hotel_name, timeout=10):
    """Look up a hotel on travelweekly.com and return its scraped details.

    The hotel name is sent to the site's autocomplete endpoint; the first
    result of type ``'HOT'`` that carries a URL is downloaded and handed to
    ``extract_hotel_info``.

    Args:
        hotel_name: Search term. Anything from the first ``', an'`` onwards
            is dropped before searching (presumably to strip brand suffixes
            such as ", an XYZ Hotel" -- confirm against the input data).
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        dict: The fields produced by ``extract_hotel_info`` for the matched
        page, or ``{}`` when nothing matched or any step failed.
    """
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
    try:
        url = 'https://www.travelweekly.com/AutoComplete.asmx/GetCompleteList'
        headers = {
            'content-type': 'application/json; charset=UTF-8',
            'origin': 'https://www.travelweekly.com',
            'referer': 'https://www.travelweekly.com/Hotels/Destinations,United-States',
            'user-agent': user_agent
        }
        data = json.dumps({"Request": {"Term": hotel_name.split(', an')[0], "Context": "HOT"}}).encode('utf-8')
        response = requests.post(url, headers=headers, data=data, timeout=timeout)
        for res in response.json()['d']:
            if res['Type'] == 'HOT' and res['URL'] is not None:
                r = requests.get(f"https://www.travelweekly.com{res['URL']}", headers={'User-Agent': user_agent}, timeout=timeout)
                return extract_hotel_info(r.text)
        return {}
    except Exception:
        # Deliberately best-effort: any network, JSON or parsing failure means
        # "no data for this hotel". Catching Exception (not a bare except)
        # lets KeyboardInterrupt/SystemExit through, so a long scraping loop
        # can still be interrupted.
        return {}

def _parse_int(raw):
    """Return ``raw`` as an int after dropping '$', ',' and surrounding whitespace, or None if that fails."""
    if raw is None:
        return None
    try:
        return int(raw.replace(',', '').replace('$', '').strip())
    except ValueError:
        return None


def _value_after(text, label):
    """Return the stripped text that follows ``label`` in ``text``, or None when the label is absent."""
    pieces = text.split(label)
    return pieces[1].strip() if len(pieces) > 1 else None


def extract_hotel_info(html):
    """Scrape hotel attributes out of a travelweekly.com hotel page.

    Every field is parsed independently: a field that is missing or not in
    the expected format is simply left out, so one malformed value no longer
    discards the fields that did parse.

    Args:
        html: Markup of the hotel page.

    Returns:
        dict: Any subset of the keys below, added in this order:
        ``travelweekly_num_rooms``, ``travelweekly_min_rate``,
        ``travelweekly_max_rate`` (ints), then ``travelweekly_year_built``,
        ``travelweekly_num_floors`` (ints) and ``travelweekly_chain`` (str)
        in the order they appear on the page, then
        ``travelweekly_events_num_rooms`` and
        ``travelweekly_events_total_sqft`` (ints).
    """
    soup = BeautifulSoup(html, 'html.parser')
    info = {}

    def store_int(key, raw):
        # Only record values that actually parsed as whole numbers.
        value = _parse_int(raw)
        if value is not None:
            info[key] = value

    rooms_tag = soup.find('p', class_='hotel-rooms')
    if rooms_tag is not None:
        store_int('travelweekly_num_rooms', _value_after(rooms_tag.get_text(strip=True), 'Rooms:'))

    rates_tag = soup.find('p', class_='hotel-rates')
    if rates_tag is not None:
        rates = _value_after(rates_tag.get_text(strip=True), 'Rates:')
        if rates is not None:
            # Expected shape is "<min> - <max>"; a lone value fills only the minimum.
            bounds = rates.split('-')
            store_int('travelweekly_min_rate', bounds[0])
            if len(bounds) > 1:
                store_int('travelweekly_max_rate', bounds[1])

    # Note: 'Year Last Renovated:' is not extracted.
    details = soup.find('div', class_='hotel-information-details')
    if details is not None:
        for paragraph in details.find_all('p'):
            text = paragraph.get_text(strip=True)
            if 'Year Built:' in text:
                store_int('travelweekly_year_built', _value_after(text, 'Year Built:'))
            elif 'Number of Floors:' in text:
                store_int('travelweekly_num_floors', _value_after(text, 'Number of Floors:'))
            elif 'Chain:' in text:
                info['travelweekly_chain'] = _value_after(text, 'Chain:')

    events = soup.find('div', class_='event-space row')
    if events is not None:
        for paragraph in events.find_all('p'):
            text = paragraph.get_text(strip=True)
            # Independent checks: one paragraph may carry both labels.
            if 'Total number of meeting rooms:' in text:
                store_int('travelweekly_events_num_rooms', _value_after(text, 'Total number of meeting rooms:'))
            if 'Total event space:' in text:
                # Keep only the number in front of the "sq ..." unit.
                store_int('travelweekly_events_total_sqft', _value_after(text, 'Total event space:').split('sq')[0])
    return info


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the base hotel dataset and keep only rows with no missing values.
basic = pd.read_csv('data/final_data.csv').dropna()

In [None]:
# Quick sanity check: row count, column labels, column count.
print(basic.shape[0], basic.columns, basic.shape[1])

In [None]:
# Fixed order of the scraped values appended to every row. get_hotel() only
# returns the keys it found, so appending list(fetched.values()) directly
# would yield rows of differing length with values in shifting positions.
# NOTE(review): this is the order the keys are inserted when every field is
# present and the details appear as year/floors/chain -- confirm that any
# downstream column list matches.
travelweekly_columns = [
    'travelweekly_num_rooms',
    'travelweekly_min_rate',
    'travelweekly_max_rate',
    'travelweekly_year_built',
    'travelweekly_num_floors',
    'travelweekly_chain',
    'travelweekly_events_num_rooms',
    'travelweekly_events_total_sqft',
]

rows = []

# Enrich each hotel with Travel Weekly data; hotels with no data are skipped.
for position, (_, row) in enumerate(basic.iterrows()):
    # NOTE(review): underscores in the city are removed rather than replaced
    # by spaces (e.g. 'new_york' -> 'newyork') -- confirm this is intended.
    fetched = get_hotel(f"{row['name']} {row['city'].replace('_', '')}")
    if fetched:
        # Missing fields become None so every row has the same width.
        rows.append(row.to_list() + [fetched.get(column) for column in travelweekly_columns])
    # Count loop iterations, not index labels: labels have gaps after dropna().
    if position % 10 == 0:
        print(position, len(rows))