In [1]:
#
#  Fetches hotel details from Travel Weekly (travelweekly.com); the scraped
#  fields are used as additional features to help train the model.
#
# >>> get_hotel(hotel_name)
#

import json
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_hotel(hotel_name, timeout=30):
    """Look up a hotel on Travel Weekly and return its parsed details.

    The hotel name is sent to the site's autocomplete endpoint; the first
    suggestion of type 'HOT' that carries a URL is downloaded and handed to
    `extract_hotel_info`.

    Args:
        hotel_name: Free-text hotel name to search for. Only the part before
            the first ', an' is sent as the search term (presumably to drop
            brand suffixes such as ', an XYZ Hotel' -- confirm).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The dict produced by `extract_hotel_info`, or an empty dict when no
        matching hotel is suggested or when anything goes wrong (bad input,
        network failure, HTTP error status, unexpected response shape,
        parsing error).
    """
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
    try:
        url = 'https://www.travelweekly.com/AutoComplete.asmx/GetCompleteList'
        headers = {
            'content-type': 'application/json; charset=UTF-8',
            'origin': 'https://www.travelweekly.com',
            'referer': 'https://www.travelweekly.com/Hotels/Destinations,United-States',
            'user-agent': user_agent,
        }
        data = json.dumps({"Request": {"Term": hotel_name.split(', an')[0], "Context": "HOT"}}).encode('utf-8')
        # Without a timeout a single stalled connection would block the caller forever.
        response = requests.post(url, headers=headers, data=data, timeout=timeout)
        response.raise_for_status()
        for res in response.json()['d']:
            if res['Type'] == 'HOT' and res['URL'] is not None:
                r = requests.get(
                    f"https://www.travelweekly.com{res['URL']}",
                    headers={'User-Agent': user_agent},
                    timeout=timeout,
                )
                # Do not parse an HTTP error page as if it were a hotel page.
                r.raise_for_status()
                return extract_hotel_info(r.text)
        return {}
    except Exception:
        # Deliberate best-effort: callers treat {} as "no data for this hotel",
        # so a failed lookup must never abort a long scraping run.
        return {}

def _segment_after(text, label):
    """Return what follows the first `label` in `text` (up to any repeat of the label), or None if absent."""
    parts = text.split(label)
    return parts[1] if len(parts) > 1 else None


def _parse_int(raw, drop=','):
    """Convert scraped text such as '1,234' to int after removing the `drop` characters; pd.NA on failure."""
    if raw is None:
        return pd.NA
    for ch in drop:
        raw = raw.replace(ch, '')
    try:
        return int(raw.strip())
    except ValueError:
        return pd.NA


def extract_hotel_info(html):
    """Parse a Travel Weekly hotel page into a flat dict of features.

    Args:
        html: HTML source of the hotel page.

    Returns:
        A dict with the keys below, always in this order. Numeric fields are
        ints, or pd.NA when the field is missing from the page or cannot be
        parsed; 'travelweekly_chain' is a string ('' when missing).
        Each field is parsed independently, so one malformed value no longer
        prevents the remaining fields from being extracted.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Key order matters: the caller appends list(info.values()) to each row
    # positionally, so it must line up with the column labels used later.
    info = {
        'travelweekly_year_built': pd.NA,
        'travelweekly_num_floors': pd.NA,
        'travelweekly_num_rooms': pd.NA,
        'travelweekly_chain': '',
        'travelweekly_events_num_rooms': pd.NA,
        'travelweekly_events_total_sqft': pd.NA,
        'travelweekly_min_rate': pd.NA,
        'travelweekly_max_rate': pd.NA,
    }
    # 'Year Last Renovated:' is not extracted; its parsing was disabled in the
    # original code and no 'travelweekly_year_renovated' key is produced.

    rooms_p = soup.find('p', class_='hotel-rooms')
    if rooms_p is not None:
        info['travelweekly_num_rooms'] = _parse_int(
            _segment_after(rooms_p.get_text(strip=True), 'Rooms:'))

    rates_p = soup.find('p', class_='hotel-rates')
    if rates_p is not None:
        rates = _segment_after(rates_p.get_text(strip=True), 'Rates:')
        if rates is not None:
            # Expected shape is '<min>-<max>'; a single value fills only the minimum.
            bounds = rates.split('-')
            info['travelweekly_min_rate'] = _parse_int(bounds[0], drop=',$')
            if len(bounds) > 1:
                info['travelweekly_max_rate'] = _parse_int(bounds[1], drop=',$')

    details = soup.find('div', class_='hotel-information-details')
    if details is not None:
        for paragraph in details.find_all('p'):
            text = paragraph.get_text(strip=True)
            if 'Year Built:' in text:
                info['travelweekly_year_built'] = _parse_int(_segment_after(text, 'Year Built:'))
            elif 'Number of Floors:' in text:
                info['travelweekly_num_floors'] = _parse_int(_segment_after(text, 'Number of Floors:'))
            elif 'Chain:' in text:
                info['travelweekly_chain'] = _segment_after(text, 'Chain:').strip()

    events = soup.find('div', class_='event-space row')
    if events is not None:
        for paragraph in events.find_all('p'):
            text = paragraph.get_text(strip=True)
            # Independent checks (not elif): one paragraph may carry both labels.
            if 'Total number of meeting rooms:' in text:
                info['travelweekly_events_num_rooms'] = _parse_int(
                    _segment_after(text, 'Total number of meeting rooms:'))
            if 'Total event space:' in text:
                # Keep only the number in front of the 'sq ...' unit suffix.
                space = _segment_after(text, 'Total event space:')
                info['travelweekly_events_total_sqft'] = _parse_int(space.split('sq')[0])
    return info


In [4]:
import json
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the prepared hotel dataset, discarding every row that has a missing value.
basic = pd.read_csv('data/final_data.csv').dropna()

In [5]:
# Sanity check of the cleaned frame: row count, column labels, column count.
print(basic.shape[0], basic.columns, basic.shape[1])

2987 Index(['name', 'city', 'Width', 'Height', 'Brightness', 'Color', 'Dominate',
       'description', 'stars', 'price', 'rating', 'reviews', 'image', 'images',
       'categoryReviews', 'userReviews', 'staff', 'facilities', 'cleanliness',
       'comfort', 'valueForMoney', 'location', 'albuquerque', 'austin',
       'baltimore', 'bonston', 'calgary', 'charlotte', 'chicago', 'columbus',
       'dallas', 'denver', 'detroit', 'el_paso', 'fort_worth', 'fresno',
       'houston', 'indianapolis', 'jacksonville', 'kansas', 'las_vegas',
       'los_angeles', 'louiseville', 'memphis', 'mesa', 'milwaukee',
       'montreal', 'nashville', 'new_york', 'oklahoma_city', 'orlando',
       'philadelphia', 'phoenix', 'portland', 'sacramento', 'san_antonio',
       'san_diego', 'san_francisco', 'san_jose', 'seattle', 'toronto',
       'tucson', 'washington_dc', 'c1', 'c2', 'c3', 'd1', 'd2', 'd3'],
      dtype='object') 69


In [7]:
# Build one enriched row per hotel for which Travel Weekly returned data:
# original columns + Travel Weekly fields + two values from the per-city listings file.
rows = []

# The listings file is re-read only when the city changes between consecutive rows.
last_city = ''
hotels_data = []

for index, row in basic.iterrows():
    # NOTE(review): '_' is removed rather than replaced by a space, so a city such as
    # 'new_york' is searched as 'newyork' -- confirm this is the intended search term.
    tw_fetched = get_hotel(f"{row['name']} {row['city'].replace('_', '')}")
    # Defaults keep every row the same width even when the hotel is absent from the
    # listings file or has no room entries: [number of room types, first room's sqft].
    more_booking = [pd.NA, pd.NA]

    # get_hotel returns {} when nothing was found or the lookup failed; skip those hotels.
    if tw_fetched:
        if row['city'] != last_city:
            last_city = row['city']
            # Explicit UTF-8: the features matched below contain the non-ASCII 'feet²'.
            with open(f"data/listings/{row['city']}_hotels.json", 'r', encoding='utf-8') as file:
                hotels_data = json.load(file)

        for hotel in hotels_data:
            if hotel['name'] == row['name']:
                rooms = hotel['rooms']
                # Only the first listed room is inspected for a size feature; an empty
                # room list yields no size instead of raising IndexError.
                sqft = [int(f.split('feet')[0]) for f in rooms[0]['features'] if 'feet²' in f] if rooms else []
                more_booking = [len(rooms), sqft[0] if sqft else pd.NA]
                break

        rows.append(row.to_list() + list(tw_fetched.values()) + more_booking)
    # Progress report; `index` is the DataFrame index label, not a running counter.
    if index % 10 == 0:
        print(index, len(rows))

0 1
10 8
20 15
30 22
40 26
50 29
60 31
70 36
80 40
90 42
100 44
110 50
120 52
130 55
140 55
150 57
160 62
170 64
180 67
190 68
200 71
210 73
220 77
230 81
240 84
250 84
260 88
270 88
280 88
290 88
300 88
310 88
320 88
330 88
340 89
350 89
360 89
370 89
380 89
390 89
400 89
410 89
420 89
430 89
440 89
450 89
460 90
470 90
480 90
490 91
500 91
510 92
520 93
530 99
540 102
550 106
560 112
570 115
580 115
590 115
600 115
610 115
620 115
630 115
640 116
650 116
660 116
670 120
680 123
690 124
700 128
710 131
720 135
730 138
740 144
750 151
760 153
770 156
780 160
790 164
800 167
810 173
820 175
830 180
840 183
850 188
860 194
870 198
880 202
890 203
900 204
910 207
920 210
930 211
940 212
950 213
960 213
970 213
980 217
990 222
1000 226
1010 228
1020 231
1030 232
1040 232
1050 232
1060 232
1070 232
1080 232
1090 232
1100 233
1110 233
1120 234
1130 234
1140 235
1150 235
1160 235
1170 235
1180 235
1190 235
1200 235
1210 237
1220 237
1230 237
1240 240
1250 243
1260 248
1270 250
1280 253
1290 2

In [8]:
# Column labels for `rows`, in the order each row was assembled: the columns of
# `basic`, then the Travel Weekly fields, then the two listings-derived values.
# The base names are taken from `basic` itself so they cannot drift from the CSV schema.
# These must stay in the same order as the dict returned by extract_hotel_info,
# because its values were appended to each row positionally.
travelweekly_columns = [
    'travelweekly_year_built', 'travelweekly_num_floors', 'travelweekly_num_rooms',
    'travelweekly_chain', 'travelweekly_events_num_rooms', 'travelweekly_events_total_sqft',
    'travelweekly_min_rate', 'travelweekly_max_rate',
]
listing_columns = ['num_room_types', 'standard_room_sqft']

df = pd.DataFrame(rows, columns=list(basic.columns) + travelweekly_columns + listing_columns)
# No numeric-only selection or dropna() is applied here: non-numeric columns and
# rows with missing values are written out as they are.
df.to_csv('custom_dataset.csv')