In [1]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import re
from lxml import html
import time
from collections import defaultdict
import random
import itertools
import json
from scipy.stats import ttest_ind

In [2]:
def parse_prices(request):
    """Extract one price per ``<div class="price">`` element of a results page.

    Args:
        request: Response-like object whose ``content`` attribute holds the
            page HTML (despite the name, this is the value returned by the
            HTTP ``get`` call, not the outgoing request).

    Returns:
        list[int]: The prices in document order, each rounded to the nearest
        whole dollar. Elements that contain no ``$`` amount are skipped
        instead of raising ``IndexError``.
    """
    tree = html.fromstring(request.content)
    prices = []
    for element in tree.xpath('//div[@class="price"]'):
        # Digits with optional comma grouping and an optional decimal part.
        # Trailing punctuation such as "$141." is not captured as a decimal.
        amounts = re.findall(r'\$([0-9][0-9,]*(?:\.[0-9]+)?)', element.text_content())
        if not amounts:
            continue
        # Keep the last amount in the element -- presumably the current price
        # when a struck-through original price precedes it (TODO confirm
        # against the page markup).
        # Assumes "," is a thousands separator and "." the decimal point.
        normalised = amounts[-1].replace(',', '')
        prices.append(round(float(normalised)))
    return prices

def requests_retry_session(retries=10, backoff_factor=2,
                           status_forcelist=(500, 502, 504), session=None):
    """Return a ``requests`` session that retries failed requests.

    Called without arguments it behaves as before: up to 10 retries in total
    (also 10 each for read and connect errors), a backoff factor of 2, and
    retries on HTTP 500, 502 and 504.

    Args:
        retries: Value used for the ``total``, ``read`` and ``connect``
            limits of the retry policy.
        backoff_factor: Backoff factor handed to ``Retry``.
        status_forcelist: HTTP status codes handed to ``Retry`` as the
            statuses that force a retry.
        session: Optional existing session to configure; a new
            ``requests.Session`` is created when omitted.

    Returns:
        The configured session.
    """
    if session is None:
        session = requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    # Mount on both schemes so plain-HTTP URLs get the same retry policy.
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

def get_prices(language, timeout=30):
    """Fetch the Toronto hotel search page for ``language`` and parse its prices.

    The search URL (destination, dates, occupancy) is fixed; only the
    ``accept-language`` header varies between calls. The Chrome build number
    in the user-agent is randomised on every call.

    Args:
        language: Locale tag such as ``'en-CA'``; sent as the preferred
            language, with its first two characters as a ``q=0.9`` fallback.
        timeout: Seconds handed to ``requests`` as the request timeout, so a
            stalled connection fails (and is retried by the session's retry
            policy) instead of hanging the whole run.

    Returns:
        list[int]: The prices extracted from the page by ``parse_prices``.
    """
    url = 'https://ca.hotels.com/search.do?resolved-location=CITY%3A1636865%3AUNKNOWN%3AUNKNOWN&destination-id=1636865&q-destination=Toronto,%20Ontario,%20Canada&q-check-in=2020-12-20&q-check-out=2021-01-09&q-rooms=1&q-room-0-adults=2&q-room-0-children=0'
    headers = {
        'authority': 'ca.hotels.com',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.{str(round(1000*random.random(), 4))} Safari/537.36",
        'sec-fetch-dest': 'document',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'referer': 'https://ca.hotels.com/',
        'accept-language': f"{language},{language[0:2]};q=0.9",
    }

    # A new session is built per call; the context manager closes it (and its
    # connection pool) so thousands of calls do not accumulate open sockets.
    with requests_retry_session() as session:
        response = session.get(url=url, headers=headers, timeout=timeout)
    # The body was fully read by get() (no streaming), so it is safe to parse
    # after the session has been closed.
    return parse_prices(response)

def randomise_order(l):
    """Return a new list holding the items of ``l`` in random order.

    Args:
        l: Any iterable; it is copied, never modified.

    Returns:
        list: A shuffled copy of the items.
    """
    shuffled = list(l)
    # random.shuffle permutes in place in linear time, replacing the previous
    # sort on a throwaway random key.
    random.shuffle(shuffled)
    return shuffled

def random_sleep():
    """Block for a randomly chosen duration between 2 and 6 seconds."""
    delay_seconds = random.uniform(2, 6)
    time.sleep(delay_seconds)

In [3]:
# Locale tags to compare; each is sent as the Accept-Language preference.
languages = ['en-CA', 'fr-CA']
# Maps each language to a list of price lists (one inner list per request).
prices = defaultdict(list)

In [4]:
# Number of sampling rounds; every round issues one request per language.
N_ROUNDS = 5000

for i in range(N_ROUNDS):
    # Visit the languages in a fresh random order each round -- presumably so
    # that neither language is systematically requested first.
    for language in randomise_order(languages):
        prices[language].append(get_prices(language))
        random_sleep()
    # Derive the percentage from N_ROUNDS (not a hand-computed divisor) and
    # count completed rounds, so the indicator finishes at 100.00%.
    print("{:.2f}%".format(100 * (i + 1) / N_ROUNDS), end="\r", flush=True)

99.98%

In [5]:
# Merge the per-request price lists of each language into one flat list.
flat_prices = {
    language: [price for batch in batches for price in batch]
    for language, batches in prices.items()
}
# Mean price per language, rounded to two decimal places.
average_prices = {
    language: round(sum(values) / len(values), 2)
    for language, values in flat_prices.items()
}

In [6]:
# Show the rounded mean price for each language.
print(average_prices)

{'en-CA': 141.82, 'fr-CA': 141.7}


In [7]:
# Two-sample t-test comparing the flattened en-CA and fr-CA price lists.
# NOTE(review): the flattened lists pool prices from many repeated requests
# for the same search, so the observations are likely not independent --
# confirm the test's assumptions hold before relying on the p-value.
stat, p = ttest_ind(flat_prices['en-CA'], flat_prices['fr-CA'])
print('Statistics=%.3f, p=%.3f' % (stat, p))

Statistics=0.235, p=0.815


In [11]:
# Dump every collected price list as indented JSON (the output is very long).
print(json.dumps(prices, indent=2))

{
  "en-CA": [
    [
      288,
      220,
      101,
      66,
      98,
      139,
      161,
      116,
      111,
      66,
      169,
      143
    ],
    [
      288,
      220,
      101,
      66,
      98,
      139,
      161,
      116,
      111,
      66,
      169,
      143
    ],
    [
      288,
      220,
      101,
      66,
      98,
      139,
      161,
      116,
      111,
      66,
      169,
      143
    ],
    [
      98,
      288,
      220,
      66,
      66,
      139,
      161,
      94,
      129,
      116,
      101,
      111
    ],
    [
      288,
      220,
      101,
      111,
      66,
      139,
      161,
      59,
      143,
      98,
      118,
      116
    ],
    [
      288,
      220,
      101,
      111,
      66,
      139,
      161,
      143,
      59,
      98,
      118,
      116
    ],
    [
      288,
      220,
      101,
      111,
      66,
      139,
      161,
      59,
      143,
      98,
      118,
      94
    ],
