In [43]:
import requests

from bs4 import BeautifulSoup

In [44]:
def get_list_of_places():
    """Parse the saved "all places" index page into a list of place records.

    The page is read from the local file ``Вся Беларусь.html`` (decoded as
    Windows-1251) instead of being downloaded. The live page it was exported
    from is ``https://globustut.by/area_pl_all.htm``.

    The third ``<table>`` of the document is treated as the data table; its
    first four rows and its last row are skipped. Every remaining row is read
    as up to ``PLACES_PER_ROW`` places laid out side by side, each occupying
    ``COLS_PER_PLACE`` consecutive ``<td>`` cells: title/link, photo count,
    rating image, and one cell that is not used here.

    Slots that are incomplete (too few cells, or no link in the title cell)
    are skipped instead of aborting the whole parse.

    Returns:
        list[dict]: one dict per place with the keys ``name``, ``url``,
        ``rating`` (the ``alt`` text of the rating image, or ``None`` when
        the image is missing) and ``photo_count`` (raw cell text).
    """
    # Table layout constants; hoisted so they are not re-bound on every row.
    COLS_PER_PLACE = 4
    PLACES_PER_ROW = 4
    # Only the first three cells of a slot are actually read below.
    USED_COLS = 3

    # Read the already exported HTML instead of hitting the site.
    with open('Вся Беларусь.html', 'r', encoding='Windows-1251') as f:
        content = f.read()

    soup = BeautifulSoup(content, 'html.parser')
    data_table = soup.find('html').find_all('table')[2]
    rows = data_table.find_all('tr')[4:-1]

    places = []
    for row in rows:
        columns = row.find_all('td')

        for place_id in range(PLACES_PER_ROW):
            start = place_id * COLS_PER_PLACE
            place_cols = columns[start:start + COLS_PER_PLACE]
            if len(place_cols) < USED_COLS:
                # The row ran out of cells: no further places in this row.
                break

            title_cell, photo_cell, rating_cell = place_cols[:USED_COLS]

            link = title_cell.find('a')
            href = link.get('href') if link is not None else None
            if not href:
                # Padding / empty slot without a place link.
                continue

            rating_img = rating_cell.find('img')
            place_rating = rating_img.get('alt') if rating_img is not None else None

            places.append({
                "name": title_cell.text.strip(),
                # Keep only the place's base URL; 'index.htm' is re-appended on fetch.
                "url": href.replace('index.htm', ''),
                "rating": place_rating,
                "photo_count": photo_cell.text,
            })
    return places

In [39]:
def _parse_props(text):
    """Extract ``(property, value)`` pairs from the "property: value" lines of ``text``."""
    props = []
    for line in text.split('\n'):
        line = line.strip().replace('\xa0', ' ')
        # Split on the FIRST colon only: values may themselves contain colons
        # (times, URLs), which previously raised "too many values to unpack".
        prop, sep, value = line.partition(':')
        if not sep:
            continue
        props.append((prop.strip(), value.strip()))
    return props


def get_place_info(place_url, timeout=30):
    """Download a place page and extract its coordinates, location and sights.

    Args:
        place_url: Base URL of the place. ``"index.htm"`` is appended
            directly, and relative image paths are appended to it as well,
            so it is expected to end with ``/``.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot hang the caller indefinitely.

    Returns:
        dict with the keys:
            ``coords``   - list of stripped, comma-separated parts of the
                           map link text, or ``None`` if no such link exists;
            ``location`` - text following the place name in the shortest
                           ``<div>`` that contains the place name;
            ``sights``   - list of ``{"name", "image", "props"}`` dicts, where
                           ``props`` is a list of ``(property, value)`` tuples
                           and ``image`` may be ``None``;
            ``name``     - text of the ``<font class="hPage">`` element.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        Other exceptions (e.g. ``AttributeError``, ``ValueError``) may still
        propagate when the page does not have the expected structure.
    """
    result = requests.get(place_url + "index.htm", timeout=timeout)
    # Fail early on 4xx/5xx instead of trying to parse an error page.
    result.raise_for_status()
    place_content = result.content.decode('Windows-1251')

    soup = BeautifulSoup(place_content, 'html5lib')
    table = soup.body.find('table')

    # The coordinates are the text of the link whose alt text mentions the map.
    map_marker = "(на «подвижной» карте «Глобуса Беларуси»)"
    coords_link = next(
        (el for el in table.find_all('a') if "alt" in el.attrs and map_marker in el['alt']),
        None,
    )
    if coords_link is not None:
        coords = [c.strip() for c in coords_link.text.split(',')]
    else:
        coords = None

    place_name = table.find('font', {'class': 'hPage'}).text
    # The innermost div mentioning the place name is the shortest one.
    location_div = min(
        (div for div in table.find_all('div') if place_name in div.text),
        key=lambda d: len(d.text),
    )
    location_text = location_div.text.strip()[len(place_name):]

    sights = []
    sights_divs = [
        div for div in soup.body.find_all('div')
        if 'id' in div.attrs
        and div['id'] != 'GATag_WholeObj_index'
        and 'GATag_WholeObj' in div['id']
    ]
    for sight_div in sights_divs:
        sight_id = sight_div['id'].replace('GATag_WholeObj_', '')

        # A sight without a main-image span yields image=None instead of
        # aborting the whole place with StopIteration.
        main_image_id = f"GATag_MainImgs_{sight_id}"
        main_image = next(
            (span for span in sight_div.find_all('span')
             if 'id' in span.attrs and span['id'] == main_image_id),
            None,
        )
        image_element = main_image.find('img') if main_image is not None else None
        if image_element is not None:
            src = image_element['src']
            # Relative paths are resolved against the place's base URL.
            image_src = src if src.startswith('http') else place_url + src
        else:
            image_src = None

        sight_info_table = sight_div.find('table', recursive=False)
        sight_name_element = sight_info_table.find('font', {"class": "subhPage"})
        if sight_name_element is None:
            sight_name_element = sight_info_table.find('font', {"class": "subhPageStrach"})
        sight_name = sight_name_element.text.strip()

        # The first nested table holds the title; the rest hold the properties.
        info_tables = sight_info_table.find_all('table')[1:]

        props = []
        for info_table in info_tables:
            props.extend(_parse_props(info_table.text))

        sights.append({"name": sight_name, "image": image_src, "props": props})

    return {
        "coords": coords,
        "location": location_text,
        "sights": sights,
        "name": place_name
    }

In [2]:
import json

# Load the previously saved list of places (UTF-8 JSON) into `places`.
with open('places.json', 'r', encoding='utf-8') as f:
    places = json.loads(f.read())

In [40]:
# Fill in 'info' for every place that does not have it yet. A failure on one
# place is printed and that place is left without 'info', so a later run of
# this cell retries it.
for place in places:
    if 'info' not in place:
        print(place['url'])
        try:
            place_info = get_place_info(place['url'])
        except Exception as e:
            print(e)
        else:
            place['info'] = place_info

https://globustut.by/ilya/
https://globustut.by/star_sverz/
https://globustut.by/staroboris/
https://globustut.by/strune/
https://globustut.by/volchkovic/
https://globustut.by/koptevka/
https://globustut.by/gajna/
https://globustut.by/gonchary_lid/
https://globustut.by/gorodische_min/
https://globustut.by/lebedevo/
https://globustut.by/yazyl/
https://globustut.by/dovbuchki/
https://globustut.by/dostoevo/


In [42]:
import json

# Serialize BEFORE opening the file: mode 'w' truncates places.json at once,
# so an error raised during serialization would otherwise leave the saved
# results empty or half-written.
places_json = json.dumps(places, indent=4, ensure_ascii=False)

with open('places.json', 'w', encoding='utf-8') as f:
    f.write(places_json)