In [188]:
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio of two sequences.

    The score is a float between 0 and 1, where 1.0 means the sequences are
    identical. No junk filter is applied (``isjunk`` is ``None``).
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()


# Sanity check: a single missing letter still scores close to 1.
similar("Субботники", "Суботники")

0.9473684210526315

In [189]:
from math import sin, cos, sqrt, atan2, radians

def distance_between_coordinates(c1: tuple[float, float], c2: tuple[float, float], radius: float = 6373.0) -> float:
    """Great-circle distance between two points, computed with the haversine formula.

    Args:
        c1: First point as (latitude, longitude) in degrees. Only indices 0 and 1
            are read, so any two-item sequence (tuple or list) works.
        c2: Second point, same layout as ``c1``.
        radius: Sphere radius; the result is expressed in the same unit.
            Defaults to 6373.0, an approximate Earth radius in kilometres.

    Returns:
        The distance along the sphere's surface, in the unit of ``radius``.
    """
    lat1, lon1 = radians(c1[0]), radians(c1[1])
    lat2, lon2 = radians(c2[0]), radians(c2[1])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return radius * c

# Sanity check: one degree of longitude at the latitude of Minsk.
distance_between_coordinates((53.9, 27.5667), (53.9, 28.5667))

65.53566951743105

In [190]:
import json

def _load_json(path):
    """Parse a UTF-8 encoded JSON file and return its content.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes instead of being left for the garbage collector.
    """
    with open(path, encoding='utf-8') as source_file:
        return json.load(source_file)


# One pre-processed dump per scraped source, read from the working directory.
globus_data = _load_json('globus_processed.json')
vedaj_data = _load_json('vedaj_processed.json')
bestbelarus_data = _load_json('bestbelarus_processed.json')
gotobelarus_data = _load_json('gotobelarus_processed.json')
tropinki_data = _load_json('tropinki_processed.json')

In [191]:
# Number of records loaded from each source, in load order.
len(globus_data), len(vedaj_data), len(bestbelarus_data), len(gotobelarus_data), len(tropinki_data)

(3554, 349, 48, 268, 85)

## Globus

In [192]:
# Show the first globus record to illustrate the source schema.
globus_data[0]

{'name': 'Абрамовщина 3-я',
 'location': ['Гродненская область', 'Сморгонский район'],
 'coords': [54.60153, 26.53344],
 'sights': [{'name': 'оборонительные сооружения 1-й мировой войны',
   'image': 'https://orda.of.by/.ga/a/abramovsch/oboron_main/nf/abramovschina_3-ya-kuhnya_1mir-06.05.06-d-179.jpg',
   'rating': 1,
   'coordinates': [54.605754, 26.531853],
   'tags': ['1-я мировая война']}],
 'url': {'href': 'https://globustut.by/abramovsch/index.htm',
  'text': 'Абрамовщина 3-я'},
 'rating': 1,
 'type': None}

In [193]:
def _globus_sight_record(sight):
    """Convert one globus sight into the unified sight schema.

    The tag list is copied so that later merge steps, which extend
    ``sight['tags']`` in place, do not also modify ``globus_data``.
    """
    sight_coordinates = sight.get('coordinates')
    return {
        'name': sight['name'],
        'description': None,
        'location': None,
        'coordinates': tuple(sight_coordinates) if sight_coordinates is not None else None,
        'tags': list(sight['tags']) if 'tags' in sight else [],
        'images': [
            {'url': sight['image'], 'source': 'globus'}
        ],
        'ratings': [
            {'rating': sight['rating'], 'source': 'globus'}
        ],
    }


# Seed the unified database: one record per globus place.
database = []
for globus_place in globus_data:
    sight_records = [_globus_sight_record(sight) for sight in globus_place['sights']]

    # Union of all sight tags. dict.fromkeys de-duplicates while keeping
    # first-seen order; list(set(...)) produced an order that changes between
    # interpreter runs because string hashing is randomized.
    place_tags = list(dict.fromkeys(
        tag for sight_record in sight_records for tag in sight_record['tags']
    ))

    place_coords = globus_place.get('coords')
    database.append({
        'name': globus_place['name'],
        'other_names': [],
        'description': None,
        'location': tuple(globus_place['location']),
        'type': globus_place['type'],
        'coordinates': tuple(place_coords) if place_coords is not None else None,
        'images': [],
        'tags': place_tags,
        'urls': [
            # Tagged copy, so the raw globus record is left unmodified.
            {**globus_place['url'], 'source': 'globus'}
        ],
        'ratings': [
            {'rating': globus_place['rating'], 'source': 'globus'}
        ],
        'sights': sight_records,
    })

In [194]:
# Show the first unified record built from the globus data.
database[0]

{'name': 'Абрамовщина 3-я',
 'other_names': [],
 'description': None,
 'location': ('Гродненская область', 'Сморгонский район'),
 'type': None,
 'coordinates': (54.60153, 26.53344),
 'images': [],
 'tags': ['1-я мировая война'],
 'urls': [{'href': 'https://globustut.by/abramovsch/index.htm',
   'text': 'Абрамовщина 3-я',
   'source': 'globus'}],
 'ratings': [{'rating': 1, 'source': 'globus'}],
 'sights': [{'name': 'оборонительные сооружения 1-й мировой войны',
   'description': None,
   'location': None,
   'coordinates': (54.605754, 26.531853),
   'tags': ['1-я мировая война'],
   'images': [{'url': 'https://orda.of.by/.ga/a/abramovsch/oboron_main/nf/abramovschina_3-ya-kuhnya_1mir-06.05.06-d-179.jpg',
     'source': 'globus'}],
   'ratings': [{'rating': 1, 'source': 'globus'}]}]}

## Vedaj

In [195]:
# Show the first vedaj record to illustrate the source schema.
vedaj_data[0]

{'name': 'Барановичи',
 'former_name': None,
 'location': ['Брестская область', 'Барановичский район'],
 'images': ['https://vedaj.by/images/cities/bre/baranaviczy/Baranowiczy/Baranowiczy1.jpg',
  'https://vedaj.by/images/cities/bre/baranaviczy/Baranowiczy/Baranowiczy13.jpg',
  'https://vedaj.by/images/cities/bre/baranaviczy/Baranowiczy/Baranowiczy14.jpg',
  'https://vedaj.by/images/cities/bre/baranaviczy/Baranowiczy/Baranowiczy15.jpg',
  'https://vedaj.by/images/cities/bre/baranaviczy/Baranowiczy/Baranowiczy16.jpg',
  'https://vedaj.by/images/cities/bre/baranaviczy/Baranowiczy/Baranowiczy2.jpg',
  'https://vedaj.by/images/cities/bre/baranaviczy/Baranowiczy/Baranowiczy3.jpg',
  'https://vedaj.by/images/cities/bre/baranaviczy/Baranowiczy/Baranowiczy4.jpg',
  'https://vedaj.by/images/cities/bre/baranaviczy/Baranowiczy/Baranowiczy5.jpg',
  'https://vedaj.by/images/cities/bre/baranaviczy/Baranowiczy/Baranowiczy6.jpg',
  'https://vedaj.by/images/cities/bre/baranaviczy/Baranowiczy/Baranowicz

In [196]:
# Minimum difflib ratio at which two names are treated as the same place/sight.
VEDAJ_NAME_THRESHOLD = 0.7

# Merge vedaj records into `database`: each record either enriches an existing
# place, enriches an existing sight, or becomes a new sight of an existing place.
for vedaj_place in vedaj_data:
    vedaj_name = vedaj_place['name']
    # Tolerate records without tags/images instead of raising KeyError.
    vedaj_tags = vedaj_place.get('tags') or []
    vedaj_images = [
        {'url': url, 'source': 'vedaj'}
        for url in (vedaj_place.get('images') or [])
    ]

    # Candidates are database places in the same (region, district).
    region_and_district = tuple(vedaj_place['location'][:2])
    place_candidates = [p for p in database if p['location'] == region_and_district]
    place = next((p for p in place_candidates if p['name'] == vedaj_name), None)

    if place is None and place_candidates:
        # If not found by exact name, reason can be:
        # 1. Either slightly different name
        best_score, best_candidate = max(
            ((similar(candidate['name'], vedaj_name), candidate) for candidate in place_candidates),
            key=lambda scored: scored[0],
        )
        if best_score > VEDAJ_NAME_THRESHOLD:
            place = best_candidate
            place['other_names'].append(vedaj_name)

    if place is not None:
        place['tags'].extend(vedaj_tags)
        place['images'].extend(vedaj_images)
        # Tagged copy, so the raw vedaj record is left unmodified.
        place['urls'].append({**vedaj_place['url'], 'source': 'vedaj'})
        continue

    # 2. Or place from vedaj is actually a sight of some place
    last_part = vedaj_place['location'][-1]
    place_candidates = [p for p in database if p['name'] == last_part]

    if len(place_candidates) == 0:
        print(f"Place {vedaj_place['name']} has no candidates")
        continue

    if len(place_candidates) > 1:
        print(f"Place {vedaj_place['name']} has multiple candidates")
        continue

    place = place_candidates[0]
    place_sights = place['sights']

    # `default` covers places that have no sights yet; the previous
    # sort-and-take-first approach raised IndexError in that case.
    best_sight_score, best_sight = max(
        ((similar(sight['name'], vedaj_name), sight) for sight in place_sights),
        key=lambda scored: scored[0],
        default=(0.0, None),
    )

    if best_sight is not None and best_sight_score > VEDAJ_NAME_THRESHOLD:
        print(f"Updating sight {best_sight['name']} of {place['name']} with {vedaj_place['name']}")
        best_sight['tags'].extend(vedaj_tags)
        best_sight['images'].extend(vedaj_images)
    else:
        print(f"Adding new sight to {place['name']}: {vedaj_place['name']}")
        place_sights.append({
            'name': vedaj_name,
            'description': None,
            'location': None,
            'coordinates': vedaj_place['coordinates'] if 'coordinates' in vedaj_place else None,
            # Copy so the new sight does not share a list with the raw record.
            'tags': list(vedaj_tags),
            # A new sight keeps only the first vedaj image (none if there are no images).
            'images': vedaj_images[:1],
            'ratings': [],
        })

Adding new sight to Минск: Лошицкий усадебно-парковый комплекс в Минске
Adding new sight to Минск: Площадь Победы
Adding new sight to Минск: Костел Святой Троицы в Минске
Adding new sight to Минск: Центральный детский парк в Минске
Adding new sight to Минск: Дом-музей I съезда РСДРП
Adding new sight to Минск: Парк имени 900-летия города Минска
Adding new sight to Минск: Минский зоопарк
Adding new sight to Минск: Чижовка-Арена
Adding new sight to Гродно: Фарный костел Святого Франциска Ксаверия в Гродно
Adding new sight to Гродно: Костел Обретения Святого Креста и монастырь бернардинцев в Гродно
Adding new sight to Будслав: Будславский фест


## Gotobelarus

In [197]:
# Show the first gotobelarus record to illustrate the source schema.
gotobelarus_data[0]

{'name': 'Минск',
 'description': '',
 'coords': [53.8966, 27.5504],
 'type': 'place',
 'images': ['https://34travel.me/media/upload/images/MAP/Belarus/minsk_05.jpg',
  'https://34travel.me/media/upload/images/MAP/Belarus/minsk_03.jpg'],
 'urls': [{'href': 'https://34travel.me/gotobelarus/post/mensk',
   'text': 'Минск'},
  {'href': 'https://34travel.me/gotobelarus/post/marshrut-po-evreyskomu-minsku',
   'text': 'История, воплощенная в архитектуре. Маршрут по еврейскому Минску'},
  {'href': 'https://34travel.me/gotobelarus/post/34-dostoprimechatelnosti-minska',
   'text': '34 достопримечательности Минска'},
  {'href': 'https://34travel.me/gotobelarus/post/minsk-audiogid',
   'text': '«Минск для новичков»: аудиогид'}],
 'rating': 4}

In [198]:
# Only database places closer than this are considered as match candidates.
GOTOBELARUS_SEARCH_RADIUS_KM = 50
# Minimum difflib ratio at which two names are treated as the same place.
GOTOBELARUS_NAME_THRESHOLD = 0.7

# Merge gotobelarus records of type 'place' into `database`; records that match
# nothing nearby are added as new places.
for gotobelarus_place in gotobelarus_data:
    if gotobelarus_place['type'] != 'place':
        continue
    if len(gotobelarus_place['urls']) == 0:
        continue

    goto_name = gotobelarus_place['name']
    goto_coords = gotobelarus_place['coords']
    goto_images = [
        {'url': url, 'source': 'gotobelarus'}
        for url in gotobelarus_place['images']
    ]
    goto_urls = [
        {'href': url['href'], 'text': url['text'], 'source': 'gotobelarus'}
        for url in gotobelarus_place['urls']
    ]
    goto_rating = {'rating': gotobelarus_place['rating'], 'source': 'gotobelarus'}

    # A record without coordinates simply has no nearby candidates.
    place_candidates = [
        p for p in database
        if p['coordinates'] is not None and goto_coords is not None
        and distance_between_coordinates(p['coordinates'], goto_coords) < GOTOBELARUS_SEARCH_RADIUS_KM
    ]
    place = next(
        (p for p in place_candidates if p['name'] == goto_name or goto_name in p['other_names']),
        None,
    )

    # Fuzzy fallback; skipped when there are no candidates, because max() over
    # an empty list raises ValueError.
    if place is None and place_candidates:
        best_score, best_candidate = max(
            ((similar(candidate['name'], goto_name), candidate) for candidate in place_candidates),
            key=lambda scored: scored[0],
        )
        if best_score > GOTOBELARUS_NAME_THRESHOLD:
            place = best_candidate
            place['other_names'].append(goto_name)

    if place is not None:
        if gotobelarus_place['description']:
            place['description'] = gotobelarus_place['description']
        place['images'].extend(goto_images)
        place['urls'].extend(goto_urls)
        place['ratings'].append(goto_rating)
    else:
        print(f"Adding new place {gotobelarus_place['name']}")
        new_place = {
            'name': goto_name,
            'other_names': [],
            # Carry the source description over, as is done for matched places.
            'description': gotobelarus_place['description'] or None,
            'location': (),
            # Same key as the globus-derived records ('type', not 'place_type').
            'type': None,
            'coordinates': tuple(goto_coords) if goto_coords is not None else None,
            'images': goto_images,
            'tags': [],
            # Flat list of url dicts, matching every other record.
            'urls': goto_urls,
            'ratings': [goto_rating],
            'sights': [],
        }
        # The record was previously built and announced but never stored.
        database.append(new_place)

Adding new place Минское городище
Adding new place Набережная
Adding new place Каптаруны


## Bestbelarus

In [199]:
# Show the first bestbelarus record to illustrate the source schema.
bestbelarus_data[0]

{'name': '«Линия Сталина» историко-культурный комплекс',
 'coords': [54.05774, 27.291237],
 'location': ['Минская обл.', 'Минский р-н'],
 'address': [],
 'town': 'Лошаны',
 'town_type': 'аг.',
 'image': 'https://bestbelarus.by/upload/resize_cache/iblock/8b7/358_228_2/8b7e4299c6735d9660bfba6c66472616.jpg',
 'url': {'href': 'https://bestbelarus.by/objects/all/liniya-stalina-istoriko-kulturnyy-kompleks/',
  'text': '«Линия Сталина» историко-культурный комплекс'},
 'sights': [{'name': 'Территория',
   'images': ['https://bestbelarus.by/upload/resize_cache/iblock/b21/0_510_2a18e1f56f4eaac9195ab4f8530310a0f/b21cf3ae179fa09bee35abc7c1bf569a.jpg',
    'https://bestbelarus.by/upload/resize_cache/iblock/b21/0_510_2a18e1f56f4eaac9195ab4f8530310a0f/b21cf3ae179fa09bee35abc7c1bf569a.jpg',
    'https://bestbelarus.by/upload/resize_cache/iblock/b5d/0_510_2a18e1f56f4eaac9195ab4f8530310a0f/b5d5c50e6d35f6a310db52f4c3dfeb69.jpg',
    'https://bestbelarus.by/upload/resize_cache/iblock/b5d/0_510_2a18e1f56f4

In [200]:
# Only database places closer than this are considered as match candidates.
BESTBELARUS_SEARCH_RADIUS_KM = 50
# Sights are merged only on a near-exact name match.
BESTBELARUS_SIGHT_THRESHOLD = 0.95

# Merge bestbelarus records: attach the record's URL to the matching place and
# fold its sights into that place's sights.
for bestbelarus_place in bestbelarus_data:
    if bestbelarus_place['coords'] is None:
        continue
    place_candidates = [
        p for p in database
        if p['coordinates'] is not None
        and distance_between_coordinates(p['coordinates'], bestbelarus_place['coords']) < BESTBELARUS_SEARCH_RADIUS_KM
    ]

    # First try the record's own name (case-insensitive) or a known alias.
    bestbelarus_name = bestbelarus_place['name']
    place = next((
        p for p in place_candidates
        if p['name'].lower() == bestbelarus_name.lower() or bestbelarus_name in p['other_names']
    ), None)

    if place is None:
        # Otherwise fall back to the town the record is located in.
        town = bestbelarus_place.get('town')
        place = next((
            p for p in place_candidates if town and town.lower() == p['name'].lower()
        ), None)
        if place is None:
            print("Place not found: ", bestbelarus_place['name'], bestbelarus_place['location'])
            continue

    # Tagged copy, so the raw bestbelarus record is left unmodified.
    place['urls'].append({**bestbelarus_place['url'], 'source': 'bestbelarus'})

    for sight in bestbelarus_place['sights']:
        sight_images = [
            {'url': image, 'source': 'bestbelarus'}
            for image in sight['images']
        ]

        # Re-scored on every iteration because sights appended below become
        # candidates for the following ones.
        best_score, existing_sight = max(
            ((similar(known_sight['name'], sight['name']), known_sight) for known_sight in place['sights']),
            key=lambda scored: scored[0],
            default=(0.0, None),
        )

        if existing_sight is not None and best_score > BESTBELARUS_SIGHT_THRESHOLD:
            existing_sight['images'].extend(sight_images)
        else:
            place['sights'].append({
                'name': sight['name'],
                'description': '',
                'location': None,
                'coordinates': sight['coordinates'] if 'coordinates' in sight else None,
                # Copy so the new sight does not share a list with the raw record.
                'tags': list(sight['tags']) if 'tags' in sight else [],
                'images': sight_images,
                'ratings': [],
            })

Place not found:  Национальный парк «Припятский» ['Гомельская обл.', 'Петриковский р-н']


## Tropinki

In [201]:
# Show one tropinki record (index 8) to illustrate the source schema.
tropinki_data[8]

{'name': 'Туровский луг и станция кольцевания',
 'url': {'href': 'https://tropinki.by/gomelskaya-oblast/turovskij-lug-i-stanciya-kolcevaniya',
  'text': 'Туровский луг и станция кольцевания'},
 'image': 'https://tropinki.by/media/cache/cf/59/cf5966b8262e51ed7090a84211fd86da.webp',
 'location': ['Гомельская область',
  'Житковичский район',
  'г. Туров',
  'ул. Новая д. 48'],
 'coords': [52.072851, 27.752283]}

In [202]:
# Only database places closer than this are considered as match candidates.
TROPINKI_SEARCH_RADIUS_KM = 50
# A record with no name match is attached as a sight to a place this close.
TROPINKI_NEARBY_KM = 3

# Merge tropinki records: a record whose address mentions a known place adds its
# URL and image to that place; otherwise it becomes a sight of a very close place.
for tropinki_place in tropinki_data:
    location = tropinki_place['location']
    coords = tropinki_place['coords']
    candidates = [
        place for place in database
        if place['coordinates'] and coords
        and distance_between_coordinates(place['coordinates'], coords) < TROPINKI_SEARCH_RADIUS_KM
    ]

    matched_place = None
    is_nearby = False
    for place in candidates:
        # A candidate matches when its name is a whitespace-separated token of
        # any address part (e.g. 'г. Туров' contains 'Туров').
        # NOTE(review): when several candidates match, the last one wins, as in
        # the original loop - confirm this is intended.
        if any(place['name'] in location_part.split() for location_part in location):
            matched_place = place

    if not matched_place:
        if not candidates:
            # No coordinates, or nothing within the search radius: min() over an
            # empty list would raise ValueError, so report and skip.
            print(f"Place {tropinki_place['name']} has no candidates")
            continue

        distance, closest_place = min(
            ((distance_between_coordinates(p['coordinates'], coords), p) for p in candidates),
            key=lambda scored: scored[0],
        )

        if distance < TROPINKI_NEARBY_KM:
            is_nearby = True
            matched_place = closest_place
        else:
            print(f"Place {tropinki_place['name']} has no candidates")
            print(f"Closest place is {closest_place['name']} with distance {distance}")
            continue

    # Tagged copy, so the raw tropinki record is left unmodified.
    matched_place['urls'].append({**tropinki_place['url'], 'source': 'tropinki'})

    tropinki_image = {
        'url': tropinki_place['image'],
        'source': 'tropinki'
    }
    if is_nearby:
        matched_place['sights'].append({
            'name': tropinki_place['name'],
            'description': '',
            'location': tropinki_place['location'],
            'coordinates': tropinki_place['coords'],
            'tags': [],
            'images': [tropinki_image],
            'ratings': [],
        })
    else:
        matched_place['images'].append(tropinki_image)

Place Родник «Большой Болцикский» has no candidates
Closest place is Константиново with distance 4.069734463186039
Place Родники Погребищенские: природная жемчужина Логойского района has no candidates
Closest place is Юрковичи with distance 3.2387417305967956
Place Заказник «Выдрица» и экотуристическая база «Уречье» has no candidates
Closest place is Добровольща with distance 7.62420438537921
Place Экскурсия на козеферму «Дак» has no candidates
Closest place is Юцки with distance 4.442499370710324
Place Заказная стоянка Бали на озере Укля has no candidates
Closest place is Наволока with distance 4.30086442409236
Place Пойменные луга Припяти has no candidates
Closest place is Погост with distance 9.007335399613831
Place Наблюдаем за ревущими оленями в Национальном парке «Припятский» has no candidates
Closest place is Дорошевичи with distance 3.82183243618424
Place Козьянский заказник: космические ландшафты в Беларуси has no candidates
Closest place is Оболь with distance 9.1955817411333

In [203]:
import json 

# Write the full merged database as indented JSON, keeping non-ASCII text readable.
with open('database.json', mode='w', encoding='utf-8') as database_file:
    json.dump(database, database_file, indent=2, ensure_ascii=False)

In [204]:
# Place / sight fields that are dropped from the export when they hold no data.
PLACE_LIST_KEYS = ('other_names', 'tags', 'urls', 'sights', 'images', 'ratings')
PLACE_SCALAR_KEYS = ('description', 'type')
SIGHT_LIST_KEYS = ('tags', 'images', 'ratings')
SIGHT_SCALAR_KEYS = ('description', 'location', 'coordinates')


def _without_empty_fields(record, list_keys, scalar_keys):
    """Return a shallow copy of `record` without its empty optional fields.

    Keys in `list_keys` are dropped when the value is empty or None; keys in
    `scalar_keys` are dropped when the value is '' or None. All other keys are
    kept as they are, and key order is preserved.
    """
    compact = {}
    for key, value in record.items():
        if key in list_keys and not value:
            continue
        if key in scalar_keys and (value is None or value == ''):
            continue
        compact[key] = value
    return compact


def _compress_sight(sight):
    """Return a compact copy of a sight, ignoring images that have no URL."""
    images_with_url = [
        image for image in (sight.get('images') or []) if image['url'] is not None
    ]
    return _without_empty_fields(
        {**sight, 'images': images_with_url}, SIGHT_LIST_KEYS, SIGHT_SCALAR_KEYS
    )


def _compress_place(place):
    """Return a compact copy of a place, or None when it should not be exported.

    A place is skipped when it has no sights and its only URL comes from
    globus, i.e. no source contributed anything beyond the bare globus entry.
    """
    urls = place.get('urls') or []
    sights = place.get('sights') or []
    if not sights and len(urls) == 1 and urls[0]['source'] == 'globus':
        return None

    compact = _without_empty_fields(place, PLACE_LIST_KEYS, PLACE_SCALAR_KEYS)
    if sights:
        compact['sights'] = [_compress_sight(sight) for sight in sights]
    return compact


# Build the UI export from compact copies. `database` itself is left untouched,
# so this cell can be re-run; deleting keys in place made a second run fail
# with KeyError, as did a place whose 'urls' list was empty.
compressed_database = []
places_removed = 0
place_id = 0
for place in database:
    compact_place = _compress_place(place)
    if compact_place is None:
        places_removed += 1
        print(f"Removing place {place['name']}")
        continue

    # Ids are consecutive over the exported places only.
    compact_place["id"] = place_id
    compressed_database.append(compact_place)
    place_id += 1

print(f"Removed {places_removed} places")

with open('../../UI/static/data/places.json', 'w', encoding='utf-8') as f:
    json.dump(compressed_database, f, ensure_ascii=False)

Removing place Адамарин
Removing place Адамово
Removing place Аксеньковичи
Removing place Александрина
Removing place Александрия
Removing place Андроново
Removing place Бахотец
Removing place Белая
Removing place Белая Липа
Removing place Белая Лужа
Removing place Бельчица
Removing place Бервищи
Removing place Бердовщина
Removing place Березвечье
Removing place Березовка
Removing place Блошники
Removing place Бобы
Removing place Божий Дар
Removing place Бозок
Removing place Бол. Воля
Removing place Бол. Изва
Removing place Болонов Селец
Removing place Борково
Removing place Боровляны
Removing place Бояры
Removing place Бричицы
Removing place Будревичи
Removing place Буй
Removing place Буйки
Removing place Бутишки
Removing place Бутьки
Removing place Велец
Removing place Веславово
Removing place Видлин
Removing place Вильчицы
Removing place Вишенька
Removing place Войкалы
Removing place Волчковичи
Removing place Воробьевичи
Removing place Вороничи
Removing place Вошковцы
Removing place