In [1]:
import json
from bs4 import BeautifulSoup
import os
import re
from hugchat import hugchat
from hugchat.login import Login

# Paths (relative to this notebook) of the four source catalog JSON files that
# extract_info_to_analyze_repetitions() reads. adestic_v1 points at the file
# whose name carries the "_old" suffix; adestic_v2 at the one without it.
segittur_2022 = "../Catalogs/Segittur/catalog_segittur_2022.json"
segittur_2023 = "../Catalogs/Segittur/catalog_segittur_2023.json"
adestic_v1 = "../Catalogs/Adestic/catalogo_soluciones_turisticas_old.json"
adestic_v2 = "../Catalogs/Adestic/catalogo_soluciones_turisticas.json"


In [15]:
# Creates a JSON object keyed by solution name; each value holds the company,
# description, page and solution type(s) of that solution.
def extract_info_to_analyze_repetitions(file, isAdestic=False):
    """Write a reduced ("small") version of a catalog JSON file.

    The input file must contain a top-level ``"companies"`` mapping whose keys
    are company names (plain text or an HTML fragment with an ``<h2>``) and
    whose values hold a ``"products"`` list. Every product is read through its
    ``product_name``, ``product_description``, ``solType`` and ``page`` keys.

    The result is written to
    ``Small Versions/Small_Version_<basename of file>`` as a JSON object
    ``{solution name: {"company", "product_description", "page", "solType"}}``.
    Solutions sharing the same name overwrite each other (last one wins).

    Args:
        file: Path of the catalog JSON file to reduce.
        isAdestic: When True, everything from ``"(Original name (Spanish)"``
            onwards is removed from the solution name.
    """
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    solutions = {}

    for company, company_data in data['companies'].items():
        # Company keys may be HTML fragments; the readable name is in the <h2>.
        if re.search('<.*?>', company):
            company_name = BeautifulSoup(company, 'html.parser').h2.text
        else:
            company_name = company

        for solution in company_data['products']:
            raw_name = solution['product_name']
            if re.search('<.*?>', raw_name):
                solution_name = BeautifulSoup(raw_name, 'html.parser').h1.text
            else:
                solution_name = raw_name

            # Always derive the description from the current product. The tag
            # stripping is a no-op for plain text, so descriptions without
            # markup no longer end up undefined or inherited from the
            # previously processed product.
            solution_description = re.sub('<.*?>', '', solution['product_description'])

            if isAdestic:
                solution_name = solution_name.split('(Original name (Spanish)')[0].strip()

            # solType is either already a list or an HTML list (<li> items).
            if type(solution['solType']) is list:
                sol_types = solution['solType']
            else:
                soup = BeautifulSoup(solution['solType'], 'html.parser')
                sol_types = [li.text for li in soup.find_all('li')]

            solutions[solution_name] = {
                "company": company_name.strip(),
                "product_description": solution_description,
                "page": solution['page'],
                "solType": sol_types
            }

    # Make sure the output folder exists so a fresh checkout can run this cell.
    os.makedirs("Small Versions", exist_ok=True)
    with open(f"Small Versions/Small_Version_{os.path.basename(file)}", "w", encoding='utf-8') as f:
        f.write(json.dumps(solutions, indent=4))

In [16]:
# Build the reduced version of every catalog. Only the Adestic catalogs need
# the "(Original name (Spanish)" suffix removed from their solution names.
for catalog_path, is_adestic in (
    (adestic_v1, True),
    (adestic_v2, True),
    (segittur_2022, False),
    (segittur_2023, False),
):
    extract_info_to_analyze_repetitions(catalog_path, is_adestic)

In [21]:
# Paths of the reduced catalogs, following the naming scheme used by
# extract_info_to_analyze_repetitions():
# "Small Versions/Small_Version_<original file name>".
small_segittur_2022 = "Small Versions/Small_Version_catalog_segittur_2022.json"
small_segittur_2023 = "Small Versions/Small_Version_catalog_segittur_2023.json"
small_adestic_v1 = "Small Versions/Small_Version_catalogo_soluciones_turisticas_old.json"
small_adestic_v2 = "Small Versions/Small_Version_catalogo_soluciones_turisticas.json"

In [17]:
# Correct the solution types stored in the small catalog versions

def correct_sol_types(catalog, incorrect_types_file):
    """Rewrite a small catalog file, replacing known-bad solution types.

    Args:
        catalog: Path of the small catalog JSON file; it is read and then
            overwritten in place.
        incorrect_types_file: Path of a JSON file that is indexed by the
            incorrect type and yields its replacement.

    Every entry of each solution's ``solType`` list that appears in the
    replacements file is swapped for the mapped value; other entries are kept
    untouched and the order is preserved.
    """
    with open(catalog, 'r', encoding='utf-8') as catalog_handle:
        data_catalog = json.load(catalog_handle)

    with open(incorrect_types_file, 'r', encoding='utf-8') as replacements_handle:
        incorrect_types = json.load(replacements_handle)

    for details in data_catalog.values():
        details['solType'] = [
            incorrect_types[current] if current in incorrect_types else current
            for current in details['solType']
        ]

    with open(catalog, 'w', encoding='utf-8') as catalog_handle:
        catalog_handle.write(json.dumps(data_catalog, indent=4))
        
# correct_sol_types(small_segittur_2022, "Small Versions/Incorrect_Sol_Types_Segittur 2022.json")
# correct_sol_types(small_adestic_v1, "Small Versions/Incorrect_Sol_Types_Adestic V1.json")
# correct_sol_types(small_adestic_v2, "Small Versions/Incorrect_Sol_Types_Adestic V2.json")
    

In [17]:
def check_if_type_exists(catalog, sol_types, sol_type):
    """Tell whether ``sol_type`` is a known type of ``catalog``.

    Args:
        catalog: Key used to look up each association entry.
        sol_types: Iterable of association entries indexed by catalog key;
            the looked-up value is either a list of type names or a single
            type name string.
        sol_type: The type name to search for.

    Returns:
        True as soon as one entry lists ``sol_type`` (list value) or equals it
        (string value); False otherwise.
    """
    def entry_matches(entry):
        known = entry[catalog]
        if type(known) is list:
            return sol_type in known
        if type(known) is str:
            return sol_type == known
        return False

    return any(entry_matches(entry) for entry in sol_types)

def check_for_incorrect_sol_types(catalog, solutions_file):
    """Collect the solution types of a small catalog that are not recognised.

    Args:
        catalog: Catalog name; used both as the lookup key inside
            "Type_of_Solution_Association (EN).json" and in the output
            file name.
        solutions_file: Path of the small catalog JSON file to inspect.

    Each distinct type for which check_if_type_exists() returns False is
    stored once, in order of first appearance, and the resulting list is
    written to "Small Versions/Incorrect_Sol_Types_<catalog>.json".
    """
    with open(solutions_file, 'r', encoding='utf-8') as solutions_handle:
        solutions = json.load(solutions_handle)
    with open("Type_of_Solution_Association (EN).json", 'r', encoding='utf-8') as association_handle:
        sol_types = json.load(association_handle)

    incorrect_sol_types = []

    for details in solutions.values():
        for candidate in details['solType']:
            if check_if_type_exists(catalog, sol_types, candidate):
                continue
            if candidate in incorrect_sol_types:
                continue
            incorrect_sol_types.append(candidate)

    with open(f"Small Versions/Incorrect_Sol_Types_{catalog}.json", "w", encoding='utf-8') as report_handle:
        report_handle.write(json.dumps(incorrect_sol_types, indent=4))
                

In [18]:
# check_for_incorrect_sol_types("Segittur 2022", small_segittur_2022)
# check_for_incorrect_sol_types("Segittur 2023", small_segittur_2023)
# check_for_incorrect_sol_types("Adestic V1", small_adestic_v1)
# check_for_incorrect_sol_types("Adestic V2", small_adestic_v2)

In [22]:
def _load_json(path):
    """Parse the UTF-8 JSON file at ``path`` and close the file afterwards."""
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


# Load the four reduced catalogs into memory ({solution name: details}).
sol_segittur_2022 = _load_json(small_segittur_2022)
sol_segittur_2023 = _load_json(small_segittur_2023)
sol_adestic_v1 = _load_json(small_adestic_v1)
sol_adestic_v2 = _load_json(small_adestic_v2)

In [25]:
def compare_solution_by_name_and_company(solution, catalog_name, catalog_data, other_catalog_name, other_catalog_data, repeated_solutions):
    """Record ``solution`` as repeated when the other catalog has the same name and company.

    Args:
        solution: Solution name, a key of ``catalog_data``.
        catalog_name: Name of the catalog being iterated.
        catalog_data: Mapping ``{solution name: {"company", "page", ...}}``.
        other_catalog_name: Name of the catalog to compare against.
        other_catalog_data: Same structure as ``catalog_data``.
        repeated_solutions: Accumulator, updated in place, of the form
            ``{solution name: {"company": ..., <catalog name>: <page>, ...}}``.

    Returns:
        ``(repeated_solutions, it_was_added)`` where ``it_was_added`` is True
        when the solution exists in both catalogs under the same company
        (company names are compared ignoring surrounding whitespace).
    """
    if solution not in other_catalog_data:
        return repeated_solutions, False

    company = catalog_data[solution]['company']
    if company.strip() != other_catalog_data[solution]['company'].strip():
        return repeated_solutions, False

    print(f"\n{solution} from {catalog_name} has the same name and company in {other_catalog_name}")

    entry = repeated_solutions.setdefault(solution, {'company': company})
    # Register the page of BOTH catalogs. Previously, once a solution had been
    # recorded for one pair of catalogs, a match against a further catalog was
    # silently dropped because only catalog_name was checked.
    entry.setdefault(catalog_name, catalog_data[solution]['page'])
    entry.setdefault(other_catalog_name, other_catalog_data[solution]['page'])

    return repeated_solutions, True


"""
For example with this case, where I am iterating over Segittur 2022 and want the equivalent Sol types of Accessibility:
{
        "Segittur 2022": "Accessibility",
        "Segittur 2023": "",
        "Adestic V1": "Accessibility",
        "Adestic V2": "Accessibility"
}
I want to get all associations where there is type in Segittur 2023 and not in Segittur 2022 like this:
{
        "Segittur 2022": "",
        "Segittur 2023": "Intelligent Signage/Totems/Tourism Signage",
        "Adestic V1": "",
        "Adestic V2": ""
}.
With this I can get all the solutions that might be the same but are included in different categories since the categories are not all the same between catalogs.
"""
def get_not_common_solutions_types(catalog_name, other_catalog_name, types_association):
    """Return the associations that have a type only on the other catalog's side.

    An association is kept when its value for ``catalog_name`` is the empty
    string while its value for ``other_catalog_name`` is not, e.g. with
    ``catalog_name="Segittur 2022"`` and ``other_catalog_name="Segittur 2023"``:

        {"Segittur 2022": "", "Segittur 2023": "Intelligent Signage/Totems/Tourism Signage", ...}

    Args:
        catalog_name: Catalog whose type must be missing ("").
        other_catalog_name: Catalog whose type must be present.
        types_association: List of association dicts indexed by catalog name.

    Returns:
        A list with the matching association dicts (same objects as in
        ``types_association``), in their original order.
    """
    return [
        association
        for association in types_association
        if association[catalog_name] == "" and association[other_catalog_name] != ""
    ]

def get_possible_equivalent_solutions_types(sol_types, catalog_name, types_association):
    """Find the type association of the first recognised solution type.

    The types in ``sol_types`` are tried in order; for the first one that is
    equal to (string value) or contained in (list value) an association's
    ``catalog_name`` entry, a copy of that association is returned. In the
    copy, every catalog whose type is "" gets the list of associations that
    exist only in that catalog (see get_not_common_solutions_types), since a
    matching solution there may be filed under a category this catalog lacks.

    ``types_association`` is never modified: writing the computed lists back
    into the shared association would erase the "" markers that later lookups
    depend on.

    Args:
        sol_types: Solution type names of one solution.
        catalog_name: Catalog the solution belongs to.
        types_association: List of association dicts indexed by catalog name.

    Returns:
        The resolved association dict, or ``{}`` when no type is recognised.
    """
    for sol_type in sol_types:

        for association in types_association:
            own_types = association[catalog_name]
            if (isinstance(own_types, list) and sol_type in own_types) or sol_type == own_types:
                resolved = dict(association)
                for key, value in association.items():
                    if value == "":
                        resolved[key] = [
                            dict(not_common)
                            for not_common in get_not_common_solutions_types(catalog_name, key, types_association)
                        ]

                return resolved
    return {}


def get_solutions_from_the_same_company_and_equivalent_type(company, other_catalog_data, other_catalog_name, sol_types_association):
    """Select the other catalog's solutions of ``company`` that have a compatible type.

    Args:
        company: Company name; compared with ``==`` against each solution's
            ``"company"`` value.
        other_catalog_data: Mapping ``{solution name: {"company", "solType", ...}}``.
        other_catalog_name: Name of the other catalog, used as lookup key.
        sol_types_association: Association dict as returned by
            get_possible_equivalent_solutions_types(); its value for
            ``other_catalog_name`` may be a type name, a list of type names,
            or a list of association dicts. May be ``{}``.

    Returns:
        ``{solution name: solution details}`` for every solution of the same
        company whose ``solType`` is compatible with the association; ``{}``
        when the association has no entry for ``other_catalog_name``.
    """
    # TODO: check whether the whole solution should be passed in, or only the company.

    # No association was found for the current solution's types, so there is
    # nothing to compare against (previously this raised KeyError).
    if other_catalog_name not in sol_types_association:
        return {}

    # Equivalent solution type of the other catalog from the Type_of_Solution_Association (EN).json file.
    sol_type_association = sol_types_association[other_catalog_name]
    is_list_association = isinstance(sol_type_association, list)
    is_list_of_str_association = is_list_association and all(isinstance(item, str) for item in sol_type_association)
    is_list_of_dict_association = is_list_association and all(isinstance(item, dict) for item in sol_type_association)

    def types_overlap(expected, actual):
        # Treat a single type name as a one-element list, then look for any shared name.
        expected_types = expected if isinstance(expected, list) else [expected]
        actual_types = actual if isinstance(actual, list) else [actual]
        return any(candidate in expected_types for candidate in actual_types)

    solutions = {}

    for other_solution_key, other_solution_value in other_catalog_data.items():
        if other_solution_value['company'] != company:
            continue

        # Solution type of the iterated solution from the other catalog
        sol_type = other_solution_value['solType']
        is_list_sol_type = isinstance(sol_type, list)

        is_compatible = (
            (is_list_of_str_association and is_list_sol_type and any(element in sol_type_association for element in sol_type))
            or (is_list_of_str_association and sol_type in sol_type_association)
            or (is_list_sol_type and sol_type_association in sol_type)
            or sol_type_association == sol_type
        )

        if not is_compatible and is_list_of_dict_association:
            # Each dict is a whole association; compare its type(s) for the
            # other catalog by overlap, because a solution's solType is a list
            # and would never equal a single type name.
            is_compatible = any(
                dict_association[other_catalog_name] == sol_type
                or types_overlap(dict_association[other_catalog_name], sol_type)
                for dict_association in sol_type_association
            )

        if is_compatible:
            solutions[other_solution_key] = other_solution_value

    return solutions



def check_for_repeated_solutions(catalogs):
    """Look for solutions that appear in more than one catalog.

    Args:
        catalogs: List of ``(catalog name, catalog data)`` tuples, where the
            data maps each solution name to its details. Catalog names are
            used as keys into "Type_of_Solution_Association (EN).json".

    For every solution of a catalog and every catalog that comes after it in
    the list:
      1. an exact match by name and company is recorded through
         compare_solution_by_name_and_company();
      2. otherwise the other catalog's solutions from the same company with a
         compatible type are printed as possible repetitions.

    Returns:
        The ``repeated_solutions`` dict built in step 1.
    """
    repeated_solutions = {}

    with open("Type_of_Solution_Association (EN).json", 'r', encoding='utf-8') as association_file:
        types_association = json.load(association_file)

    # Iterate over the catalogs
    for i, (catalog_name, catalog_data) in enumerate(catalogs):
        # A list with the remaining catalogs
        others_catalogs = catalogs[i+1:]

        # Iterate over the solutions of the current catalog
        for solution in catalog_data:
            # Type association for this specific solution: a dictionary with
            # one key per catalog holding that catalog's equivalent type(s).
            # Example for a Segittur 2022 solution typed
            # "Other Hardware / Software Solutions":
            # {
            #     "Segittur 2022": "Other Hardware / Software Solutions",
            #     "Segittur 2023": "Other solutions HW / SW",
            #     "Adestic V1": ["Software", "POS Software"],
            #     "Adestic V2": "Software"
            # }
            sol_types_association = get_possible_equivalent_solutions_types(catalog_data[solution]['solType'], catalog_name, types_association)

            # Iterate over the other catalogs to find repeated solutions
            for other_catalog_name, other_catalog_data in others_catalogs:
                # Solutions from the same company and with the same name
                repeated_solutions, it_was_added = compare_solution_by_name_and_company(solution, catalog_name, catalog_data, other_catalog_name, other_catalog_data, repeated_solutions)

                # Move on to the next catalog if a repeated solution was already found in the current other catalog.
                if it_was_added:
                    continue

                # Solutions from the same company and with compatible solution types. Then descriptions will be compared to see if they are the same.
                possible_repetitions = get_solutions_from_the_same_company_and_equivalent_type(catalog_data[solution]['company'], other_catalog_data, other_catalog_name, sol_types_association)

                if possible_repetitions != {}:
                    print(f"Possible repetitions from {other_catalog_name} of {solution}, with type {catalog_data[solution]['solType']} in {catalog_name}: {json.dumps(possible_repetitions, indent=4)}")

        # NOTE(review): this break stops after the first catalog, so only its
        # solutions are compared against the remaining catalogs. It looks like
        # a debugging leftover - confirm before removing.
        break

    return repeated_solutions

In [17]:
"""
Devolver um ficheiro com todos os repetidos assim
"TOURISM CONSULTANCY SERVICES": {
        "2023": 312,
        "2022": 218
    }
1. Verificar nome igual entre catálogos
    1.1 Verificar se empresa é igual
2. Buscar soluções com o mesmo criador e com o mesmo tipo de solução de outros catálogos.
   OU
   Buscar soluções com o mesmo criador mas com um tipo de solução que não existe no catálogo atual.
3. Utilizar hugchat para comparar descrições
"""


'\nDevolver um ficheiro com todos os repetidos assim\n"TOURISM CONSULTANCY SERVICES": {\n        "2023": 312,\n        "2022": 218\n    }\n1. Verificar nome igual entre catálogos\n    1.1 Verificar se empresa é igual\n2. Buscar soluções com o mesmo criador e com o mesmo tipo de solução de outros catálogos.\n   OU\n   Buscar soluções com o mesmo criador mas com um tipo de solução que não existe no catálogo atual.\n3. Utilizar hugchat para comparar descrições\n'

In [11]:
cookie_path_dir = "../cookies"
# NOTE(review): machine-specific absolute path, only needed by the
# commented-out login flow below. Written with forward slashes so the literal
# no longer contains invalid escape sequences ("\D", "\I", "\T", "\h"); the
# resulting value is the same as before.
login_path = "C:/Users/Diogo Cosme/Documents/ISCTE/Tese/huggingFace login.json"
# with open(login_path, 'r') as f:
#     login_data = json.load(f)
#     EMAIL = login_data['email']
#     PASSWD = login_data['password']
#     
# sign = Login(EMAIL, PASSWD)
# cookies = sign.login()
# sign.saveCookiesToDir(cookie_path_dir)

# Reuse the cookies saved by a previous login instead of logging in again.
with open(f"{cookie_path_dir}/dfmce@iscte-iul.pt.json", 'r') as cookies_file:
    cookies_dict = json.load(cookies_file)

chatbot = hugchat.ChatBot(cookies=cookies_dict)
# NOTE(review): 5 is presumably an index into the list returned by
# chatbot.get_available_llm_models() - confirm which model it selects.
it_worked = chatbot.switch_llm(5)
print(f"Switched to llm: {it_worked}")
query_result = chatbot.chat("What are you capable of?")
print(query_result)

# conversation_list = chatbot.get_remote_conversations(replace_conversation_list=True)
# 
# models = chatbot.get_available_llm_models()
# 
# print(conversation_list)
# 
# print(models)

Switched to llm: True
 I can carry out a wide range of tasks and activities, such as:

* Answering questions: I can provide information on a variety of topics, from general knowledge to specific queries.
* Setting reminders and alarms: I can help you remember important dates and appointments by setting reminders or alarms for you.
* Sending messages and making calls: I can send texts or emails, and make phone calls on your behalf.
* Providing news and weather updates: I can keep you informed about the latest news and weather conditions.
* Playing music and videos: I can play your favorite songs and videos, and even recommend new ones based on your preferences.
* Managing your calendar: I can schedule appointments, meetings, and events for you, and send you reminders when they're coming up.
* Providing navigation and traffic information: I can help you get to where you need to go by providing turn-by-turn directions and real-time traffic updates.
* And much more! The capabilities of vir

In [26]:
# Compare the four reduced catalogs. Each tuple is (catalog name, catalog data);
# the names are the keys used to index the type-association entries.
check_for_repeated_solutions([
    ("Segittur 2022", sol_segittur_2022),
    ("Segittur 2023", sol_segittur_2023),
    ("Adestic V1", sol_adestic_v1),
    ("Adestic V2", sol_adestic_v2)
])


GESTDOC360 from Segittur 2022 has the same name and company in Segittur 2023
Possible repetitions from Segittur 2023 of BALANCING TOURIST CAPACITY  VIA BLOCKCHAIN, with type ['Mobility', 'Other Hardware / Software Solutions'] in Segittur 2022: {
    "BALANCING TOURIST CAPACITY VIA BLOCKCHAIN": {
        "company": "AIR INSTITUTE",
        "product_description": "AIR Institute, winner of the Tourism Sector Challenge set by Alastria/TDDS during the T2020 Transfiere Forum in collaboration with the Consell of Formentera, has come up with a solution for managing tourist capacity. The aim of AIR Institute's proposal is to balance tourist flows and to promote the sustainable use of natural resources. The solution is based on a decentralised system that encourages residents and tourists to visit the least-visited areas by using blockchain and smart contracts.\nThe system works by employing oracles (a service by which a blockchain or smart contract feeds on external data outside the blockchain