In [54]:
import re
import fitz

def check_stt_title_conditions_EU_2022(span_dict):
    """Tell whether a text span is styled like an STT title in the EU 2022 catalogue.

    A title span uses the font "OpenSans-Bold" at size 10.5 with color value 0,
    and its text must not contain a "digit.digit" sequence.

    Args:
        span_dict: span dictionary exposing 'font', 'size', 'color' and 'text'.

    Returns:
        True when every condition holds, False otherwise.
    """
    has_title_style = (
        span_dict['font'] == "OpenSans-Bold"
        and span_dict['size'] == 10.5
        and span_dict['color'] == 0
    )
    if not has_title_style:
        return False
    # Texts containing a "digit.digit" sequence are rejected even when styled as titles.
    return re.search(r'\d\.\d', span_dict['text']) is None

def check_stt_title_conditions_EU_2023(span_dict):
    """Tell whether a text span is styled like an STT title in the EU 2023 catalogue.

    Args:
        span_dict: span dictionary exposing 'font', 'size' and 'color'.

    Returns:
        True when the font is "Calibri-Bold", the size is 12.0 and the color
        value is 0; False otherwise.
    """
    required_style = (
        ('font', "Calibri-Bold"),
        ('size', 12.0),
        ('color', 0),
    )
    # `all` stops at the first mismatch, so later keys are only read when needed.
    return all(span_dict[key] == wanted for key, wanted in required_style)

def check_stt_description_conditions_EU_2022(span_dict):
    """Tell whether a text span is styled like description text in the EU 2022 catalogue.

    Args:
        span_dict: span dictionary exposing 'font', 'size' and 'color'.

    Returns:
        True when the font is "OpenSans", the size is 10.5 or 10.0 and the
        color value is 0; False otherwise.
    """
    if span_dict['font'] != "OpenSans":
        return False
    if span_dict['size'] not in (10.5, 10.0):
        return False
    return span_dict['color'] == 0

def check_stt_description_conditions_EU_2023(span_dict):
    """Tell whether a text span is styled like description text in the EU 2023 catalogue.

    Args:
        span_dict: span dictionary exposing 'font', 'size' and 'color'.

    Returns:
        True when the font is "Calibri", the size is 12.0 or 10.0 and the
        color value is 0; False otherwise.
    """
    if span_dict['font'] != "Calibri":
        return False
    if span_dict['size'] not in (12.0, 10.0):
        return False
    return span_dict['color'] == 0

def check_stt_url_conditions_EU_2022(span_dict):
    """Tell whether a text span is styled like a URL in the EU 2022 catalogue.

    Args:
        span_dict: span dictionary exposing 'font', 'size' and 'color'.

    Returns:
        True when the font is "OpenSans-Light", the size is 10.5 and the
        color value is 1393044; False otherwise.
    """
    required_style = (
        ('font', "OpenSans-Light"),
        ('size', 10.5),
        ('color', 1393044),
    )
    # `all` stops at the first mismatch, so later keys are only read when needed.
    return all(span_dict[key] == wanted for key, wanted in required_style)

def check_stt_url_conditions_EU_2023(span_dict):
    """Tell whether a text span is styled like a URL in the EU 2023 catalogue.

    Args:
        span_dict: span dictionary exposing 'font', 'size' and 'color'.

    Returns:
        True when the font is "Calibri", the size is 12.0 and the color
        value is 287681; False otherwise.
    """
    required_style = (
        ('font', "Calibri"),
        ('size', 12.0),
        ('color', 287681),
    )
    # `all` stops at the first mismatch, so later keys are only read when needed.
    return all(span_dict[key] == wanted for key, wanted in required_style)


class STT:
    """One catalogue entry reconstructed from PDF text spans.

    Attributes are filled incrementally while the PDF is scanned: the name and
    city come from the title span(s), the description and URLs from the spans
    that follow, and the images from image blocks matched to the entry.
    """

    def __init__(self, title, bbox, initial_pdf_page):
        """Create an entry from its (first) title span.

        Args:
            title: title text. When it contains ";" (checked first) or ",",
                the part before the first separator becomes the name and the
                part after the last separator becomes the city.
            bbox: bounding box of the title span; only ``bbox[1]`` is kept, as
                the entry's vertical position ``y``.
            initial_pdf_page: page index on which the title was found.
        """
        if ";" in title:
            parts = title.split(";")
        elif "," in title:
            parts = title.split(",")
        else:
            parts = []
        has_city = len(parts) > 1
        self.name = parts[0] if has_city else title
        # Strip surrounding whitespace instead of blindly dropping the first
        # character, which truncated the city when no space followed the separator.
        self.city = parts[-1].replace(":", "").strip() if has_city else ""
        self.description = ""
        self.urls = []
        self.images = []
        self.y = float(bbox[1])
        self.initial_pdf_page = initial_pdf_page
        self.final_pdf_page = initial_pdf_page

    def set_description(self, description, current_pdf_page):
        """Append a span of description text and record the page it came from.

        A newline is inserted before the new text when the accumulated
        description ends with "." and the new text does not start with a space.
        An empty ``description`` appends nothing but still updates the final page.
        """
        starts_new_paragraph = (
            self.description != ""
            and description != ""  # guard: indexing an empty span would raise IndexError
            and description[0] != " "
            and self.description[-1] == "."
        )
        if starts_new_paragraph:
            self.description += "\n"
        self.description += description
        self.set_final_pdf_page(current_pdf_page)

    def set_url(self, url, current_pdf_page):
        """Register a URL span, joining fragments of a URL split across spans.

        Text starting with "http" or "www" opens a new URL; any other text is
        treated as the continuation of the last URL. Empty or whitespace-only
        text is ignored and does not update the final page.
        """
        if not url.strip():
            return
        if url.startswith(("http", "www")) or not self.urls:
            # With no URL to continue yet, the fragment starts a new entry
            # instead of raising IndexError on ``self.urls[-1]``.
            self.urls.append(url)
        else:
            self.urls[-1] += url
        self.set_final_pdf_page(current_pdf_page)

    def set_image(self, image_data):
        """Attach one image payload to this entry."""
        self.images.append(image_data)

    def set_final_pdf_page(self, final_pdf_page):
        """Record the last page index on which content of this entry was seen."""
        self.final_pdf_page = final_pdf_page

    def __str__(self):
        return f"\nSTT: {self.name}\nDescription: {self.description}\nURL: {self.urls}\nNumber of images: {len(self.images)}\nCity: {self.city}\nInitial page:{self.initial_pdf_page}\nFinal page:{self.final_pdf_page}"

# Module-level list of the STT objects found so far. The helper functions
# below read and mutate it, and each extraction cell rebinds it to a new list.
stts_founded = []

def append_stt(span_dict, pag_number):
    """Create an STT from a title span and add it to the end of `stts_founded`.

    Args:
        span_dict: span dictionary; its 'text' and 'bbox' entries are used.
        pag_number: page index on which the span was found.
    """
    new_stt = STT(span_dict['text'], span_dict['bbox'], pag_number)
    stts_founded.append(new_stt)
    
def remove_chars_from_city(city):
    """Clean a raw city fragment taken from a title span.

    Removes every ":" and trims leading/trailing whitespace. Inner spaces are
    preserved so that multi-word city names (e.g. "San Sebastián") are not
    collapsed into a single word.

    Args:
        city: raw text containing the city name.

    Returns:
        The cleaned city name.
    """
    return city.replace(":", "").strip()


def span_is_title(span_dict, pag_number):
    """Handle a span already recognised as title text.

    When the most recent STT has no description yet, the span is treated as a
    continuation of that STT's title (titles can be split across several
    spans); otherwise a new STT is started.

    Args:
        span_dict: span dictionary; 'text' and 'bbox' are read.
        pag_number: page index on which the span was found.
    """
    if not stts_founded or stts_founded[-1].description != "":
        append_stt(span_dict, pag_number)
        return

    # From here on the title is split across several spans.
    current = stts_founded[-1]
    text = span_dict['text']
    has_colon = ":" in text

    # Handles the Venice case on page 12: a complete "name; city:" title that
    # sits more than one line below the previous one replaces it.
    # The distance between consecutive lines is about 15.00.
    if (has_colon and ";" in text
            and float(span_dict['bbox'][1]) - current.y > 16.00):
        stts_founded.remove(current)
        append_stt(span_dict, pag_number)
        return

    if not has_colon:
        current.name += text
        return

    # A ":" means the current span contains the city.
    for separator in (";", ","):
        if separator in text:
            pieces = text.split(separator)
            current.name += pieces[0]
            current.city = remove_chars_from_city(pieces[1])
            return

    # The span holds only the city; the separator was left in the name.
    current.name = current.name.replace(";", "")
    current.city = remove_chars_from_city(text)
    
def analyze_span_EU_2022(span_dict, pag_number):
    """Classify one EU 2022 span and feed it to the STT being built.

    Title spans go to `span_is_title`. Description spans are appended to the
    latest STT's description. URL spans are appended to the description and
    also registered as a URL. Anything else is ignored.

    Args:
        span_dict: span dictionary from the PDF text extraction.
        pag_number: page index on which the span was found.
    """
    print("span_dict: ", span_dict)
    if check_stt_title_conditions_EU_2022(span_dict):
        span_is_title(span_dict, pag_number)
        return
    # Text seen before any title belongs to an STT that started earlier
    # (its description ends on the first scanned page), so it is skipped.
    if not stts_founded:
        return

    is_description = check_stt_description_conditions_EU_2022(span_dict)
    is_url = not is_description and check_stt_url_conditions_EU_2022(span_dict)
    if not (is_description or is_url):
        return

    current = stts_founded[-1]
    text = span_dict['text']
    current.set_description(text, pag_number)
    if is_url:
        current.set_url(text, pag_number)
        
def analyze_span_EU_2023(span_dict, pag_number):
    """Classify one EU 2023 span and feed it to the STT being built.

    Title spans go to `span_is_title`. Description spans are appended to the
    latest STT's description. URL spans are appended to the description and
    also registered as a URL. Anything else is ignored.

    Args:
        span_dict: span dictionary from the PDF text extraction.
        pag_number: page index on which the span was found.
    """
    # Diagnostic: report "Calibri" text with color 0 whose size is not 12.0.
    looks_like_odd_description = (
        span_dict['font'] == "Calibri"
        and span_dict['size'] != 12.0
        and span_dict['color'] == 0
    )
    if looks_like_odd_description:
        print(f"New description size: {span_dict['size']}, {span_dict['text']}")

    if check_stt_title_conditions_EU_2023(span_dict):
        span_is_title(span_dict, pag_number)
        return
    # Text seen before any title belongs to an STT that started earlier
    # (its description ends on the first scanned page), so it is skipped.
    if not stts_founded:
        return

    is_description = check_stt_description_conditions_EU_2023(span_dict)
    is_url = not is_description and check_stt_url_conditions_EU_2023(span_dict)
    if not (is_description or is_url):
        return

    current = stts_founded[-1]
    text = span_dict['text']
    current.set_description(text, pag_number)
    if is_url:
        current.set_url(text, pag_number)
        
def find_correspondent_stt_to_image(img_block, current_page_number):
    """Pick the STT that an image block should be attached to.

    `stts_founded` is searched from the most recent entry backwards:

    1. an STT that starts on the current page and whose `y` is smaller than
       the image's `bbox[1]`;
    2. failing that, the image is taken to belong to the last STT that starts
       on the previous page.

    Args:
        img_block: image block dictionary; only `img_block['bbox'][1]` is read.
        current_page_number: page index on which the image was found.

    Returns:
        The matching STT, or None when neither search finds one.
    """
    same_page_match = next(
        (
            candidate
            for candidate in reversed(stts_founded)
            if current_page_number == candidate.initial_pdf_page
            and candidate.y < img_block['bbox'][1]
        ),
        None,
    )
    if same_page_match is not None:
        return same_page_match

    return next(
        (
            candidate
            for candidate in reversed(stts_founded)
            if current_page_number - 1 == candidate.initial_pdf_page
        ),
        None,
    )



In [None]:
# Extract the STTs of the EU 2022 catalogue (currently only page index 7).
eu_2022 = "../Catalogues/Catalogue European.pdf"

# Start from an empty list so that re-running the cell does not accumulate entries.
stts_founded = []
with fitz.open(eu_2022) as doc:
    for page_number in range(7, 8):
        page = doc[page_number]
        for block in page.get_text('dict')['blocks']:
            if "lines" in block:
                # Text block: classify every span.
                for line in block['lines']:
                    for span in line['spans']:
                        analyze_span_EU_2022(span, page_number)
            elif block['type'] == 1:
                # Image block. Image blocks may not appear in reading order:
                # when the latest STT is not above the image, the image is
                # attached to the STT before it.
                if stts_founded and stts_founded[-1].y < block['bbox'][1]:
                    stts_founded[-1].set_image(block['image'])
                else:
                    print(f"Number of STT founded: {len(stts_founded)}")
                    if len(stts_founded) >= 2:
                        stts_founded[-2].set_image(block['image'])
                    else:
                        # Previously an IndexError: there is no earlier STT
                        # to receive the image, so it is skipped.
                        print("Image skipped: no earlier STT to attach it to")

In [55]:
# Extract the STTs of the EU 2023 catalogue (page indices 7 to 71).
eu_2023 = "../Catalogues/2023-leading-practices-in-smart-tourism.pdf"

# Reference images that must not be attached to any STT. The files are read
# inside `with` blocks so the handles are closed right away.
with open("logo_to_ignore.jpg", "rb") as logo_file:
    logo_to_ignore_EU2023 = logo_file.read()
with open("EU_logo_to_ignore.jpg", "rb") as eu_logo_file:
    eu_to_ignore_EU2023 = eu_logo_file.read()

# Start from an empty list so that re-running the cell does not accumulate entries.
stts_founded = []
with fitz.open(eu_2023) as doc:
    for page_number in range(7, 72):
        print(f"Page number: {page_number}")
        page = doc[page_number]
        for block in page.get_text('dict')['blocks']:
            if "lines" in block:
                for line in block['lines']:
                    for span in line['spans']:
                        # Spans whose text parses as an integer are skipped
                        # (presumably page numbers - confirm against the PDF).
                        try:
                            int(span['text'])
                        except ValueError:
                            analyze_span_EU_2023(span, page_number)
            elif block['type'] == 1:
                # Image block: ignore the known logos.
                if block['image'] in (logo_to_ignore_EU2023, eu_to_ignore_EU2023):
                    continue
                # Image blocks may not appear in reading order, so the owner
                # is looked up by page and vertical position.
                owner = find_correspondent_stt_to_image(block, page_number)
                if owner is None:
                    # Previously an AttributeError on None.
                    print(f"No STT found for image on page {page_number}; image skipped")
                else:
                    owner.set_image(block['image'])


Page number: 7
Page number: 8
New description size: 11.039999961853027, © City of Antalya 
New description size: 11.039999961853027, © City of Gdynia 
Page number: 9
New description size: 11.039999961853027, © City of Gdynia
New description size: 11.039999961853027, © City of Gijón 
Page number: 10
New description size: 11.039999961853027, © City of Pafos 
New description size: 11.039999961853027, © City of San Sebastián 
Page number: 11
New description size: 11.039999961853027, © Visit Cork 
Page number: 12
New description size: 11.039999961853027, © City of Essen 
Page number: 13
Page number: 14
New description size: 11.039999961853027, National Rugby Stadium, © City of Gdynia
Page number: 15
New description size: 9.119999885559082,  
New description size: 11.039999961853027, © Taranto Municipality 
New description size: 11.039999961853027,  
New description size: 9.119999885559082,  
New description size: 9.119999885559082,  
New description size: 9.119999885559082,  
Page number: 1

In [50]:
# Print every extracted STT (via STT.__str__) for a visual check of the result.
for stt in stts_founded:
    print(stt)


STT: Inclusive initiatives
Description: Accessibility is also about inclusion and Aarhus is an open and innovative city with a great diversity of people. Aarhus has a strong LGBTQ+ community and every year, the Aarhus Pride parade celebrates diversity in Aarhus and contributes to the respect for and acceptance of all LGBTQ+ persons. Also, the Aarhus City Council has a strong focus on making Aarhus a better and more attractive city and a better and more attractive workplace for LGBTQ+ persons. As the first city in Denmark, Aarhus in 2019 joined the Rainbow Cities Network, including 33 cities in 17 different countries, actively working to improve conditions for LGBTQ+ citizens. Aarhus also hosts ‘KØN – Gender Museum Denmark’ presenting the cultural history between genders, equality, body and sexuality. A comprehensive policy of the City entitled ‘Aarhus for us all’ is addressing accessibility issues for people with special needs. The policy reflects the fact that the City has a holistic

In [61]:
# Import to Omeka
from OmekaAPI import OmekaAPI
from OmekaAPI import EU_CATALOGUE_2022, EU_CATALOGUE_2023
from OmekaAPI import STT_APPLICATION_TYPE

# NOTE(review): the API key is committed in plain text. It should be rotated
# and loaded from the environment or an untracked config file instead.
omeka = OmekaAPI('https://sttobservatory.omeka.net', '80474f200ef8d2ae09caab85d5b03761435b1318')

# Choose between the EU_2022 and the EU_2023 catalogue / output folder.
# src_pdf_path = "../Catalogues/Catalogue European.pdf"
src_pdf_path = "../Catalogues/2023-leading-practices-in-smart-tourism.pdf"
# stts_folder = "../EU_2022"
stts_folder = "../EU_2023"

# The target collection is the same for every item, so it is resolved once.
collection_id = omeka.get_collection_id_by_name(EU_CATALOGUE_2023)

# The source document and each per-STT document are opened in `with` blocks
# so that they are closed even when an upload fails.
with fitz.open(src_pdf_path) as src_pdf:
    for stt in stts_founded:
        # Entries without a city are not uploaded.
        if stt.city == "":
            continue

        # Build a file name without characters that are unsafe in paths.
        stt_name_without_special_chars = stt.name.replace("/", "").replace("\"", "")
        stt_pdf_name = f"{stts_folder}/{stt.initial_pdf_page}_{stt_name_without_special_chars}.pdf".replace(":", "")

        # Save the pages covered by this STT as a stand-alone PDF.
        with fitz.open() as stt_pdf:
            stt_pdf.insert_pdf(src_pdf, from_page=stt.initial_pdf_page, to_page=stt.final_pdf_page)
            stt_pdf.save(stt_pdf_name)

        item_id = omeka.post_item(
            title=stt.name,
            description=stt.description,
            item_type=STT_APPLICATION_TYPE,
            urls=stt.urls,
            collection=collection_id,
            address=stt.city,
        )

        for index, image in enumerate(stt.images):
            omeka.post_file_for_item(item_id, f"{stt.name}_{index}.jpg", image)

        with open(stt_pdf_name, 'rb') as f:
            omeka.post_file_for_item(item_id, stt_pdf_name, f.read())

        # Geolocation is best effort: an AttributeError (presumably raised when
        # the city cannot be resolved - confirm in OmekaAPI) is reported
        # instead of being silently ignored, and the loop goes on.
        try:
            omeka.post_geolocation_for_item(stt.city, item_id)
        except AttributeError:
            print(f"Geolocation not posted for {stt.name} ({stt.city})")




Post Item for Inclusive initiatives, 355: Created
Post File to Item 355: Created
Post Geolocation to Item 355: Created

Post File to Item 356: Created
Post File to Item 356: Created
Post Geolocation to Item 356: Created

Post Item for Accessible and Safe celebrations, 357: Created
Post File to Item 357: Created
Post Geolocation to Item 357: Created

Post Item for Accessible beaches / Disabled Friendly Beaches, 358: Created
Post File to Item 358: Created
Post File to Item 358: Created
Post Geolocation to Item 358: Created

Post Item for Facilitated beach access for people with disabilities, 359: Created
Post File to Item 359: Created
Post File to Item 359: Created
Post Geolocation to Item 359: Created

Post Item for Accessibility on the Poniente Beach, 360: Created
Post File to Item 360: Created
Post File to Item 360: Created
Post Geolocation to Item 360: Created

Post Item for Making Beaches Accessible, 361: Created
Post File to Item 361: Created
Post File to Item 361: Created
Post Ge