In [5]:
import re
import fitz


def check_stt_name_conditions(span_dict):
    """Tell whether a text span is formatted like an STT title.

    A span qualifies when its font is "OpenSans-Bold", its size is 10.5,
    its colour value is 0 and its text contains no "<digit>.<digit>"
    sequence.

    Args:
        span_dict: Span dictionary with 'font', 'size', 'color' and 'text'
            keys.

    Returns:
        True when every condition holds, False otherwise.
    """
    has_title_format = (
        span_dict['font'] == "OpenSans-Bold"
        and span_dict['size'] == 10.5
        and span_dict['color'] == 0
    )
    if not has_title_format:
        return False
    # Text such as "1.2" disqualifies the span even if the format matches.
    return re.search(r'\d\.\d', span_dict['text']) is None

def check_stt_description_conditions(span_dict):
    """Tell whether a text span is formatted like STT description text.

    A span qualifies when its font is "OpenSans", its size is 10.5 or 10.0
    and its colour value is 0.

    Args:
        span_dict: Span dictionary with 'font', 'size' and 'color' keys.

    Returns:
        True when every condition holds, False otherwise.
    """
    if span_dict['font'] != "OpenSans":
        return False
    if span_dict['size'] not in (10.5, 10.0):
        return False
    return span_dict['color'] == 0

def check_stt_url_conditions(span_dict):
    """Tell whether a text span is formatted like an STT URL.

    A span qualifies when its font is "OpenSans-Light", its size is 10.5
    and its colour value is 1393044.

    Args:
        span_dict: Span dictionary with 'font', 'size' and 'color' keys.

    Returns:
        True when every condition holds, False otherwise.
    """
    required = (
        ('font', "OpenSans-Light"),
        ('size', 10.5),
        ('color', 1393044),
    )
    # all() stops at the first mismatch, so later keys are not looked up.
    return all(span_dict[key] == wanted for key, wanted in required)

class STT:
    """One catalogue entry assembled incrementally from PDF text spans.

    Attributes:
        name: Title text (the part before the first ";" or "," separator).
        city: City text taken from the title, or "" when none was found.
        description: Accumulated description text.
        urls: URLs collected for the entry.
        images: Image payloads attached to the entry.
        y: Vertical coordinate (bbox[1]) of the first title span.
        initial_pdf_page: Page index on which the entry starts.
        final_pdf_page: Page index of the latest description/URL span.
    """

    def __init__(self, title, bbox, initial_pdf_page):
        """Create an entry from the first span of its title.

        Args:
            title: Text of the title span. When it contains ";" (checked
                first) or ",", the text before the first separator becomes
                the name and the text after the last separator, minus any
                ":", becomes the city.
            bbox: Bounding box of the span; only ``bbox[1]`` is kept.
            initial_pdf_page: Page index where the title was found.
        """
        parts = []
        if ";" in title:
            parts = title.split(";")
        elif "," in title:
            parts = title.split(",")

        if len(parts) > 1:
            self.name = parts[0]
            # strip() rather than dropping exactly one leading character:
            # that cut the first letter when no space followed the
            # separator and left trailing whitespace in the city.
            self.city = parts[-1].replace(":", "").strip()
        else:
            self.name = title
            self.city = ""
        self.description = ""
        self.urls = []
        self.images = []
        self.y = float(bbox[1])
        self.initial_pdf_page = initial_pdf_page
        self.final_pdf_page = initial_pdf_page

    def set_description(self, description, current_pdf_page):
        """Append a description span and record the page it came from.

        A newline is inserted first when the existing description ends with
        "." and the new text does not start with a space. An empty
        ``description`` adds no text but still updates the final page.
        """
        starts_new_paragraph = (
            self.description != ""
            and description != ""  # guard: description[0] needs a character
            and description[0] != " "
            and self.description[-1] == "."
        )
        if starts_new_paragraph:
            self.description += "\n"
        self.description += description
        self.set_final_pdf_page(current_pdf_page)

    def set_url(self, url, current_pdf_page):
        """Add a URL span and record the page it came from.

        Text starting with "http" or "www" opens a new URL; anything else
        is treated as the continuation of the previous URL. A continuation
        that arrives before any URL is stored as a new entry instead of
        raising IndexError.
        """
        if url.startswith(("http", "www")) or not self.urls:
            self.urls.append(url)
        else:
            self.urls[-1] += url
        self.set_final_pdf_page(current_pdf_page)

    def set_image(self, image_data):
        """Attach one image payload to this entry."""
        self.images.append(image_data)

    def set_final_pdf_page(self, final_pdf_page):
        """Record the page index on which this entry currently ends."""
        self.final_pdf_page = final_pdf_page

    def __str__(self):
        """Return a multi-line, human-readable summary of the entry."""
        return f"\nSTT: {self.name}\nDescription: {self.description}\nURL: {self.urls}\nNumber of images: {len(self.images)}\nCity: {self.city}\nInitial page:{self.initial_pdf_page}\nFinal page:{self.final_pdf_page}"

# Every STT detected so far, in the order the title spans were encountered.
stts_founded = []


def append_stt(span_dict, pag_number):
    """Start a new STT from a title span and add it to ``stts_founded``.

    Args:
        span_dict: Span dictionary; its 'text' and 'bbox' values are passed
            to the STT constructor.
        pag_number: Page index on which the span was found.
    """
    title_text = span_dict['text']
    title_bbox = span_dict['bbox']
    new_stt = STT(title_text, title_bbox, pag_number)
    stts_founded.append(new_stt)


def span_is_title(span_dict, pag_number):
    """Handle a span that is formatted like an STT title.

    A title may be split across several spans. While the most recent STT
    still has an empty description, further title spans are merged into it;
    otherwise the span starts a new STT.

    Args:
        span_dict: Span dictionary with 'text' and 'bbox' keys.
        pag_number: Page index on which the span was found.
    """
    text = span_dict['text']

    # No STT yet, or the latest one already has a description: new title.
    if not stts_founded or stts_founded[-1].description != "":
        append_stt(span_dict, pag_number)
        return

    current = stts_founded[-1]
    if (":" in text and ";" in text
            and float(span_dict['bbox'][1]) - current.y > 16.00):
        # Handles the Venice case on page 12: consecutive lines are roughly
        # 15.00 apart, so a larger gap means this span is a complete title
        # of its own and the pending STT is replaced by it.
        stts_founded.pop()
        append_stt(span_dict, pag_number)
    elif ":" in text:
        # This span carries the city; ";" takes precedence over ",".
        separator = next((sep for sep in (";", ",") if sep in text), None)
        if separator is not None:
            pieces = text.split(separator)
            current.name += pieces[0]
            city_text = pieces[1]
        else:
            # The whole span is the city; drop the separator left in the name.
            current.name = current.name.replace(";", "")
            city_text = text
        # strip() keeps the inner spaces of multi-word city names, which
        # removing every space used to delete.
        current.city = city_text.replace(":", "").strip()
    else:
        current.name += text
    
def analyze_span(span_dict, pag_number):
    """Route one text span to the matching handler based on its formatting.

    Title spans go to ``span_is_title``. Description and URL spans are
    added to the most recent STT. Any other span is ignored.

    Args:
        span_dict: Span dictionary from the PDF text extraction.
        pag_number: Page index on which the span was found.
    """
    if check_stt_name_conditions(span_dict):
        span_is_title(span_dict, pag_number)
        return

    if not stts_founded:
        # Nothing to attach to yet, e.g. the tail of a description whose
        # STT ends on the first scanned page.
        return

    latest = stts_founded[-1]
    span_text = span_dict['text']
    if check_stt_description_conditions(span_dict):
        latest.set_description(span_text, pag_number)
        return

    if check_stt_url_conditions(span_dict):
        # URL text is kept in the description and in the URL list.
        latest.set_description(span_text, pag_number)
        latest.set_url(span_text, pag_number)

# Scan the catalogue and fill stts_founded from its text spans and images.
with fitz.open("../Catalogues/Catalogue European.pdf") as doc:
    # Only this range of page indices is scanned.
    for page_number in range(45, 88):
        page = doc[page_number]
        for block in page.get_text('dict')['blocks']:
            if "lines" in block:
                for line in block['lines']:
                    for span in line['spans']:
                        analyze_span(span, page_number)
            elif block['type'] == 1:
                # Image blocks do not always appear in reading order: an
                # image located above the latest title is given to the
                # previous STT instead.
                image_top = block['bbox'][1]
                if stts_founded and stts_founded[-1].y < image_top:
                    stts_founded[-1].set_image(block['image'])
                elif len(stts_founded) >= 2:
                    stts_founded[-2].set_image(block['image'])
                # Otherwise there is no earlier STT to own the image; it is
                # skipped instead of raising IndexError on stts_founded[-2].

# TODO: try loading whole pages at once.

for stt in stts_founded:
    print(stt)


STT: Sustainable choices for a sustainable stay
Description: With a humorous approach, the “Sustainable Copenhagen” campaign seeks to motivate Copenhagen’s visitors to take care of the city and make sustainable choices during their stay. The campaign consists of 10 simple tips and tricks on how to make ‘little changes for a big impact’. The campaign encourages visitors to travel by bike or to use public transport, drink tap water and reuse water bottles, eat locally, shop sustainably and get off the beaten track.
Find out more at: https://www.wonderfulcopenhagen.com/convention-bureau/copenhagen/sustainable-copenhagen 
URL: ['https://www.wonderfulcopenhagen.com/convention-bureau/copenhagen/sustainable-copenhagen']
Number of images: 0
City: Copenhagen 
Initial page:45
Final page:45

STT: Horta à Porta for a better quality of life
Description: Promoting the population’s quality of life through good agricultural practices is what the initiative Horta à Porta aims for. This project does no

In [6]:
# Import to Omeka
from OmekaAPI import OmekaAPI
from OmekaAPI import EU_CATALOGUE_2022
from OmekaAPI import STT_APPLICATION_TYPE

# NOTE(review): the API key is hard-coded in the notebook; consider loading
# it from the environment or a config file and rotating this key.
omeka = OmekaAPI('https://sttobservatory.omeka.net', '80474f200ef8d2ae09caab85d5b03761435b1318')
stts_folder = "../EU_2022"

# The target collection is the same for every item, so it is looked up once.
collection_id = omeka.get_collection_id_by_name(EU_CATALOGUE_2022)

# Both the source catalogue and each per-STT PDF are opened with `with`, so
# the documents are closed even if an upload fails part-way through.
with fitz.open("../Catalogues/Catalogue European.pdf") as src_pdf:
    for stt in stts_founded:
        # Entries without a city are not uploaded.
        if stt.city == "":
            continue

        stt_pdf_name = f"{stts_folder}/{stt.initial_pdf_page}_{stt.name}.pdf".replace(":","")

        # Save the pages covered by this STT as a standalone PDF.
        with fitz.open() as stt_pdf:
            stt_pdf.insert_pdf(src_pdf, from_page=stt.initial_pdf_page, to_page=stt.final_pdf_page)
            stt_pdf.save(stt_pdf_name)

        item_id = omeka.post_item(
            title=stt.name,
            description=stt.description,
            item_type=STT_APPLICATION_TYPE,
            urls=stt.urls,
            collection=collection_id,
            address=stt.city
        )

        for index, image in enumerate(stt.images):
            omeka.post_file_for_item(item_id, f"{stt.name}_{index}.jpg", image)

        with open(stt_pdf_name, 'rb') as f:
            omeka.post_file_for_item(item_id, stt_pdf_name, f.read())

        try:
            omeka.post_geolocation_for_item(stt.city, item_id)
        except AttributeError:
            # Geolocation stays best-effort, but the skipped item is now
            # reported instead of being dropped silently.
            # NOTE(review): presumably raised when the city cannot be
            # geocoded - confirm against OmekaAPI.
            print(f"Geolocation skipped for item {item_id} ({stt.city})")



cannot create /Annot for kind: 4
cannot create /Annot for kind: 4

Post Item for Sustainable choices for a sustainable stay, 289: Created
Post File to Item 289: Created
Post Geolocation to Item 289: Created
cannot create /Annot for kind: 4
cannot create /Annot for kind: 4

Post Item for Horta à Porta for a better quality of life, 290: Created
Post File to Item 290: Created
Post Geolocation to Item 290: Created
cannot create /Annot for kind: 4
cannot create /Annot for kind: 4
cannot create /Annot for kind: 4
cannot create /Annot for kind: 4

Post Item for A sustainable guide to the city, 291: Created
Post File to Item 291: Created
Post File to Item 291: Created
Post Geolocation to Item 291: Created
cannot create /Annot for kind: 4
cannot create /Annot for kind: 4

Post Item for Zero waste shopping policy by ‘Zelena Japka’, 292: Created
Post File to Item 292: Created
Post Geolocation to Item 292: Created
cannot create /Annot for kind: 4
cannot create /Annot for kind: 4

Post Item for Loc