In [7]:
import re
import os
import logging

def _is_english_file(file_path):
    """Return True when the file's base name carries an ``en`` language token."""
    # Inspect only the base name, split on underscores: a parent directory
    # such as "garden_out/" or a word that merely ends in "en" ("carmen_...")
    # must not be mistaken for the language marker.
    tokens = os.path.basename(file_path).lower().split('_')
    # The final token holds the extension and is never followed by "_",
    # so it cannot be the "en_" marker.
    return 'en' in tokens[:-1]


def _strip_boilerplate(content, is_english):
    """Return *content* without the header block and the cookie-banner tail."""
    if is_english:
        # Patterns for English files
        start_pattern = r": https://www\.ulusofona\.pt/en/ : https://www\.filmeu\.eu/"
        mid_pattern = r"https://www\.ulusofona\.pt/assets/images/cinema-logo\.png : https://www\.ulusofona\.pt/en/cinema-fernando-lopes"
        end_pattern = r"Cookie Policy\s+This site uses cookies to offer you a better browsing experience\."
    else:
        # Patterns for Portuguese files
        start_pattern = r": https://www\.ulusofona\.pt/ : https://www\.filmeu\.eu/"
        mid_pattern = r"https://www\.ulusofona\.pt/assets/images/cinema-logo\.png : https://www\.ulusofona\.pt/cinema-fernando-lopes"
        end_pattern = r"Política de Cookies\s+Este website utiliza cookies para lhe proporcionar uma melhor experiência de navegação\."

    flags = re.DOTALL | re.IGNORECASE

    # Non-greedy match: drop everything from the start marker up to and
    # including the nearest following mid marker.
    content = re.sub(f"{start_pattern}.*?{mid_pattern}", "", content, flags=flags)

    # Greedy match with DOTALL: drop the cookie notice and everything after
    # it, through the end of the text.
    content = re.sub(f"{end_pattern}.*", "", content, flags=flags)

    # Remove any leading/trailing whitespace
    return content.strip()


def clean_file(file_path):
    """Clean a markdown file in place by removing the sections between known markers.

    The file is read as UTF-8, the span between the start and mid markers is
    removed, everything from the cookie notice to the end is removed, the
    result is stripped of surrounding whitespace, and the file is overwritten
    with it. A marker that does not occur leaves that part of the text
    untouched.

    The marker set (English or Portuguese) is chosen from the file's base
    name: it is treated as English when ``en`` appears as an
    underscore-delimited token followed by ``_`` (for example
    ``en_teachers_x.md``). Directory names are not considered.

    Args:
        file_path: Path of the markdown file to rewrite.

    Raises:
        OSError: If the file cannot be read or written.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Check if the file is in English based on the file name
    is_english = _is_english_file(file_path)

    content = _strip_boilerplate(content, is_english)

    # Write the cleaned content back to the file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)

    logging.info(
        "Cleaned file: %s (%s)",
        file_path,
        'English' if is_english else 'Portuguese',
    )

def process_directory(directory_path):
    """Clean every markdown file found under a directory tree.

    Walks ``directory_path`` recursively and passes each file whose name ends
    in ``.md`` to ``clean_file``. A failure on one file is logged and the walk
    continues with the next file.

    Args:
        directory_path: Root directory to walk.
    """
    logging.basicConfig(level=logging.INFO)

    for current_dir, _subdirs, filenames in os.walk(directory_path):
        for filename in filenames:
            # Anything that is not markdown is left alone.
            if not filename.endswith(".md"):
                continue
            target = os.path.join(current_dir, filename)
            try:
                clean_file(target)
            except Exception as e:
                # Best effort: one bad file must not abort the whole run.
                logging.error(f"Error processing {filename}: {str(e)}")

# Directory containing the markdown files to clean. The path is relative, so
# it is resolved against the current working directory when this runs.
directory_path = "out"
process_directory(directory_path)

INFO:root:Cleaned file: out/docentes_jacinto-antonio-rosa-godinho-7861.md (Portuguese)
INFO:root:Cleaned file: out/noticias_mesa-redonda-treino-psicologico-resumo.md (Portuguese)
INFO:root:Cleaned file: out/en_teachers_hugo-fernando-azevedo-barbosa-3989.md (English)
INFO:root:Cleaned file: out/en_porto_bachelor_energy-systems-electrotechnical-engineering_ULP732-10369.md (English)
INFO:root:Cleaned file: out/lisboa_licenciaturas_videojogos-e-aplicacoes-multimedia_ULP2533-25166.md (Portuguese)
INFO:root:Cleaned file: out/evento_amor-violento.md (Portuguese)
INFO:root:Cleaned file: out/en_lisboa_masters_clinical-imaging-in-companion-animals.md (English)
INFO:root:Cleaned file: out/en_lisboa_integrated-masters_architecture_ULHT36-12630.md (English)
INFO:root:Cleaned file: out/en_porto_bachelor_communication-science-and-culture_ULP451-14506.md (English)
INFO:root:Cleaned file: out/en_porto_bachelor_performing-arts-actor-training_ULP1977-10131.md (English)
INFO:root:Cleaned file: out/docente