In [4]:
import re
import os
import logging

def clean_file(file_path):
    """Remove two unwanted sections from a text file and rewrite it in place.

    The file is read as UTF-8. Every span that starts at the site-link
    marker and ends at the cinema-logo link is removed (shortest match),
    then everything from the Portuguese cookie-policy notice to the end of
    the text is removed. The remaining text is stripped of leading/trailing
    whitespace, written back to ``file_path``, and an INFO record is logged.

    Args:
        file_path: Path of the file to clean; it is overwritten.
    """
    # Marker that opens the first unwanted section
    start_pattern = r": https://www\.ulusofona\.pt/ : https://www\.filmeu\.eu/"
    # Marker that closes the first unwanted section
    mid_pattern = r"https://www\.ulusofona\.pt/assets/images/cinema-logo\.png : https://www\.ulusofona\.pt/cinema-fernando-lopes"
    # Marker that opens the second unwanted section
    end_pattern = r"Política de Cookies\s+Este website utiliza cookies para lhe proporcionar uma melhor experiência de navegação\."

    # DOTALL lets "." cross line breaks, so each section may span many lines.
    first_section = re.compile(f"{start_pattern}.*?{mid_pattern}", re.DOTALL)
    tail_section = re.compile(f"{end_pattern}.*", re.DOTALL)

    with open(file_path, encoding='utf-8') as source:
        raw_text = source.read()

    without_first = first_section.sub("", raw_text)
    cleaned_text = tail_section.sub("", without_first).strip()

    with open(file_path, 'w', encoding='utf-8') as target:
        target.write(cleaned_text)

    logging.info(f"Cleaned file: {file_path}")

def process_directory(directory_path):
    """Run ``clean_file`` on every ``.md`` file found under ``directory_path``.

    The tree is traversed with ``os.walk``, so files in nested folders are
    included. An exception raised while handling one file is logged at
    ERROR level and the traversal continues with the next file.

    Args:
        directory_path: Root directory to walk.
    """
    logging.basicConfig(level=logging.INFO)

    for folder, _subdirs, names in os.walk(directory_path):
        markdown_names = [name for name in names if name.endswith(".md")]
        for name in markdown_names:
            try:
                clean_file(os.path.join(folder, name))
            except Exception as err:
                logging.error(f"Error processing {name}: {str(err)}")

# Root directory to process: process_directory walks it and cleans each .md file in place
directory_path = "out"
process_directory(directory_path)

INFO:root:Cleaned file: out/lisboa_licenciaturas_artes-dramaticas-formacao-de-atores_ULP1977-15443.md
INFO:root:Cleaned file: out/lisboa_licenciaturas_psicologia_ULHT35-15404.md
INFO:root:Cleaned file: out/lisboa_mestrados_comunicacao-marketing-e-media-digitais_docentes.md
INFO:root:Cleaned file: out/en_lisboa_masters_film-studies.md
INFO:root:Cleaned file: out/lisboa_mestrados_protecao-civil_ULP2600-17010.md
INFO:root:Cleaned file: out/lisboa_licenciaturas_informatica-de-gestao_docentes.md


In [6]:
import re
import os
import logging

def clean_file(file_path):
    """Remove two unwanted sections from a text file and rewrite it in place.

    The language is chosen from the file's base name: a name starting with
    ``en_`` (case-insensitive) selects the English patterns, anything else
    the Portuguese ones. Every span from the site-link marker through the
    cinema-logo link is removed (shortest match), then everything from the
    cookie-policy notice to the end of the text is removed. The remaining
    text is stripped of leading/trailing whitespace, written back to
    ``file_path`` as UTF-8, and an INFO record naming the detected language
    is logged.

    Args:
        file_path: Path of the file to clean; it is overwritten.

    Exceptions raised while opening, reading or writing the file are not
    caught here and propagate to the caller.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Decide the language from the file name only. A substring test on the
    # whole path would misclassify Portuguese files whose name merely
    # contains "en_" (e.g. "open_day.md") or that sit under a directory
    # containing "en_", and those files would then be left uncleaned.
    is_english = os.path.basename(file_path).lower().startswith('en_')

    if is_english:
        # Patterns for English files
        start_pattern = r": https://www\.ulusofona\.pt/en/ : https://www\.filmeu\.eu/"
        mid_pattern = r"https://www\.ulusofona\.pt/assets/images/cinema-logo\.png : https://www\.ulusofona\.pt/en/cinema-fernando-lopes"
        end_pattern = r"Cookie Policy\s+This site uses cookies to offer you a better browsing experience\."
    else:
        # Patterns for Portuguese files
        start_pattern = r": https://www\.ulusofona\.pt/ : https://www\.filmeu\.eu/"
        mid_pattern = r"https://www\.ulusofona\.pt/assets/images/cinema-logo\.png : https://www\.ulusofona\.pt/cinema-fernando-lopes"
        end_pattern = r"Política de Cookies\s+Este website utiliza cookies para lhe proporcionar uma melhor experiência de navegação\."

    # DOTALL lets "." cross line breaks; IGNORECASE tolerates case differences.
    flags = re.DOTALL | re.IGNORECASE

    # Remove the first unwanted section (shortest span between the markers)
    content = re.sub(f"{start_pattern}.*?{mid_pattern}", "", content, flags=flags)

    # Remove the second unwanted section through the end of the file
    content = re.sub(f"{end_pattern}.*", "", content, flags=flags)

    # Remove any leading/trailing whitespace
    content = content.strip()

    # Write the cleaned content back to the file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)

    logging.info(f"Cleaned file: {file_path} ({'English' if is_english else 'Portuguese'})")

def process_directory(directory_path):
    """Clean every ``.md`` file located anywhere beneath ``directory_path``.

    Uses ``os.walk`` to visit the directory tree and hands each matching
    file to ``clean_file``. A failure on one file is reported through
    ``logging.error`` and does not stop the remaining files from being
    processed.

    Args:
        directory_path: Root directory to walk.
    """
    logging.basicConfig(level=logging.INFO)

    for current_dir, _child_dirs, entries in os.walk(directory_path):
        for entry in entries:
            # Only Markdown files are cleaned; everything else is skipped.
            if not entry.endswith(".md"):
                continue
            try:
                target_path = os.path.join(current_dir, entry)
                clean_file(target_path)
            except Exception as problem:
                logging.error(f"Error processing {entry}: {str(problem)}")

# Root directory to process: process_directory walks it and cleans each .md file in place
directory_path = "outt"
process_directory(directory_path)

INFO:root:Cleaned file: outt/en_lisboa_masters_sound-production-and-technology_ULHT2722-17419.md (English)
INFO:root:Cleaned file: outt/docentes_daniel-dos-santos-cardoso-3742.md (Portuguese)
INFO:root:Cleaned file: outt/en_porto_bachelor_management.md (English)
INFO:root:Cleaned file: outt/en_lisboa_integrated-masters_pharmaceutical-sciences_ULHT477-17207.md (English)
INFO:root:Cleaned file: outt/porto_licenciaturas_design-de-comunicacao_ULP729-3495.md (Portuguese)
INFO:root:Cleaned file: outt/candidaturas.md (Portuguese)
INFO:root:Cleaned file: outt/lisboa_erasmus-mundus_cyber-cyberspace-behavior-and-e-therapy-european-joint-master-degree-erasmus-mundus_ULHT6441-23806.md (Portuguese)
INFO:root:Cleaned file: outt/lisboa_erasmus-mundus_cyber-cyberspace-behavior-and-e-therapy-european-joint-master-degree-erasmus-mundus_ULHT6441-24519.md (Portuguese)
INFO:root:Cleaned file: outt/en_porto_masters_neuropsicologia-clinica_ULP6819-25467.md (English)
INFO:root:Cleaned file: outt/porto_licenci