In [3]:
import re
import unicodedata

from datasets import load_from_disk, load_dataset, concatenate_datasets

# When True, the pipeline at the bottom of this cell loads the dataset with
# load_dataset, filters and cleans it, and saves the result to dataset_filepath.
load_from_hub = True

# Identifier passed to load_dataset.
dataset_id = "clips/mqa"
# Directory the processed dataset is written to by save_to_disk.
dataset_filepath = "data/clips_mqa/pt"

# Rows whose "domain" value appears in this list are filtered out.
blacklist_domains = [
    "ti-enxame.com",
]

# Literal, case-sensitive substrings: a row is dropped when its question
# ("name"), its "text" or its first answer contains any of them.
prohibited_terms = ["href", "https", "www.", ".html", "volumen caps"]


def format_dataset(row):
    """Flatten a raw row into ``id``, ``question``, ``domain`` and ``answer``.

    The question comes from ``row["name"]`` and the answer is the text of the
    first entry in ``row["answers"]``.
    """
    # (output key, source key) pairs, in the order the output dict is built.
    renames = (("id", "id"), ("question", "name"), ("domain", "domain"))
    formatted = {target: row[source] for target, source in renames}
    formatted["answer"] = row["answers"][0]["text"]
    return formatted

def contains_prohibited_term_regex(text):
    """Return True when ``text`` contains any entry of ``prohibited_terms``.

    Each term is escaped before being joined into the pattern, so terms are
    matched as literal, case-sensitive substrings.
    """
    escaped_terms = [re.escape(term) for term in prohibited_terms]
    matcher = re.compile("|".join(escaped_terms))
    return matcher.search(text) is not None

def remove_links(text):
    """Strip every ``http://`` / ``https://`` URL from ``text``.

    A URL ends at the first character outside the accepted set (for example
    whitespace); the text around it is left as is.
    """
    link_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return link_pattern.sub('', text)

def remove_non_alphanumeric(text):
    """Drop every character that is not a word character, whitespace or basic
    punctuation (period, comma, !, ?, ;, :, single/double quote, hyphen).
    """
    disallowed = re.compile(r'[^\w\s.,!?;:\'\"-]')
    return disallowed.sub('', text)

def remove_long_words(text, max_length=15):
    """Delete every run of non-whitespace characters longer than ``max_length``.

    The whitespace surrounding a removed run is left untouched.
    """
    # Match whitespace-free runs of at least max_length + 1 characters.
    too_long = re.compile(rf'\S{{{max_length + 1},}}')
    return too_long.sub('', text)

def check_if_question_is_big_enough(text):
    """Tell whether ``text`` is long enough to be kept as a question.

    More than 30 characters is always accepted, fewer than 20 is always
    rejected, and anything in between is accepted only when a "?" appears
    among its last three characters.
    """
    length = len(text)
    if length > 30:
        return True
    return length >= 20 and "?" in text[-3:]


def format_text_fields(text):
    """Normalize and clean one free-text field.

    Steps, in order: NFKD normalization, URL removal, removal of overlong
    whitespace-free runs, removal of characters outside the allowed set,
    removal of a few leftover fragments, and trimming of surrounding
    whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # NOTE(review): NFKD splits accented letters into a base letter plus a
    # combining mark, and the character filter below drops combining marks,
    # so diacritics are lost. Confirm this is intended for Portuguese text.
    text = unicodedata.normalize("NFKD", text)
    # Links must be removed before the character filter: that filter strips
    # "/", after which "http://" can no longer match and short URLs would
    # survive as fragments such as "http:a.co".
    text = remove_links(text)
    text = remove_long_words(text)
    text = remove_non_alphanumeric(text)
    text = text.replace("--", "")
    text = text.replace("\n", "")
    # NOTE(review): "[", "]" and "(" are already gone after the character
    # filter, so the next two replacements currently match nothing.
    text = text.replace("](", "")
    text = text.replace(" [", "")
    return text.strip()

def format_text(row):
    """Return cleaned versions of the row's text, question and answer fields."""
    cleaned_fields = ("text", "question", "answer")
    return {field: format_text_fields(row[field]) for field in cleaned_fields}


def _is_usable_row(row):
    """Decide whether a raw row is kept.

    A row is kept when its first answer exists, is accepted and has non-empty
    text, its domain is not blacklisted, its question ("name") passes the
    length rule, and none of first answer / question / text contains a
    prohibited term.
    """
    answers = row["answers"]
    if not answers:
        # Guard the [0] lookup below: a row without answers is dropped
        # instead of raising IndexError.
        return False
    first_answer = answers[0]
    answer_text = first_answer["text"]
    if not first_answer["is_accepted"] or not answer_text:
        return False
    if row["domain"] in blacklist_domains:
        return False
    if not check_if_question_is_big_enough(row["name"]):
        return False
    # The regex checks are the most expensive, so they run last.
    return not any(
        contains_prohibited_term_regex(field)
        for field in (answer_text, row["name"], row["text"])
    )


if load_from_hub:
    raw_dataset = load_dataset(dataset_id, language="pt", trust_remote_code=True)["train"]
    # One pass with a short-circuiting predicate instead of one full pass
    # over the dataset per condition.
    raw_dataset = raw_dataset.filter(_is_usable_row)
    raw_dataset = raw_dataset.map(format_dataset)
    dataset = raw_dataset.map(format_text).select_columns(
        ['id', 'bucket', 'domain', 'text', 'question', 'answer']
    )
    # Cleaning can shorten the question, so the length rule is applied again.
    dataset = dataset.filter(lambda row: check_if_question_is_big_enough(row["question"]))
    dataset.save_to_disk(dataset_filepath)

In [4]:
1

1