In [8]:
import re
import unicodedata

from datasets import load_from_disk, load_dataset, concatenate_datasets

load_from_hub = True

dataset_id = "clips/mqa"
dataset_filepath = "data/clips_mqa/pt"

blacklist_domains = [
    "ti-enxame.com",
]

prohibited_terms = ["href", "https", "www.", ".html", "volumen caps"]


def format_dataset(row):
    return {
        "id": row["id"],
        "question": row["name"],
        "domain": row["domain"],
        "answer": row["answers"][0]["text"],
    }

def contains_prohibited_term_regex(text):
    pattern = re.compile("|".join(map(re.escape, prohibited_terms)))
    return bool(pattern.search(text))

def remove_links(text):
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.sub(url_regex, '', text)

def remove_non_alphanumeric(text):
    return re.sub(r'[^\w\s.,!?;:\'\"-]', '', text)

def remove_long_words(text, max_length=15):
    # Expressão regular para encontrar substrings com mais de 15 caracteres consecutivos
    pattern = r'\S{' + str(max_length + 1) + r',}'
    # Substituir as substrings que correspondem ao padrão por uma string vazia
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text

def check_if_question_is_big_enough(text):
    if len(text) < 20:
        return False
    elif 20 <= len(text) <= 30:
        if "?" in text[-3:]:
            return True
        return False
    return True


def format_text_fields(text):
    text = unicodedata.normalize("NFKD", text)
    text = remove_long_words(text)
    text = remove_non_alphanumeric(text)
    text = remove_links(text)
    text = text.replace("--", "")
    text = text.replace("\n", "")
    text = text.replace("](", "")
    text = text.replace(" [", "")
    text = text.strip()
    return text

def format_text(row):
    return {
        "text": format_text_fields(row["text"]),
        "question": format_text_fields(row["question"]),
        "answer": format_text_fields(row["answer"]),
    }

def valid_domain(text):
    if (text[-3:] == ".br") or (text[-4:] in [".com", ".net", ".org"]):
        return True
    return False


def contains_prohibited_term_regex(text):
    blacklist = [
        "mundosugar.com.br",
        "aposta",
        "apuesta",
        "sex",
        "porn",
        "penis",
        "vagi",
        "turba",
        "sensual",
    ]
    pattern = re.compile("|".join(map(re.escape, blacklist)))
    return bool(pattern.search(text))

if load_from_hub:
    raw_dataset = load_dataset(dataset_id, language="pt", trust_remote_code=True)["train"]
    raw_dataset = raw_dataset.filter(lambda row: row["answers"][0]["is_accepted"] == True)
    raw_dataset = raw_dataset.filter(lambda row: row["domain"] not in blacklist_domains)
    raw_dataset = raw_dataset.filter(lambda row: not contains_prohibited_term_regex(row["answers"][0]["text"]) )
    raw_dataset = raw_dataset.filter(lambda row: not contains_prohibited_term_regex(row["name"]) )
    raw_dataset = raw_dataset.filter(lambda row: not contains_prohibited_term_regex(row["text"]) )
    raw_dataset = raw_dataset.filter(lambda row: check_if_question_is_big_enough(row["name"]) )
    raw_dataset = raw_dataset.filter(lambda row: len(row["answers"][0]["text"]) > 0)
    raw_dataset = raw_dataset.map(format_dataset)
    dataset = raw_dataset.map(format_text).select_columns(['id', 'bucket', 'domain', 'text', 'question', 'answer'])
    dataset = dataset.filter(lambda row: check_if_question_is_big_enough(row["question"]) )
    dataset = dataset.filter(lambda row: valid_domain(row["domain"]) )
    dataset = dataset.filter(lambda row: not contains_prohibited_term_regex(row["domain"]) )
    dataset.save_to_disk(dataset_filepath)

Filter:   0%|          | 0/5961948 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5836486 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5833940 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5833342 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5626949 [00:00<?, ? examples/s]

Map:   0%|          | 0/5626931 [00:00<?, ? examples/s]

Map:   0%|          | 0/5626931 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5626931 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5536488 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4193948 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/4140677 [00:00<?, ? examples/s]

In [9]:
ds = load_from_disk(dataset_filepath)

In [22]:
df = ds.to_pandas()
df.sample(4)

Unnamed: 0,id,bucket,domain,text,question,answer
239057,8522946988f4d8a684bb21ae92329ec1,2020.4,daazcavernas.com,,como conseguir descontos na targus?,o daazcavernas disponibiliza diversos desconto...
1635583,82ce25d83172bf64e1ee95ff838b3421,2021.21,casamentos.com.br,,Quais ensaios e servicos que Sonimag oferece?,"Sonimag oferece os seguintes tipos de ensaios,..."
1134153,4e8a557cb7d896890d891766d33fe5aa,2021.04,tudosobreprodutos.com.br,,"qual a marca do produto ""whey reforce body siz...","galgrin, integral medica, integral medica, in..."
267888,3567a3627948775daf2c269fd55af633,2020.4,vrbo.com,,Quais sao melhores areas para se hospedar em ...,A Vrbo Brasil oferece diversas opcoes de alugu...
