In [1]:
from PyPDF2 import PdfReader
from googletrans import Translator
import re
import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import string
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('portuguese'))
translator = Translator(service_urls=['translate.googleapis.com'])

[nltk_data] Downloading package punkt to /home/eduardo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eduardo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import spacy
from spacy.lang.pt.examples import sentences

nlp = spacy.load("pt_core_news_sm")

In [3]:
import requests


def translate_long_text(text, dest='en', chunk_size=4500):
    """
    Translates text using Google Translate API directly
    """
    translated = []

    for start in range(0, len(text), chunk_size):
        piece = text[start:start + chunk_size]

        url = "https://translate.googleapis.com/translate_a/single"
        params = {
            "client": "gtx",
            "sl": "auto",
            "tl": dest,
            "dt": "t",
            "q": piece
        }

        response = requests.get(url, params=params)
        if response.status_code == 200:
            translations = response.json()[0]
            translated_piece = ''.join([t[0] for t in translations if t[0]])
            translated.append(translated_piece)

    return ' '.join(translated)


In [6]:
def clean_text(text):
    '''
    Remove: references style[1], \n
    :param text:
    :return:
    '''
    pattern = r'\d*https?://\S+|[\w.+-]+@[\w-]+\.[\w.-]+'
    text = text.replace("\u200b", "")
    text = text.replace('- ', '')
    text = text.replace(' -', '')
    text = text.replace('\n', '')
    text = re.sub(r'([A-Za-z])-\s+([A-Za-z])', r'\1\2', text)
    text = re.sub(r'\[[^\]]+\]', '', text)
    text = re.sub(r'\d*https?://\S+', '', text)
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(pattern, '', text)
    text = re.sub(r'(?:REFER[EÊ]NCIAS?|REFERENCES).*$', '', text, flags=re.DOTALL).strip()
    # text = re.sub(r'\d+', '', text)
    return text

In [7]:
import os
import pickle


def process_pdfs(input_dir: str, out_dir: str):
    for fname in os.listdir(input_dir):
        if not fname.lower().endswith(".pdf"):
            continue

        path = os.path.join(input_dir, fname)
        print(f"Processing: {path}")

        name = path.split('/')[-1].replace('.pdf', '')

        reader = PdfReader(path)
        full_text = []
        for page in reader.pages:
            full_text.append(page.extract_text())

        full_text = ' '.join(full_text)

        full_text = clean_text(full_text)

        # full_text = translate_long_text(full_text, dest='pt')
        # #
        # full_text = clean_text(full_text)

        with open(f"{out_dir}/{name}.pkl", "wb") as f:
            pickle.dump(full_text, f)

In [10]:
path_in = '../articles/original/portuguese'
path_out = '../articles/processed'
process_pdfs(path_in, path_out)

Processing: ../articles/original/portuguese/985-24772-1-10-20240923.pdf
Processing: ../articles/original/portuguese/985-24774-1-10-20240923.pdf
Processing: ../articles/original/portuguese/985-24763-2-10-20241004.pdf
Processing: ../articles/original/portuguese/985-24749-1-10-20240923.pdf
Processing: ../articles/original/portuguese/985-24773-1-10-20240923.pdf
Processing: ../articles/original/portuguese/985-24757-1-10-20240923.pdf
Processing: ../articles/original/portuguese/985-24775-1-10-20240923.pdf
Processing: ../articles/original/portuguese/985-24753-2-10-20241004.pdf
Processing: ../articles/original/portuguese/985-24761-1-10-20240923.pdf
Processing: ../articles/original/portuguese/985-24748-1-10-20240923.pdf
Processing: ../articles/original/portuguese/985-24770-1-10-20240923.pdf
Processing: ../articles/original/portuguese/985-24771-2-10-20240925.pdf
Processing: ../articles/original/portuguese/985-24743-1-10-20240923.pdf
Processing: ../articles/original/portuguese/985-24742-1-10-20240

In [11]:
with open(f"{path_out}/985-24760-1-10-20240923.pkl", "rb") as f:
    full_text = pickle.load(f)
full_text

full_text = re.sub(r'REFERÊNCIAS?.*$', '', full_text, flags=re.DOTALL).strip()
full_text

'Jogos Digitais Sérios usados para o Exercício de Habilidades doPensamento Computacional em Criançascom Transtorno do Espectro AutistaKatherin Felipa Carhuaz MalpartidaInstituto de Ciências Matemáticas e de ComputaçãoUniversidade de São Paulo (USP)São Carlos, São Paulo,  Rios da Hora RodriguesInstituto de Ciências Matemáticas e de ComputaçãoUniversidade de São Paulo (USP)São Carlos, São Paulo,  Thinking (CT) is a reasoning process focused onsolving problems, promoting the development of cognitive skills. Se-rious games can contribute to exercise these skills, as they are play-ful and adaptable tools for different audiences. This paper presentsthe design and evaluation of a medium-fidelity serious digital gameprototype designed to help improve the cognitive skills of childrenwith Autism Spectrum Disorder (ASD), based on the fundamentalprinciples of PC. The first phase of the game design was completedand evaluated by Special Education professionals from a partnerinstitution. The observat

In [12]:
with open(f"{path_out}/985-24763-2-10-20241004.pkl", "rb") as f:
    full_text = pickle.load(f)
full_text

full_text = re.sub(r'(?:REFER[EÊ]NCIAS?|REFERENCES).*$', '', full_text, flags=re.DOTALL).strip()
full_text


'O Impacto de Estratégias de Embeddings de Grafos naExplicabilidade de Sistemas de RecomendaçãoAndré Levi  de São PauloSão Carlos, São Paulo, BrasilLeonardo  Federal de São Joãodel-ReiSão João del Rei, Minas Gerais, BrasilMarcelo Garcia  de São PauloSão Carlos, São Paulo, BrasilABSTRACTExplanations in recommender systems are essential in improvingtrust, transparency, and persuasion. Recently, using KnowledgeGraphs (KG) to generate explanations gained attention due to thesemantic representation of information in which items and their at-tributes are represented as nodes, connected by edges, representingconnections among them. Model-agnostic KG explainable algo-rithms can be based on syntactic approaches or graph embeddings.The impact of graph embedding strategies in generating meaning-ful explanations still needs to be studied in the literature. To fill thisgap, in this work, we evaluate the quality of explanations providedby different graph embeddings and compare them with traditionals

In [None]:
# # Testing
# text = reader_ingles.pages[0].extract_text()
# text = clean_text(text)
# text = translate_long_text(text, dest='pt')
# text = clean_text(text)

In [13]:
def list_pickle_files(folder: str) -> list:
    pickles = []
    for fname in os.listdir(folder):
        if fname.lower().endswith(('.pkl', '.pickle')):
            pickles.append(os.path.join(folder, fname))
    return pickles


# Example usage:
folder_path = "../articles/processed"
files = list_pickle_files(folder_path)

In [15]:
def extract_abstract(text: str) -> str:
    """
    Return the substring of `text` before the section heading INTRODUÇÃO.
    Matches INTRODUÇÃO (with or without cedilla, case‑insensitive).
    """
    # look for INTRODUÇÃO (or INTRODUCAO) as a whole word
    m = re.search(r'\bINTRODU[CÇ]ÃO\b', text, flags=re.IGNORECASE)
    if m:
        # take everything before that match
        return text[:m.start()].strip()
    else:
        # fallback: no heading found, return entire text
        return text.strip()

- Numero de sentenças

- Numero medio de sentenças

- Numero de tokens

- Numero medio de tokens

 - Top-10 tokens

- Down-10 tokens

- Numero de substativos

- Numero de verbos

- Numero de preposições

In [14]:
def top_n_tokens(text: str, n: int = 10):
    # 1) tokenize and lowercase
    tokens = word_tokenize(text, language='portuguese')
    tokens = [t.lower() for t in tokens]
    tokens = [
        t for t in tokens
        if t.isalpha()
           and t not in stop_words
    ]
    counts = Counter(tokens)
    least = sorted(counts.items(), key=lambda x: x[1])
    return Counter(tokens).most_common(n), least[:n]

In [15]:
def count_pos_tag(text: str):
    '''
    Count the number of nouns, verbs and prepositions in a text.
    :param text:
    :return:
    '''
    doc = nlp(text)
    counts = {'nouns': 0, 'verbs': 0, 'prepositions': 0}

    for tok in doc:
        if tok.pos_ in ('NOUN', 'PROPN'):  # common & proper nouns
            counts['nouns'] += 1
        elif tok.pos_ == 'VERB':  # verbs
            counts['verbs'] += 1
        elif tok.pos_ == 'ADP':  # adpositions
            counts['prepositions'] += 1

    return counts

In [16]:
total_sentences = 0
total_tokens = 0

for paper in files:
    with open(paper, "rb") as f:
        text = pickle.load(f)
    print('*' * 100)
    print('Artigo:', paper)
    print('*' * 100)
    tokens = word_tokenize(text)
    sentences = sent_tokenize(text, language='portuguese')
    total_sentences += len(sentences)
    total_tokens += len(tokens)
    print(f"Tokens: {len(tokens)}")
    print('*' * 100)
    print(f"Sentences: {len(sentences)}")
    print('*' * 100)
    top, down = top_n_tokens(text, 10)
    print('Top-10 tokens:', top)
    print('*' * 100)
    print('Down-10 tokens:', down)
    print('*' * 100)
    print('Qtd Substantivos:', count_pos_tag(text)['nouns'], )
    print('Qtd Verbos:', count_pos_tag(text)['verbs'])
    print('Qtd Preposições:', count_pos_tag(text)['prepositions'])

print('*' * 100)
print('*' * 100)
print('Número Médio de Tokens:', total_tokens / len(files))
print('Número Médio de Sentenças:', total_sentences / len(files))

****************************************************************************************************
Artigo: ../articles/processed/985-24753-2-10-20241004.pkl
****************************************************************************************************
Tokens: 9050
****************************************************************************************************
Sentences: 160
****************************************************************************************************
Top-10 tokens: [('undersampling', 49), ('métodos', 49), ('classe', 47), ('dados', 36), ('viés', 35), ('instâncias', 33), ('modelos', 28), ('transformers', 27), ('efetividade', 25), ('classificação', 25)]
****************************************************************************************************
Down-10 tokens: [('emclassificação', 1), ('transformersguilherme', 1), ('brazilgabriel', 1), ('brazilwashington', 1), ('brazilmarcos', 1), ('brazilleonardo', 1), ('brazilabstractautomatic', 1), ('text', 1)