- Notebook para baixar os metadados (incluindo o link do PDF) dos trabalhos no Pantheon

In [None]:
import json
from dataclasses import asdict, dataclass

import bs4
import requests
from bs4 import BeautifulSoup

- Papers baixados são representados pela classe Paper

In [None]:
@dataclass
class Paper:
    """Metadata record for one work scraped from the Pantheon repository.

    The first six fields have no default and must be supplied when the
    object is built; the remaining fields default to None.
    """
    title: str
    abstract: str
    keywords: list[str]
    # Date text taken from the item page; load_papers reads the year from the
    # part after the last '-'.
    issue_date: str
    language: str
    # Absolute URL of the PDF; get_papers stores None when the item page
    # links no PDF.
    paper_pdf: str | None
    # Filled by get_papers from a second "abstract" row on the item page.
    abstract_eng: str | None = None
    # Not assigned anywhere in the visible code — presumably a label added in
    # a later step; TODO confirm.
    is_ai: bool | None = None
    # Derived from issue_date by load_papers when missing.
    year: int | None = None
    id: str | None = None

def extract_keywords(s):
    """Split a run of concatenated keywords into a list.

    A new keyword starts wherever a lowercase character is immediately
    followed by an uppercase one (e.g. "Deep learningRobotics" gives
    ["Deep learning", "Robotics"]).

    Args:
        s: String with the keywords glued together.

    Returns:
        list[str]: the pieces in order; an empty list for an empty string.
    """
    if not s:
        return []

    pieces = []
    start = 0
    # Look at each adjacent pair (previous, current) of characters.
    for pos in range(1, len(s)):
        if s[pos - 1].islower() and s[pos].isupper():
            pieces.append(s[start:pos])
            start = pos

    # Whatever follows the last boundary is the final keyword.
    pieces.append(s[start:])
    return pieces

def load_papers(json_file_path):
    """Load Paper records from a JSON file.

    Args:
        json_file_path: Path to a JSON file holding a list of objects whose
            keys are Paper field names (such as one written by
            save_papers_to_json).

    Returns:
        list[Paper]: one Paper per object, in file order. Every paper whose
        ``year`` is None gets it derived from the text after the last '-'
        in its ``issue_date``. An empty file list yields an empty list.

    Raises:
        TypeError: if an object has keys that are not Paper fields, or lacks
            a required field.
        ValueError: if a missing year cannot be parsed from ``issue_date``.
    """
    with open(json_file_path, encoding='utf-8') as f:
        papers_dict_list = json.load(f)

    papers = [Paper(**paper_dict) for paper_dict in papers_dict_list]

    # Check each paper instead of only the first one: an empty list no longer
    # raises IndexError, and files where only some records carry a year are
    # completed without touching the years already present.
    for paper in papers:
        if paper.year is None:
            paper.year = int(paper.issue_date.split('-')[-1])

    return papers

def save_papers_to_json(papers: list[Paper], json_file_path):
    """Write papers to a JSON file as a list of objects.

    Each Paper is converted to a plain dict with ``dataclasses.asdict`` and
    the resulting list is dumped in one go; an existing file at the target
    path is overwritten.

    Args:
        papers: Paper instances to serialize.
        json_file_path: Destination path of the JSON file.
    """
    papers_as_dicts = [asdict(paper) for paper in papers]

    # Explicit encoding so the file does not depend on the platform default.
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(papers_as_dicts, json_file)

- Função que recebe uma url do Pantheon com a página listando os trabalhos e baixa o conteúdo da página, salvando as informações de cada trabalho em um objeto Paper

In [None]:
def get_papers(url, timeout=60):
    """Scrape a Pantheon listing page and build a Paper for every item it links.

    The listing page is fetched first; every site-relative link made of
    exactly three path segments is treated as an item page. Each item page is
    then fetched and its metadata table is turned into a Paper.

    Args:
        url: URL of a Pantheon page listing works.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        list[Paper]: one Paper per item link, in the order the links appear.

    Raises:
        requests.RequestException: if a request fails, times out, or a page
            answers with an HTTP error status.
        ValueError: if an item page has no metadata table.
        TypeError: if an item page lacks a field that Paper requires.
    """
    base_url = 'https://pantheon.ufrj.br'

    listing = requests.get(url, timeout=timeout)
    listing.raise_for_status()
    listing_soup = BeautifulSoup(listing.text, 'html.parser')

    # href=True skips anchors without an href, which would otherwise be None.
    item_paths = []
    for link in listing_soup.find_all('a', href=True):
        href = link['href']
        parts = href.split('/')
        # '/a/b/c'.split('/') == ['', 'a', 'b', 'c']: site-relative, 3 segments.
        if len(parts) == 4 and parts[0] == '':
            item_paths.append(href)

    papers = []
    for item_path in item_paths:
        item_url = f'{base_url}{item_path}'
        item = requests.get(item_url, timeout=timeout)
        item.raise_for_status()
        item_soup = BeautifulSoup(item.text, 'html.parser')

        table = item_soup.find('table', class_='table itemDisplayTable')
        if table is None:
            raise ValueError(f'No metadata table found at {item_url}')

        paper_dict = {}
        for tr in table:
            # Direct children of the table also include bare text nodes.
            if isinstance(tr, bs4.element.NavigableString):
                continue
            row = [td.text for td in tr.find_all('td')]
            if len(row) < 2:
                # Not a "label / value" row.
                continue
            # Normalize the label, e.g. 'Issue Date: ' becomes 'issue_date'.
            key = row[0].split(':')[0].lower().replace(' ', '_').replace('/', '_')
            if key not in Paper.__dataclass_fields__:
                continue
            # A repeated "abstract" row is stored as the English abstract.
            if key == 'abstract' and key in paper_dict:
                key = 'abstract_eng'
            if key == 'keywords':
                paper_dict[key] = extract_keywords(row[1])
            else:
                paper_dict[key] = row[1]

        # Take the first PDF link in page order so the choice is deterministic
        # when a page links more than one file.
        pdf_path = next(
            (
                link['href']
                for link in item_soup.find_all('a', href=True)
                if link['href'].lower().endswith('.pdf')
            ),
            None,
        )
        paper_dict['paper_pdf'] = f'{base_url}{pdf_path}' if pdf_path else None

        papers.append(Paper(**paper_dict))

    return papers

- Loop para baixar os dados de todos os trabalhos

In [None]:
tcc_papers = []

# Result offsets requested from the search: 0, 100, ..., 14200
# (the query asks for 100 results per page via rpp=100).
page_starts = range(0, 14201, 100)
# The last two offsets always get a snapshot, besides the one taken at every
# non-zero multiple of 1000.
final_checkpoints = (14100, 14200)

for page_number, start in enumerate(page_starts):

    url = f'https://pantheon.ufrj.br/handle/11422/11/simple-search?query=&filter_field_1=dateIssued&filter_type_1=equals&filter_value_1=%5B2010+TO+2024%5D&sort_by=dc.date.issued_dt&order=asc&rpp=100&etal=0&start={start}'

    tcc_papers.extend(get_papers(url))

    periodic_checkpoint = start > 0 and start % 1000 == 0
    if periodic_checkpoint or start in final_checkpoints:
        # Each snapshot holds everything collected so far, not only the new page.
        save_papers_to_json(tcc_papers, json_file_path=f'papers_tcc_{start}.json')

    # Progress indicator: number of the page just processed.
    print(page_number)

- Código para baixar o pdf de um paper

In [None]:
# Paste the URL of the PDF to download here (e.g. the paper_pdf of a Paper).
url = ''

if not url:
    # The placeholder is still empty: skip the request instead of raising,
    # so the notebook survives a full "Run All".
    print("Set `url` to a PDF link before running this cell.")
else:
    response = requests.get(url, timeout=60)
    if response.status_code == 200:
        with open("file.pdf", "wb") as file:
            file.write(response.content)
        print("File downloaded successfully!")
    else:
        # Report the failure instead of finishing silently.
        print(f"Download failed with HTTP status {response.status_code}.")