In [19]:
import os
import gdown
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [3]:
def get_links_from_url(url) -> dict:
  """Collect the download links listed on a dataset page.

  Fetches ``url``, locates the ``<section id="dataset-resources">`` element and
  walks every ``<li class="resource-item">`` inside it. Items whose title (the
  text before the first ``-``) is ``'Dicionário de Dados'`` are skipped; for the
  others the ``href`` of the ``<a class="resource-url-analytics">`` anchor is
  collected.

  Args:
    url: Address of the dataset page to scrape.

  Returns:
    A dict mapping each file name (the last ``/``-separated segment of the
    link) to the link itself. An empty dict is returned when the page does not
    answer with status 200 or has no resources section, so callers can always
    iterate over the result.

  Raises:
    requests.exceptions.RequestException: if the request itself fails
      (connection error, timeout, ...).
  """
  # Without a timeout a stalled server would block the notebook indefinitely.
  response = requests.get(url, timeout=30)

  if response.status_code != 200:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
    return {}

  soup = BeautifulSoup(response.text, 'html.parser')
  section_datasets = soup.find('section', {'id': 'dataset-resources'})
  if section_datasets is None:
    return {}

  links = {}
  for resource_item in section_datasets.find_all('li', {'class': 'resource-item'}):
    title_anchor = resource_item.find('a')
    if title_anchor is None:
      # Item without any anchor: nothing to read a title or link from.
      continue

    # The title is the part of the anchor text that precedes the first '-'.
    head = title_anchor.get_text().strip().split('-')[0].strip()
    if head == 'Dicionário de Dados':
      continue

    download_anchor = resource_item.find('a', {'class': 'resource-url-analytics'})
    link = download_anchor.get('href') if download_anchor is not None else None
    if not link:
      # No usable download link on this item; skip it instead of crashing.
      continue

    filename = link.split('/')[-1]
    links[filename] = link

  return links

In [20]:
def download_files(links, outdir='scraped_data') -> pd.DataFrame:
  """Download every file in ``links`` and concatenate them into one DataFrame.

  Args:
    links: Mapping of file name -> download link. Each file is saved as
      ``outdir/<file name>``.
    outdir: Directory where the downloaded files are stored; created if it
      does not exist.

  Returns:
    The concatenation (``pd.concat``) of all downloaded files, each parsed as
    a ``;``-separated CSV. The original per-file row indexes are kept. An
    empty DataFrame is returned when ``links`` is empty or ``None``.
  """
  os.makedirs(outdir, exist_ok=True)

  # Nothing to download: return early instead of failing inside pd.concat.
  if not links:
    return pd.DataFrame()

  for filename, link in tqdm(links.items(), 'download datasets...'):
    file_path = os.path.join(outdir, filename)
    gdown.download(url=link, output=file_path, quiet=True, fuzzy=True)

  dfs = []
  for filename in tqdm(links.keys(), 'concating dataframes'):
    # Read from where the file was saved (outdir), not from the bare file name,
    # which would be resolved against the current working directory.
    file_path = os.path.join(outdir, filename)
    dfs.append(pd.read_csv(file_path, sep=';'))

  return pd.concat(dfs)

In [10]:
# Scrape the resource links from the dataset page, then download and merge
# every listed file into a single DataFrame.
filenames_links = get_links_from_url(
    url='https://dados.ufrn.br/dataset/discentes',
)
df_discentes = download_files(links=filenames_links)

download datasets...: 100%|██████████| 35/35 [00:07<00:00,  4.98it/s]
concating dataframes: 100%|██████████| 35/35 [00:01<00:00, 20.29it/s]


In [11]:
# Preview the raw name column before any cleaning.
df_discentes['nome_discente']

0                         ABDENOR BEZERRA DOS SANTOS
1                    ABDIAS MONTEIRO DE ANDRADE MELO
2                      ABDIAS SABINO RODRIGUES FILHO
3                       ABEL GOMES DE OLIVEIRA FILHO
4                        ABI AMANA DE AQUINO BEZERRA
                            ...                     
14454                   ZENEIDE LOBATO REIS DA SILVA
14455    ZENO AUGUSTO BARROS LOPES TEIXEIRA DE MOURA
14456               ZILANE SILVA BARBOSA DE OLIVEIRA
14457           ZILDA DALLIANY LISBOA ARRUDA DE MELO
14458            ZIRALDO TARGINO BEZERRA GOMES SILVA
Name: nome_discente, Length: 375957, dtype: object

In [12]:
# Normalise the names: trim surrounding whitespace, discard missing or blank
# entries, and lowercase what remains.
# NOTE: from this cell on `df_discentes` holds a Series of names rather than
# the full DataFrame, because the following cells call `df_discentes.apply(...)`
# on it. Consequently this cell cannot be re-run without re-running the
# download cell first.
nomes_limpos = df_discentes['nome_discente'].str.strip()
# Missing names (NaN) are not equal to '' and would otherwise survive the
# filter, then break the later `x.split(' ')` calls; drop them as well.
nomes_limpos = nomes_limpos[nomes_limpos.notna() & (nomes_limpos != '')]
df_discentes = nomes_limpos.str.lower()
df_discentes

0                         abdenor bezerra dos santos
1                    abdias monteiro de andrade melo
2                      abdias sabino rodrigues filho
3                       abel gomes de oliveira filho
4                        abi amana de aquino bezerra
                            ...                     
14454                   zeneide lobato reis da silva
14455    zeno augusto barros lopes teixeira de moura
14456               zilane silva barbosa de oliveira
14457           zilda dalliany lisboa arruda de melo
14458            ziraldo targino bezerra gomes silva
Name: nome_discente, Length: 375957, dtype: object

In [14]:
# First name: everything before the first space of the full name.
nomes = df_discentes.apply(lambda nome_completo: nome_completo.partition(' ')[0])

In [15]:
# Surname: everything after the first space of the full name
# (empty string when the name has a single word).
sobrenomes = df_discentes.apply(lambda nome_completo: nome_completo.partition(' ')[2])

In [16]:
# Put first names and surnames side by side, one row per entry, and preview
# the first ten rows.
df_nomes = pd.DataFrame(
    data={
        'nome': nomes,
        'sobrenome': sobrenomes,
    }
)
df_nomes.head(n=10)

Unnamed: 0,nome,sobrenome
0,abdenor,bezerra dos santos
1,abdias,monteiro de andrade melo
2,abdias,sabino rodrigues filho
3,abel,gomes de oliveira filho
4,abi,amana de aquino bezerra
5,abigail,gabrielli dantas rodrigues albuquerque
6,abigail,keyla de santana
7,abigail,sara paloma silva damasceno
8,abimael,elohim lima de assuncao
9,abimael,esdras carvalho de moura lira


In [17]:
# Persist the name table without the row index. The target directory is
# created first so the write does not fail on a fresh checkout where ./data
# does not exist yet.
os.makedirs('./data', exist_ok=True)
df_nomes.to_parquet('./data/df_nomes.parquet', index=False)