## Script to process Plataforma Sucupira's sheets 

The purpose is to produce the updated list of UFPB's graduate programs.

In [1]:
# LISTA DE PPGs - PLATAFORMA SUCUPIRA

# Graduate program tables are found on Sucupira platform.
# Here, the ideal solution is to implement JS scraping with
# selenium, dryscrape or an equivalent tool.
# However, this is cumbersome for the simple scraping we need. 
# For the moment, the workaround is to download 
# the XLS files from these links and handle them.
# REMARK: the spreadsheets might be updated by CAPES periodically
# or on a 4-year basis after the QUADRIENAL.

#sucupira_links = {
#    'UFPB-JP':'https://sucupira.capes.gov.br/sucupira/public/consultas/coleta/programa/quantitativos/quantitativoPrograma.jsf?areaAvaliacao=0&cdRegiao=2&sgUf=PB&ies=338423',
#    'UFPB-Areia':'https://sucupira.capes.gov.br/sucupira/public/consultas/coleta/programa/quantitativos/quantitativoPrograma.jsf?areaAvaliacao=0&cdRegiao=2&sgUf=PB&ies=338424',
#    'UFPB-RioTinto':'https://sucupira.capes.gov.br/sucupira/public/consultas/coleta/programa/quantitativos/quantitativoPrograma.jsf?areaAvaliacao=0&cdRegiao=2&sgUf=PB&ies=338425'}    


# TABELA DE ÁREAS DE CONHECIMENTO
# http://lattes.cnpq.br/documents/11871/24930/TabeladeAreasdoConhecimento.pdf/d192ff6b-3e0a-4074-a74d-c280521bd5f7

In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
# Directory holding the XLS files downloaded by hand from Plataforma Sucupira.
base = '../input/sucupira/'

sheets = os.listdir(base)

# One spreadsheet is expected per campus. `ok` maps each expected file name
# to whether it is present in `base`; its keys drive the loading loop later on.
expected = [f'UFPB-{prefix}.xls' for prefix in ['JP', 'AREIA', 'RT']]
ok = {name: name in sheets for name in expected}

# Fail early, and say exactly which files are absent, instead of letting
# the spreadsheet reader fail later with a less helpful error.
missing = [name for name, found in ok.items() if not found]
if missing:
    raise FileNotFoundError(
        f'Sucupira files should be renamed to:\n {list(ok.keys())}\n'
        f'Missing from {base!r}: {missing}')


# File stem (file name without extension) -> campus name.
fix = {'UFPB-JP': 'João Pessoa',
       'UFPB-AREIA': 'Areia',
       'UFPB-RT': 'Rio Tinto'}

# Two-letter code -> full name of the degree modality.
# NOTE(review): presumably used to relabel the ME/DO/MP/DP spreadsheet
# columns further down the notebook -- confirm against later cells.
modalidade = {'ME': 'Mestrado',
              'DO': 'Doutorado',
              'MP': 'Mestrado Profissional',
              'DP': 'Doutorado Profissional'}

In [5]:
# Build the consolidated list of graduate programs: read one spreadsheet per
# campus, tag every row with its campus, stack the sheets, normalise the
# program names, and export the result as CSV.

# Columns kept from each spreadsheet, in output order ('Campus' is added here).
columns = ['Nome do Programa', 'Campus', 'Código do Programa',
           'ME', 'DO', 'MP', 'DP']

dfs = []
for f in ok.keys():
    # 'UFPB-JP.xls' -> 'UFPB-JP', the key used by `fix`. splitext only strips
    # the final extension, so extra dots in a file name cannot break this.
    campus, _ = os.path.splitext(f)
    df = pd.read_excel(os.path.join(base, f))
    # A scalar broadcasts to every row; assign() returns a new frame, so the
    # frame read from disk is not mutated.
    df = df.assign(Campus=fix[campus])[columns]
    dfs.append(df)

# Each sheet carries its own 0..n-1 index; discard those so row labels
# are not duplicated across campuses.
lista_ppg = pd.concat(dfs, ignore_index=True)
# .str.upper() propagates missing names instead of raising on a blank cell.
lista_ppg['Nome do Programa'] = lista_ppg['Nome do Programa'].str.upper()
# Renumber after sorting so the index written to the CSV is unique and
# follows the alphabetical order of the rows.
lista_ppg = lista_ppg.sort_values(by='Nome do Programa').reset_index(drop=True)
lista_ppg = lista_ppg.rename(columns={'Nome do Programa': 'nome_ppg',
                                      'Campus': 'campus',
                                      'Código do Programa': 'codigo_CAPES'})
# The index is still written as the first (unnamed) CSV column, so the file
# keeps the same column layout as before.
lista_ppg.to_csv(os.path.join(os.pardir, 'input', 'lista-ppg.csv'))