# UFPB Researcher Hub

**Objective:** this script scrapes UFPB's databases of active researchers and builds a dataframe of the research subjects they work on, so that we can find research clusters and build a primary hub for future international purposes.

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re, pickle, os

In [None]:
# base name (without extension) of the pickle file that stores the staff data
researcher_str = 'ufpb-researcher-staff'

## Scrape UFPB's professor staff list 

- Data retrieval occurs: 
    - either by web scraping Department's websites
    - or by loading pre-serialized data (if any)

- Use `scrape_on=True` for updated web scraping

In [None]:
# table of departments and their associated links (the file is ';'-separated)
ppg_list = pd.read_csv('../input/lista-ppg-THE.csv', sep=';')

# True  -> retrieve the staff data again from the UFPB portal
# False -> load the previously pickled data instead
scrape_on = False

In [None]:
# base URL of a program's public staff list on SIGAA; the program id taken
# from the program's own URL is appended to it
staff_base_link = 'https://sigaa.ufpb.br/sigaa/public/programa/equipe.jsf?lc=pt_BR&id='

## Identify particular PPGs with nonstandard SIGAA URL

In [None]:
# Rows of the programs that are left out of the standard parsing; the pattern
# is a regular-expression alternation matched against the program name.
is_nonstandard = ppg_list['nome_curso_posgrad'].str.contains(
    'DESENVOLVIMENTO E MEIO AMBIENTE/REDE|ENGENHARIA DE PRODUÇÃO E SISTEMAS')
exception = ppg_list[is_nonstandard]

# row labels to skip when the standard staff URLs are built
out_of_parsing = exception.index.tolist()

# TODO: PPGEPS staff page (nonstandard):
# http://www.ct.ufpb.br/ppgeps/contents/menu/corpo%20docente

In [None]:
# Collect, for every program handled by the standard parser, its name and the
# URL of its staff page (base link + the id found in the program's own URL).
ppg_name = []
url_staff = []
for row in range(len(ppg_list)):
    if row in out_of_parsing:
        continue  # nonstandard program, skipped here
    ppg_name.append(ppg_list['nome_curso_posgrad'][row])
    # the program URL is expected to contain 'id=' exactly once
    _, program_id = ppg_list['url_curso_posgrad'][row].split('id=')
    url_staff.append(staff_base_link + program_id)

In [None]:
# location of the pickled staff data: <parent dir>/pkl/<researcher_str>.pkl
staff_pkl = os.path.join(os.pardir, 'pkl', researcher_str + '.pkl')

if scrape_on:

    # Scrape the staff page of every program.
    # data maps program name -> {'members': [...], 'member_page': [...]}
    data = {}
    for name, url in zip(ppg_name, url_staff):
        # the context manager closes the HTTP response once it has been read
        with urlopen(url) as html:
            bs = BeautifulSoup(html.read(), 'html.parser')
        staff = bs.body.find_all('a', {"class": "cor"})

        members = []
        member_page = []
        for s in staff:
            members.append(s.getText().strip())
            # rebuild the link with 'pesquisa' in place of 'portal'; the link
            # is expected to contain 'portal' exactly once
            a, b = s['href'].split('portal')
            member_page.append('https://sigaa.ufpb.br' + a + 'pesquisa' + b)

        data[name] = {'members': members, 'member_page': member_page}

    # make sure the target folder exists, so that a finished scrape is not
    # lost when the file is written
    os.makedirs(os.path.dirname(staff_pkl), exist_ok=True)

    # pickle data
    with open(staff_pkl, 'wb') as f:
        pickle.dump(data, file=f)

else:
    # load data
    with open(staff_pkl, 'rb') as f:
        data = pickle.load(f, encoding='utf8')

In [None]:
# For every program, visit each member page and collect the knowledge areas
# listed there. The result is stored under data[k]['areas'] as one list per
# member page, in the same order as data[k]['member_page'].
for k in data:
    aux = []
    for m, page in enumerate(data[k]['member_page']):
        print(m, end=' ')  # progress indicator
        # the context manager closes the HTTP response once it has been read
        with urlopen(page) as html:
            bs = BeautifulSoup(html.read(), 'html.parser')
        area = bs.body.find_all('td', {"class": "area"})
        # drop repeated areas but keep the order in which they appear on the
        # page; going through a set would give an arbitrary order on each run
        knowledge_areas = list(dict.fromkeys(a.getText().strip() for a in area))
        aux.append(knowledge_areas)
    data[k].update({'areas': aux})
    print((k, 'concluded'))

# pickle data
with open(staff_pkl, 'wb') as f:
    pickle.dump(data, file=f)

## PPGEPS: particular case

PPGEPS's scraping needs some manual handling because its website is nonstandard. Therefore, we:

1. mine the website on CT to get the staff
2. manually search for their SIGAA public page
3. scrape SIGAA to get their knowledge areas. 
4. update the database and pickle.

In [None]:
# manual treatment
ppgeps = 'http://www.ct.ufpb.br/ppgeps/contents/menu/corpo%20docente'
# the context manager closes the HTTP response once it has been read
with urlopen(ppgeps) as response:
    get = BeautifulSoup(response.read(), 'html.parser').body.find_all('p', {"class": "callout"})

# staff names: the text before the first double non-breaking space, upper-cased
ppgeps_names = [t.getText().strip().split('\xa0\xa0')[0].upper() for t in get]

# member pages, listed by hand; they are paired with the names by position
ppgeps_sites = [
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=1859144',
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=1642093',
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=2366533',
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=1605391',
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=6336620',
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=1298891',
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=2225575',
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=2317198',
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=1217340',
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=2348485']

# The names come from a live page while the links are fixed, so the two lists
# can drift apart; stop here rather than store mismatched names and pages.
if len(ppgeps_names) != len(ppgeps_sites):
    raise ValueError(
        f'PPGEPS: {len(ppgeps_names)} names were scraped but '
        f'{len(ppgeps_sites)} member pages are listed; update ppgeps_sites')

ppgeps_key = 'ENGENHARIA DE PRODUÇÃO E SISTEMAS'

# append to data
data[ppgeps_key] = {'members': ppgeps_names,
                    'member_page': ppgeps_sites}

# get knowledge areas
aux = []
for page in data[ppgeps_key]['member_page']:
    with urlopen(page) as html:
        bs = BeautifulSoup(html.read(), 'html.parser')
    area = bs.body.find_all('td', {"class": "area"})
    # drop repeated areas but keep the order in which they appear on the
    # page; going through a set would give an arbitrary order on each run
    knowledge_areas = list(dict.fromkeys(a.getText().strip() for a in area))
    aux.append(knowledge_areas)
data[ppgeps_key].update({'areas': aux})

## PRODEMA/REDE: particular case

PRODEMA/REDE's scraping needs some manual handling because its website is nonstandard. All of the scraping is done manually.

In [None]:
# staff names, listed by hand
prodema_names = [
    'BARTOLOMEU ISRAEL DE SOUZA',
    'EDUARDO RODRIGUES VIANA DE LIMA',
    'JOEL SILVA DOS SANTOS',
    'LUIZ CARLOS SERRAMO LOPEZ',
    'MARIA CRISTINA BASILIO CRISPIM DA SILVA',
    'MARILIA GABRIELA DOS SANTOS CAVALCANTI',
    #'MARISTELA OLIVEIRA DE ANDRADE', # not found on SIGAA
    'NATALY ALBUQUERQUE DOS SANTOS',
    'RAIMUNDO APRIGIO DE MENEZES JUNIOR',
    #'REINALDO FARIAS PAIVA DE LUCENA' # not found on SIGAA
]

# member pages, listed by hand; they are paired with the names by position
prodema_sites = [
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=4201553',
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=338351',
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=2560868',
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=1668604',
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=2335304',
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=1644565',
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=1964406',
    'https://sigaa.ufpb.br/sigaa/public/docente/pesquisa.jsf?siape=1866226']

# Both lists are maintained by hand; stop here if an edit left them with
# different lengths rather than store mismatched names and pages.
if len(prodema_names) != len(prodema_sites):
    raise ValueError(
        f'PRODEMA: {len(prodema_names)} names are listed but '
        f'{len(prodema_sites)} member pages are listed')

prodema_key = 'DESENVOLVIMENTO E MEIO AMBIENTE/REDE'

# append to data
data[prodema_key] = {'members': prodema_names,
                     'member_page': prodema_sites}

# get knowledge areas
aux = []
for page in data[prodema_key]['member_page']:
    # the context manager closes the HTTP response once it has been read
    with urlopen(page) as html:
        bs = BeautifulSoup(html.read(), 'html.parser')
    area = bs.body.find_all('td', {"class": "area"})
    # drop repeated areas but keep the order in which they appear on the
    # page; going through a set would give an arbitrary order on each run
    knowledge_areas = list(dict.fromkeys(a.getText().strip() for a in area))
    aux.append(knowledge_areas)
data[prodema_key].update({'areas': aux})


## Repickling

Pickle again to include manually scraped PPGs.

In [None]:
# write the current contents of data back to the pickle file
with open(staff_pkl, mode='wb') as pkl_file:
    pickle.dump(data, pkl_file)

## Unfolding data to form single dataframe

In [None]:
# Flatten the nested dictionary into four parallel lists: program name,
# member name and member page (one entry per member) plus one keyword string
# per entry of the program's 'areas' list.
ppg, researcher, homepage, keywords = [], [], [], []
for program, info in data.items():
    members = info['members']
    sites = info['member_page']

    for idx, name in enumerate(members):
        ppg.append(program)          # replicate PPG name
        researcher.append(name)      # member name
        homepage.append(sites[idx])  # member SIGAA page

    # each list of knowledge areas becomes a single ', '-separated string
    keywords.extend(', '.join(areas) for areas in info['areas'])

In [None]:
# Assemble the researcher table, order it by program name and renumber the rows
columns = {'PPG': ppg,
           'Membro': researcher,
           'Temas': keywords,
           'Site': homepage}
PPG_database = pd.DataFrame(columns)
PPG_database = PPG_database.sort_values(by='PPG')
PPG_database = PPG_database.reset_index(drop=True)

## Breaking list of _n_ keywords into _n_ columns

In [None]:
# split every keyword string into its individual (still unstripped) keywords
themes = PPG_database['Temas'].str.split(',')

# length of the longest keyword; it sets the length of the placeholder below
maxc = max(len(piece) for parts in themes for piece in parts)

# number of keywords per researcher and the largest of these numbers
sizes = [len(parts) for parts in themes]
n_max_keys = max(sizes)

# matrix with one row per researcher and one column per keyword slot, filled
# with a placeholder made of maxc zeros that marks the unused slots
placeholder = '0' * maxc
matrix = np.full(shape=(len(themes), n_max_keys), fill_value=placeholder)

# write the stripped keywords into the leading slots of each row
for row, parts in enumerate(themes):
    for col, piece in enumerate(parts):
        matrix[row, col] = piece.strip()

# one 'Tema <n>' entry per matrix column
dict_keywords = {f'Tema {col+1}': matrix[:, col] for col in range(matrix.shape[1])}

# keywords dataframe, with the placeholder replaced by an empty string
df_keys = pd.DataFrame(dict_keywords).replace(placeholder, '')

## Create new dataframe with separated research subjects

- `PPG_database_gross` includes all researchers, even those who did not fill in their information on SIGAA.

- `PPG_database_net` excludes researchers who did not fill in their information on SIGAA.

In [None]:
# swap the single 'Temas' column for the separated keyword columns
PPG_database = PPG_database.drop(columns=['Temas'])
PPG_database_gross = pd.concat((PPG_database, df_keys), axis='columns')

In [None]:
# keep only the keyword columns
theme_columns = [f'Tema {col+1}' for col in range(matrix.shape[1])]
empty = PPG_database_gross.filter(items=theme_columns, axis=1)

# positions of the rows in which every keyword cell is an empty string
all_blank = (empty == '').all(axis=1)
to_exclude = [pos for pos, blank in enumerate(all_blank) if blank]

n_excluded = len(to_exclude)
n_total = len(PPG_database_gross)
print(f'Exclusion rate: {n_excluded/n_total*100:.0f}% ({n_excluded} of {n_total})')

# net database: the gross one without the rows found above, renumbered
PPG_database_net = PPG_database_gross.drop(to_exclude)
PPG_database_net = PPG_database_net.reset_index(drop=True)

## Export dataframes to CSV

In [None]:
# write both versions of the database to CSV, leaving out the row index
for frame, target in ((PPG_database_gross, '../csv/PPG-RESEARCH-HUB-GROSS.csv'),
                      (PPG_database_net, '../csv/PPG-RESEARCH-HUB-NET.csv')):
    frame.to_csv(target, index=False)