## Scrape undergraduate courses

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import os, pickle

## Customize file names

In [2]:
# Base file name (without extension) of the pickle that caches the scraped
# student data; it is joined with '../pkl' and '.pkl' further below.
staff_str = 'ufpb-student-staff'

In [3]:
# Load the table of undergraduate courses together with their portal links.
cursos = pd.read_csv('../input/lista-cursos-THE.csv')

# Pull the per-course columns out as arrays, in this order:
# course name, center acronym, modality, course page URL.
cnames, centers, modals, urls = (
    cursos[column].values
    for column in ('nome_curso_grad', 'sigla_centro',
                   'modalidade', 'url_curso_grad')
)

# Switch: True -> scrape fresh data from the UFPB portal;
#         False -> reuse the previously pickled data.
scrape_on = False

# Switch: True -> export sheets to CSV.
to_csv = True

In [4]:
# Location of the pickle that caches the scraped data: ../pkl/<staff_str>.pkl
staff_pkl = os.path.join(os.pardir,'pkl',staff_str + '.pkl')

# Prefix prepended to the relative href of a course's student-list page.
SIGAA_COURSE_BASE = 'https://sigaa.ufpb.br/sigaa/public/curso/'


def _fetch_soup(page_url):
    """Download ``page_url`` and return its content parsed by BeautifulSoup.

    The HTTP response is used as a context manager so the connection is
    closed as soon as the body has been read.
    """
    with urlopen(page_url) as response:
        return BeautifulSoup(response.read(), 'html.parser')


def _scrape_course(course_url):
    """Scrape the student list of the course whose page is ``course_url``.

    Returns a dict with the keys:
      'lista-discentes' : dict mapping student name -> enrollment number
      'link'            : full URL of the student-list page
      'n-discentes'     : number of (enrollment, name) rows on that page

    Raises IndexError when the course page has no ``<a class="alunos">``
    link, naming the offending URL.
    """
    course_page = _fetch_soup(course_url)

    # first anchor of class 'alunos' that carries an href
    anchor = course_page.find('a', {'class': 'alunos'}, href=True)
    if anchor is None:
        raise IndexError(
            f"no 'alunos' link found on course page {course_url}")

    # form full link
    alink = SIGAA_COURSE_BASE + anchor['href']

    # stripped text of every table cell on the student-list page
    list_page = _fetch_soup(alink)
    cells = [td.get_text().strip() for td in list_page.find_all('td')]

    # The last cell is discarded -- presumably a footer/total cell rather
    # than student data (TODO confirm against the live page).
    cells = cells[:-1]

    # cells alternate: enrollment number, name, enrollment number, name, ...
    matr = cells[0::2]
    name = cells[1::2]
    rows = list(zip(name, matr))

    return {
        # NOTE: keyed by name, so homonymous students share a single entry
        'lista-discentes': dict(rows),
        'link': alink,
        # Count table rows rather than dict keys, otherwise students with
        # identical names would be counted only once.
        'n-discentes': len(rows),
    }


if scrape_on:

    # Maps 'course name:modality:center acronym' -> scraped info.
    # Courses sharing all three fields overwrite one another.
    data_courses = {}

    for c_name, c_acr, modal, url in zip(cnames, centers, modals, urls):
        data_courses[f'{c_name}:{modal}:{c_acr}'] = _scrape_course(url)

    # make sure the cache folder exists before writing into it
    os.makedirs(os.path.dirname(staff_pkl), exist_ok=True)
    with open(staff_pkl,'wb') as f:
        pickle.dump(data_courses,file=f)

else:

    # SECURITY: unpickling can execute arbitrary code; only load a cache
    # file that was produced by this notebook.
    with open(staff_pkl,'rb') as f:
        data_courses = pickle.load(f,encoding='utf8')

## Report - no classification

In [5]:
# Output folder of the report without classification.
staff_xlsx = os.path.join(os.pardir,'xlsx','no-class','undergrads')

# Build one record per course from the scraped/cached data.
records = []
for key, info in data_courses.items():
    # Keys have the form 'course name:modality:center'. Split from the right
    # so that a ':' inside the course name does not break the unpacking.
    cur, mod, cen = key.rsplit(':', 2)
    records.append({
        'nome_curso_grad': cur,
        'modalidade': mod,
        'centro': cen,
        # this column stores the URL of the student-list page
        'lista_discentes': info['link'],
        'n_discentes': info['n-discentes'],
    })

# Explicit `columns` keeps the column order and still yields the expected
# (empty) frame when there are no records.
df_curso = pd.DataFrame(
    records,
    columns=['nome_curso_grad', 'modalidade', 'centro',
             'lista_discentes', 'n_discentes'],
)

# make sure the output folder exists before writing the spreadsheet
os.makedirs(staff_xlsx, exist_ok=True)
file_save = os.path.join(staff_xlsx,'TOTAL.xlsx')

# `encoding` is not passed: DataFrame.to_excel no longer accepts it
# (removed in pandas 2.0) and raises TypeError if it is given.
df_curso.to_excel(file_save,index=False)

In [6]:
# Sanity check: number of courses in the report vs. number of courses
# in the input list.

df_curso.shape[0], cursos.shape[0]

(113, 120)

## Report - THE classification

In [7]:
# NOTE: this whole cell is disabled. The code below is wrapped in a string
# literal (marked CHECK), so nothing in it is executed; the trailing ';'
# suppresses the notebook echo of that string.
# NOTE(review): if re-enabled, the `.values[0]` lookup raises IndexError for
# any course of `cursos` that is missing from `df_curso` -- possibly the
# reason it was parked; confirm before turning it back on.
''' CHECK

staff_xlsx_the = os.path.join(os.pardir,'xlsx','the','undergrads')

# get THE classes        
the_classes = set(cursos['THE_classificacao'])

# sums members per THE class
totals_the = {}

for c in the_classes:    
    aux = []
    sub = cursos[cursos['THE_classificacao'] == c]
    
    
    for nc in sub['nome_curso_grad'].values:
        n_stud = df_curso[df_curso['nome_curso_grad'] == nc]['n_discentes'].values[0]
        aux.append(n_stud)
    totals_the[c] = sum(aux)

# splits by areas
areas,ns = [],[]
for k,v in totals_the.items():
    areas.append(k)
    ns.append(v)

file_save = os.path.join(staff_xlsx_the, 'TOTAL.xlsx')    
df_the = pd.DataFrame({'areas':areas,'discentes':ns}).sort_values(by='areas')
df_the.to_excel(file_save,index=False)

''';