<a href="https://colab.research.google.com/github/denbonte/rogue-yoda/blob/main/map_nogp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup and Init

In [102]:
%%capture
!pip install icecream

In [146]:
import os
import json

import numpy as np
import pandas as pd

import difflib
import requests
import lxml.html as lh

from icecream import ic

# Webpage Scraping

In [104]:
# address of the page holding the table scraped below; it must be defined here,
# otherwise this cell fails with a NameError on a fresh kernel
url = 'https://nogreenpassdocenti.wordpress.com/s/'

# download the page; the timeout prevents the cell from hanging forever
page = requests.get(url, timeout=30)

# fail loudly on an HTTP error instead of silently parsing an error page
page.raise_for_status()

# parse the raw HTML of the response into an lxml element tree
doc = lh.fromstring(page.content)

# collect every table row (<tr>..</tr>) found in the document
tr_elements = doc.xpath('//tr')

In [105]:
# the cells of the first table row are the column names:
# create one (still empty) list per column, keyed by the header text
tr_elements = doc.xpath('//tr')
table_dict = {}
idx = 0

for idx, header_cell in enumerate(tr_elements[0], start=1):
  key = header_cell.text_content()
  print('%d:"%s"' % (idx, key))
  table_dict[key] = []

1:"Cognome"
2:"Nome"
3:"Qualifica e disciplina"
4:"Ateneo"


In [106]:
keys_list = list(table_dict.keys())

# start from empty columns so that re-running this cell does not append
# the same rows a second time
for key in keys_list:
  table_dict[key] = []

# since our first row is the header, data is stored on the second row onwards
for row in tr_elements[1:]:

  # text of every cell of this row, in column order
  cells = [str(cell.text_content()) for cell in row.iterchildren()]

  # a row with fewer cells than the header (e.g. a missing "Ateneo") is padded
  # with empty strings, which are turned into "NA" below; this keeps all the
  # columns the same length without special-casing any particular row
  cells += [""] * (len(keys_list) - len(cells))

  # cells beyond the number of header columns are ignored by zip
  for key, data in zip(keys_list, cells):

    # blank cells are recorded as "NA"
    if data == " " or data == "":
      data = "NA"

    table_dict[key].append(data)

In [107]:
# one DataFrame column per header key of table_dict, one row per scraped table row
df = pd.DataFrame(table_dict)

In [108]:
# preview the first rows of the scraped table
df.head()

Unnamed: 0,Cognome,Nome,Qualifica e disciplina,Ateneo
0,Acquaviva,Graziella,Professore Associato,Università di Torino
1,Adami,Nicola,Ricercatore – Telecomunicazioni,Università degli Studi di Brescia
2,Afriat,Alexander,Professore di Filosofia,Université de Bretagne Occidentale
3,Almarai,Akeel,Professore Associato Lingua e letteratura araba,Università per Stranieri di Siena
4,Alpini,Stefano,Docente a contratto,Università di Pisa


# Data Preprocessing

In [169]:
!rm -r rogue-yoda
!git clone https://github.com/denbonte/rogue-yoda.git

Cloning into 'rogue-yoda'...
remote: Enumerating objects: 39, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 39 (delta 14), reused 17 (delta 6), pack-reused 0[K
Unpacking objects: 100% (39/39), done.


In [140]:
# name of the JSON mapping file, looked up inside the "rogue-yoda" folder
mapping_fn = "uni_mapping.json"

mapping_file_path = os.path.join(
  "rogue-yoda",
  mapping_fn,
)

In [170]:
# read the JSON mapping file (UTF-8) into a dictionary
with open(mapping_file_path, encoding='utf-8') as mapping_file:
  mapping_dict = json.load(mapping_file)

In [171]:
region_list = list()
hardcoded_keys = list(mapping_dict.keys())

# the "Ateneo" strings are free text, so each one is matched to the most
# similar key of the mapping file rather than looked up exactly
for uni in df["Ateneo"].values:

  # candidates are returned best-first; the list is EMPTY when no key is
  # similar enough, so it must not be indexed blindly
  matches = difflib.get_close_matches(uni, hardcoded_keys)

  if matches:
    closest_match = matches[0]
    region_list.append(mapping_dict[closest_match])
  else:
    # no usable match: label the row "NA" (such rows are dropped further down)
    region_list.append("NA")

In [172]:
# region_list holds one entry per row of df, in row order
df["Regione"] = region_list

In [173]:
# preview the table with the new "Regione" column
df.head()

Unnamed: 0,Cognome,Nome,Qualifica e disciplina,Ateneo,Regione
0,Acquaviva,Graziella,Professore Associato,Università di Torino,Piemonte
1,Adami,Nicola,Ricercatore – Telecomunicazioni,Università degli Studi di Brescia,Lombardia
3,Almarai,Akeel,Professore Associato Lingua e letteratura araba,Università per Stranieri di Siena,Toscana
4,Alpini,Stefano,Docente a contratto,Università di Pisa,Toscana
5,Amato,Pierandrea,Professore di Estetica,Università di Messina,Sicilia


In [174]:
# rows whose "Regione" is one of these labels are removed from the table
droplist = ["NA", "Abroad"]

# use droplist itself for the filter, so the labels are defined in one place only
df = df.drop(df[df["Regione"].isin(droplist)].index)

df.head()

Unnamed: 0,Cognome,Nome,Qualifica e disciplina,Ateneo,Regione
0,Acquaviva,Graziella,Professore Associato,Università di Torino,Piemonte
1,Adami,Nicola,Ricercatore – Telecomunicazioni,Università degli Studi di Brescia,Lombardia
3,Almarai,Akeel,Professore Associato Lingua e letteratura araba,Università per Stranieri di Siena,Toscana
4,Alpini,Stefano,Docente a contratto,Università di Pisa,Toscana
5,Amato,Pierandrea,Professore di Estetica,Università di Messina,Sicilia


In [175]:
tot_signatures = 0

# print the number of rows for every distinct region (np.unique returns the
# regions sorted), accumulating the grand total along the way
for region in np.unique(df["Regione"]):
  n_signatures = int((df["Regione"] == region).sum())
  ic(region, n_signatures)

  tot_signatures = tot_signatures + n_signatures

_ = ic(tot_signatures)

ic| region: 'Abruzzo', n_signatures: 6
ic| region: 'Basilicata', n_signatures: 1
ic| region: 'Calabria', n_signatures: 5
ic| region: 'Campania', n_signatures: 28
ic| region: 'Emilia Romagna', n_signatures: 26
ic| region: 'Friuli Venezia Giulia', n_signatures: 11
ic| region: 'Lazio', n_signatures: 33
ic| region: 'Liguria', n_signatures: 14
ic| region: 'Lombardia', n_signatures: 44
ic| region: 'Marche', n_signatures: 12
ic| region: 'Milano', n_signatures: 1
ic| region: 'Molise', n_signatures: 2
ic| region: 'Piemonte', n_signatures: 23
ic| region: 'Puglia', n_signatures: 7
ic| region: 'Sardegna', n_signatures: 8
ic| region: 'Sicilia', n_signatures: 17
ic| region: 'Toscana', n_signatures: 37
ic| region: 'Umbria', n_signatures: 5
ic| region: 'Veneto', n_signatures: 11
ic| tot_signatures: 291


# Plotting