In [1]:
from collections import Counter
import json
import string
import re
import random
import zipfile

import boto3
from bs4 import BeautifulSoup
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.utils.data_utils import _hash_file
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import numpy as np
from tqdm import tqdm

Using TensorFlow backend.


The tag catalogue and the act-to-tags mapping are read into pandas DataFrames.

In [2]:
# Tag catalogue; get_act_tags reads its `id`, `name` and `type` columns.
tags_df = pd.read_csv('./opp_data/atti/tags.csv')
# Act-to-tags mapping; get_act_tags reads its `act_id` and `tags_ids` columns.
tagging_df = pd.read_csv('./opp_data/atti/atti_tags.csv')

## Function definitions
The helper functions below extract and clean the text of each document and look up its tags.

In [56]:
def get_act_tags(act_id: int) -> list:
    """Return all tags for the act having the given id, as triplets:
      0 - the tag id,
      1 - the tag text
      2 - the tag type (teseo|geoteseo|op_geo)

    The tag ids are read from the ``tags_ids`` column of the first row of the
    module-level ``tagging_df`` whose ``act_id`` matches, and each id is then
    resolved against the module-level ``tags_df``.

    :param act_id: id of the act, compared with ``tagging_df.act_id``
    :return: list of ``(id, name, type)`` tuples; an empty list when no row
             of ``tagging_df`` matches ``act_id``
    :raises IndexError: when a referenced tag id has no row in ``tags_df``
    """
    act_rows = list(tagging_df[tagging_df.act_id == act_id].tags_ids)
    if not act_rows:
        return []

    # ``tags_ids`` is a ':'-separated string whose first chunk is discarded
    # (presumably the string starts with a ':' -- TODO confirm against the CSV).
    raw_tag_ids = act_rows[0].split(':')[1:]

    tags = []
    for raw_id in raw_tag_ids:
        # Filter tags_df once per tag, instead of once per extracted field.
        tag_row = tags_df[tags_df.id == int(raw_id)].iloc[0]
        tags.append((tag_row['id'], tag_row['name'], tag_row['type']))
    return tags

def get_act_tags_ids(act_id: int) -> list:
    """Return the ids of the act's tags, leaving out every tag whose type
    contains 'geo'.
    """
    non_geo_ids = []
    for tag in get_act_tags(act_id):
        tag_id, tag_type = tag[0], tag[2]
        if 'geo' in tag_type:
            continue
        non_geo_ids.append(tag_id)
    return non_geo_ids

def extract_text_from_html(html: "str | bytes") -> str:
    """Extract text from an act's HTML content,
       removing names of MPs that signed the act.

    The markup is parsed with BeautifulSoup's ``html.parser``. Newlines are
    replaced by spaces, pairs of consecutive spaces are collapsed in a single
    pass (longer runs of spaces are therefore only shortened, not reduced to
    one) and leading/trailing spaces are dropped. Finally every span matching
    ``(<digits and dashes>) «...».`` is removed and the result is stripped.

    :param html: the HTML markup of the act; the caller in this notebook
                 passes the raw bytes read from the zip member
    :return: the cleaned text
    """
    soup = BeautifulSoup(html, 'html.parser')
    stripped = soup.get_text().replace("\n", " ").replace("  ", " ").strip(' ')
    # Greedy match: everything from the first "(...) «" up to the last "»."
    # in the text is dropped.
    text = re.sub(r'(\([\d\-]*\)) «(.*)»\.', '', stripped).strip()
    return text

def nltk_process(text: str, stem: bool = False) -> list:
    """Process a text with NLTK doing the following:
    - splitting into words,
    - converting to lower case
    - removing punctuation
    - dropping tokens that are not purely alphabetic
    - filtering out Italian stop words
    - optionally (``stem=True``) stemming with the Italian Snowball stemmer

    :param text: the text to process
    :param stem: when True the surviving words are stemmed; the default
                 (False) returns the words unstemmed
    :return: the list of processed words, in their original order
    """

    # split into words and convert to lower case
    tokens = [w.lower() for w in word_tokenize(text)]

    # remove punctuation characters from each token
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    stripped = [re_punc.sub('', w) for w in tokens]

    # remove remaining tokens that are not alphabetic
    # (this also drops tokens emptied by the punctuation removal)
    words = [word for word in stripped if word.isalpha()]

    # filter out stop words
    stop_words = set(stopwords.words('italian'))
    words = [w for w in words if w not in stop_words]

    # stemming of words, only on request
    if stem:
        stemmer = SnowballStemmer("italian")
        words = [stemmer.stem(word) for word in words]

    return words

def preprocess_and_save_docs(
    source_zip: str, dest_name: str,
    n_docs: int = 100,
    cache_path='./datasets',
    aws_bucket='opp-datasets',
    aws_region='eu-central-1'
):
    """Pre-process ``n_docs`` random documents out of the ones contained in
    the ``source_zip`` file, pre-cleaning them for serialization onto disk,
    along with labels (tags), and upload the results to S3.

    Documents are sampled without replacement, so no document appears twice;
    when the archive holds fewer than ``n_docs`` members, all of them are used.

    For each sampled document:
    - html is parsed and text content is extracted (beautifulsoup)
    - the id is taken from the part of the member's file name that precedes
      the first ``_``
    - both the data list and the vocab Counter are updated

    Two files are written under ``cache_path`` and then uploaded to the
    ``aws_bucket`` bucket with a ``public-read`` ACL:

    ``<dest_name>.npz`` (compressed ``.npz`` format), holding
      ids:    list of original openparlamento ID (to refer to the original ACT)
      texts:  list of original texts (pre-cleaned)
      labels: list of the assigned labels, one list per document;
              each label is a triple: (ID, name, type)

    ``<dest_name>_vocab.json``, holding
      the complete vocabulary, with occurrence counts for each word

    :param source_zip: path of the zip archive holding the HTML documents
    :param dest_name: base name (no extension) of the files to produce
    :param n_docs: number of documents to sample
    :param cache_path: local directory where the files are written
    :param aws_bucket: name of the destination S3 bucket
    :param aws_region: AWS region used for the client and for the URLs
    :return: a tuple of two ``(url, md5)`` pairs, for the npz and the json
             file respectively
    """
    data = []
    vocab = Counter()

    with zipfile.ZipFile(source_zip) as z:
        members = z.infolist()
        # Sample WITHOUT replacement: random.choices could pick the same
        # document several times and silently duplicate it in the dataset.
        sampled = random.sample(members, k=min(n_docs, len(members)))
        for member in tqdm(sampled):
            # The context manager closes the member even if reading fails.
            with z.open(member.filename) as zf:
                original_html = zf.read()
            text_content = extract_text_from_html(original_html)
            doc_id = int(member.filename.split('_')[0])
            data.append({
                'id': doc_id,
                'text_content': text_content,
                'tags': get_act_tags(doc_id),
            })
            vocab.update(nltk_process(text_content))

    # NOTE(review): AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are read as
    # module-level names that are not defined in the visible cells -- confirm
    # they are set (ideally from the environment, never hardcoded) before
    # calling this function.
    s3 = boto3.client(
        's3',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        region_name=aws_region
    )
    dest_npz = f"{cache_path}/{dest_name}.npz"
    dest_vocab = f"{cache_path}/{dest_name}_vocab.json"

    # Documents carry a variable number of tags, so the labels are stored in
    # an explicit 1-D object array (one list of triples per document) instead
    # of relying on NumPy's implicit handling of ragged nested sequences.
    labels = np.empty(len(data), dtype=object)
    for idx, doc in enumerate(data):
        labels[idx] = doc['tags']

    print(f"Saving data into {dest_npz}")
    np.savez_compressed(
        dest_npz,
        ids=[doc['id'] for doc in data],
        texts=[doc['text_content'] for doc in data],
        labels=labels,
    )
    print(f"Uploading to s3://{aws_bucket}/{dest_name}.npz")
    s3.upload_file(
        dest_npz, aws_bucket, f"{dest_name}.npz",
        ExtraArgs={'ACL': 'public-read'}
    )

    print(f"Saving vocab into {dest_vocab}")
    with open(dest_vocab, 'w') as outfile:
        json.dump(vocab, outfile)
    print(f"Uploading to s3://{aws_bucket}/{dest_name}_vocab.json")
    s3.upload_file(
        dest_vocab, aws_bucket, f"{dest_name}_vocab.json",
        ExtraArgs={'ACL': 'public-read'}
    )

    return (
        (f"https://{aws_bucket}.s3.{aws_region}.amazonaws.com/{dest_name}.npz",
         _hash_file(dest_npz, algorithm='md5')),
        (f"https://{aws_bucket}.s3.{aws_region}.amazonaws.com/{dest_name}_vocab.json",
         _hash_file(dest_vocab, algorithm='md5'))
    )

## Documents pre-processing and saving
``n_docs`` documents, randomly extracted from the zip file, are pre-processed, saved locally (in the cache) and uploaded to S3.

In [58]:
# Build the 'tagged_acts_16' dataset from 100 documents of the zip archive;
# the cell output is the pair of (public URL, MD5 hash) tuples returned by the function.
preprocess_and_save_docs('./opp_data/atti/testi.zip', 'tagged_acts_16', n_docs=100)

100%|██████████| 100/100 [00:09<00:00, 10.50it/s]


Saving data into ./datasets/tagged_acts_16.npz
Uploading to s3://opp-datasets/tagged_acts_16.npz
Saving vocab into ./datasets/tagged_acts_16_vocab.json
Uploading to s3://opp-datasets/tagged_acts_16_vocab.json


(('https://opp-datasets.s3.eu-central-1.amazonaws.com/tagged_acts_16.npz',
  '6bbcb04434449fad1581aeee2e299698'),
 ('https://opp-datasets.s3.eu-central-1.amazonaws.com/tagged_acts_16_vocab.json',
  '7cdab8ecd73d861b5f8e550c6a9e845e'))

### Testing .npz loading and usage
The ``.npz`` file is loaded and a random item is shown

In [65]:
import sys
# Add the parent directory to the import path (presumably where the `op`
# package lives -- confirm).
sys.path.append('..')
from op.datasets import opp_tagged_acts

# The loader's result is unpacked as (train texts, train labels), (test texts, test labels).
(train_texts_raw, train_labels_raw), (test_texts_raw, test_labels_raw) = opp_tagged_acts.load_data_16()
# NOTE(review): `most_common` is called on this object in later cells, so it
# is presumably a collections.Counter -- confirm against the loader.
vocab = opp_tagged_acts.get_vocab()

In [67]:
# Show a random training example: its raw text next to its raw labels.
# The names are the ones unpacked from load_data_16(); the previous
# `train_raw` / `train_labels` are not defined anywhere in the notebook.
i = random.choice(range(len(train_texts_raw)))
train_texts_raw[i], train_labels_raw[i]

("Atto Camera Ordine del Giorno 9/05389/153presentato daRUBINATO Simonettatesto diMartedì 7 agosto 2012, seduta n. 678 La Camera, premesso che: il provvedimento in esame ha un contenuto estremamente vasto e complesso con norme orientate a favorire la riduzione della spesa pubblica; vi è straordinaria necessità e urgenza di provvedere a dare effettività alle norme in materia di limite massimo retributivo per emolumenti o retribuzioni nell'ambito di rapporti di lavoro dipendente o autonomo con le pubbliche amministrazioni statali, e, in generale, di limite al trattamento economico annuo onnicomprensivo di chiunque riceva a carico delle finanze pubbliche emolumenti o retribuzioni nell'ambito di rapporti di lavoro dipendente o autonomo, ivi incluso il personale in regime di diritto pubblico di cui all'articolo 3 del decreto legislativo 30 marzo 2001, n. 165, stabilendo come parametro massimo di riferimento il trattamento economico del primo presidente della Corte di cassazione; a norma del

In [68]:
# The ten most frequent vocabulary terms, with their occurrence counts.
vocab.most_common(10)

[('n', 535),
 ('comma', 511),
 ('legge', 457),
 ('stato', 428),
 ('essere', 297),
 ('ministro', 284),
 ('ministero', 242),
 ('decreto', 218),
 ('caso', 216),
 ('tale', 207)]

In [147]:
# the top n_most_common terms are removed from the vocab and the words
n_most_common = 10
most_common = [k for k, v in vocab.most_common(n_most_common)]
vocab = dict([(k, c) for k,c in vocab.items() if k not in most_common])
for doc in data:
    doc['words'] = " ".join([w for w in doc['words'].split() if w in vocab])

In [45]:
# NOTE(review): `lines` and `labels` are not defined in the visible cells --
# confirm where they come from before running on a fresh kernel.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
# One row per text; 'binary' mode marks the words occurring in the text with 1.
x = tokenizer.texts_to_matrix(lines, mode='binary')
y = np.array(labels)

In [46]:
# Inspect the matrix produced by tokenizer.texts_to_matrix.
x

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 1., 1., 1.]])

In [48]:
# Persist features and labels under the keys 'x' and 'y'; the file is read
# back as './datasets/tagged_acts.npz' in the next cell.
np.savez_compressed('./datasets/tagged_acts', x=x, y=y)

In [49]:
# Read the arrays back. The archive is opened in a `with` block so that the
# underlying file is closed as soon as both arrays have been extracted.
# NOTE: allow_pickle=True unpickles stored objects, which can execute
# arbitrary code -- only load files produced by this notebook or another
# trusted source.
with np.load('./datasets/tagged_acts.npz', allow_pickle=True) as loaded:
    xs = loaded['x']
    labels = loaded['y']

NameError: name '_hash_file' is not defined

In [56]:
# MD5 hash of the tags CSV stored in the local datasets directory.
_hash_file('./datasets/tagged_acts_tags.csv', algorithm='md5')

'db3cfb1c873a3c4cb2fe3022f3e68e3e'

In [11]:
from keras.preprocessing.text import text_to_word_sequence, one_hot, hashing_trick

In [12]:
# Number of distinct words in the first text.
# NOTE(review): `texts` is not defined in the visible cells -- confirm its origin.
size = len(set(text_to_word_sequence(texts[0])))