In [1]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import sys
import scipy

import pickle
import csv
import re
import nltk


In [2]:
# Helper functions

def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    try:
        with open(name + '.pkl', 'rb') as f:
            return pickle.load(f)    
    except FileNotFoundError as e:
        return False;

In [3]:
# Preprocessing as in Law2Vec approach (https://archive.org/details/Law2Vec)
from nltk.tokenize import word_tokenize
# nltk.download('punkt')

def noun_verb_extraction(text):
    pos_tag = nltk.pos_tag(text.split())
    pos_tagged_noun_verb = []
    for word, tag in pos_tag:
        if tag == "NN" or tag == "NNP" or tag == "NNS" or tag == "VB" or tag == "VBD" or tag == "VBG" or tag == "VBN" or tag == "VBP" or tag == "VBZ":
            pos_tagged_noun_verb.append(word)
    return pos_tagged_noun_verb

def preprocess_sentence(text):
#     text = re.sub(r'[^a-zA-Z0-9\-\'\s]', '', str(text))
#     text = re.sub(r'\'', ' \'', text)
#     text = re.sub(r'\d+', '', text)
    text = word_tokenize(text)
#     text = noun_verb_extraction(text)
    text = [ word.lower() for word in text] # word.lower() if word != 'D' else word for word in text.split()
    text = [ re.sub(r'\d', 'D', word) for word in text]
    return text

In [4]:
# APPROACH 1: All parts of document to one vector

concepts = []      
dict = load_obj('helper_objects/dict_concept_filtered')
concepts = np.array(list(dict.keys()))
concepts = np.array(concepts)
onehot_encoder = OneHotEncoder()
onehot_encoder.fit(concepts.reshape(len(concepts), 1))

documents_list = []
documents_concepts = [] 

with open('../data/dataset/dataset_filtered.csv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter=',')
    header = next(reader)
    print(header)
    counter = 0
    for row in reader:
#         combined = "\n".join([row[0], row[1], row[2], row[3], row[4]])
        combined = "\n".join([row[0], row[1], row[2]])
#         preprocessed = list(set(preprocess_sentence(combined)))
        preprocessed = preprocess_sentence(combined)
        documents_list.append(' '.join(preprocessed))
        c = np.array(eval(row[5]))
        # Combine vectorized concepts to one array.
        concept_vector_to_process = np.array([])
        temp = onehot_encoder.transform(c.reshape(len(c), 1)).toarray()
        for concept_vector in temp:
            concept_vector_to_process = concept_vector.astype(int) | concept_vector_to_process.astype(int) if len(concept_vector_to_process) > 0 else concept_vector
        documents_concepts.append(concept_vector_to_process)
        
        if counter % 500 == 0:
            print("STIGAO DO " + str(counter))
        counter = counter + 1


['title', 'header', 'recitals', 'main_body', 'attachments', 'concepts']
STIGAO DO 0
STIGAO DO 500
STIGAO DO 1000
STIGAO DO 1500
STIGAO DO 2000
STIGAO DO 2500
STIGAO DO 3000
STIGAO DO 3500
STIGAO DO 4000
STIGAO DO 4500
STIGAO DO 5000
STIGAO DO 5500
STIGAO DO 6000
STIGAO DO 6500
STIGAO DO 7000
STIGAO DO 7500
STIGAO DO 8000
STIGAO DO 8500
STIGAO DO 9000
STIGAO DO 9500
STIGAO DO 10000
STIGAO DO 10500
STIGAO DO 11000
STIGAO DO 11500
STIGAO DO 12000
STIGAO DO 12500
STIGAO DO 13000
STIGAO DO 13500
STIGAO DO 14000
STIGAO DO 14500
STIGAO DO 15000
STIGAO DO 15500
STIGAO DO 16000
STIGAO DO 16500
STIGAO DO 17000
STIGAO DO 17500
STIGAO DO 18000
STIGAO DO 18500
STIGAO DO 19000
STIGAO DO 19500
STIGAO DO 20000
STIGAO DO 20500
STIGAO DO 21000
STIGAO DO 21500
STIGAO DO 22000
STIGAO DO 22500
STIGAO DO 23000
STIGAO DO 23500
STIGAO DO 24000


In [14]:
# APPROACH 2: Each part of document in separate vector

concepts = []      
dict = load_obj('helper_objects/dict_concept_filtered')
concepts = np.array(list(dict.keys()))
concepts = np.array(concepts)
onehot_encoder = OneHotEncoder()
onehot_encoder.fit(concepts.reshape(len(concepts), 1))

documents_list = []
documents_concepts = [] 

with open('../data/dataset/dataset_filtered.csv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter=',')
    header = next(reader)
    print(header)
    counter = 0
    for row in reader:
        doc_parts = []
#         doc_parts.append(' '.join(list(set(preprocess_sentence(row[0])))))
#         doc_parts.append(' '.join(list(set(preprocess_sentence(row[1])))))
        doc_parts.append(' '.join(preprocess_sentence(row[0])))
        doc_parts.append(' '.join(preprocess_sentence(row[1])))
        doc_parts.append(' '.join(preprocess_sentence(row[2])))

        documents_list.append(doc_parts)
        c = np.array(eval(row[5]))
        # Combine vectorized concepts to one array.
        concept_vector_to_process = np.array([])
        temp = onehot_encoder.transform(c.reshape(len(c), 1)).toarray()
        for concept_vector in temp:
            concept_vector_to_process = concept_vector.astype(int) | concept_vector_to_process.astype(int) if len(concept_vector_to_process) > 0 else concept_vector
        documents_concepts.append(concept_vector_to_process)
        if counter % 500 == 0:
            print("STIGAO DO " + str(counter))
        counter = counter + 1


['title', 'header', 'recitals', 'main_body', 'attachments', 'concepts']
STIGAO DO 0
STIGAO DO 500
STIGAO DO 1000
STIGAO DO 1500
STIGAO DO 2000
STIGAO DO 2500
STIGAO DO 3000
STIGAO DO 3500
STIGAO DO 4000
STIGAO DO 4500
STIGAO DO 5000
STIGAO DO 5500
STIGAO DO 6000
STIGAO DO 6500
STIGAO DO 7000
STIGAO DO 7500
STIGAO DO 8000
STIGAO DO 8500
STIGAO DO 9000
STIGAO DO 9500
STIGAO DO 10000
STIGAO DO 10500
STIGAO DO 11000
STIGAO DO 11500
STIGAO DO 12000
STIGAO DO 12500
STIGAO DO 13000
STIGAO DO 13500
STIGAO DO 14000
STIGAO DO 14500
STIGAO DO 15000
STIGAO DO 15500
STIGAO DO 16000
STIGAO DO 16500
STIGAO DO 17000
STIGAO DO 17500
STIGAO DO 18000
STIGAO DO 18500
STIGAO DO 19000
STIGAO DO 19500
STIGAO DO 20000
STIGAO DO 20500
STIGAO DO 21000
STIGAO DO 21500
STIGAO DO 22000
STIGAO DO 22500
STIGAO DO 23000
STIGAO DO 23500
STIGAO DO 24000


In [5]:
# TFIDF vectorization needed in 2 aproaches

from sklearn.feature_extraction.text import TfidfVectorizer 

print("TFIDF poceo")
tfidf_vectorizer=TfidfVectorizer() 
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(documents_list)
print("TFIDF gotov")

TFIDF poceo
TFIDF gotov


In [7]:
# Law2Vec word embeddings with 200 dimensions

from gensim.models import KeyedVectors

f = open('Law2Vec/Law2Vec.200d.txt', encoding='utf-8-sig')
model = KeyedVectors.load_word2vec_format(f, binary=False)

In [11]:
# APPROACH 1 + TFIDF

word2vec_vectors = []
counter = 0
for i, words in enumerate(documents_list):
    if counter % 500 == 0:
        print("STIGAO DO " + str(counter))
    counter = counter + 1
    vectorized_sentence = 0
    word_list = words.split()
    temp = tfidf_vectorizer_vectors[i].toarray()[0]
    for word in word_list:
        try:
            vectorized_sentence = vectorized_sentence + ( model[word] * temp[tfidf_vectorizer.vocabulary_[word]] )
        except:
            continue
    word2vec_vectors.append(vectorized_sentence)

STIGAO DO 0
STIGAO DO 500
STIGAO DO 1000
STIGAO DO 1500
STIGAO DO 2000
STIGAO DO 2500
STIGAO DO 3000
STIGAO DO 3500
STIGAO DO 4000
STIGAO DO 4500
STIGAO DO 5000
STIGAO DO 5500
STIGAO DO 6000
STIGAO DO 6500
STIGAO DO 7000
STIGAO DO 7500
STIGAO DO 8000
STIGAO DO 8500
STIGAO DO 9000
STIGAO DO 9500
STIGAO DO 10000
STIGAO DO 10500
STIGAO DO 11000
STIGAO DO 11500
STIGAO DO 12000
STIGAO DO 12500
STIGAO DO 13000
STIGAO DO 13500
STIGAO DO 14000
STIGAO DO 14500
STIGAO DO 15000
STIGAO DO 15500
STIGAO DO 16000
STIGAO DO 16500
STIGAO DO 17000
STIGAO DO 17500
STIGAO DO 18000
STIGAO DO 18500
STIGAO DO 19000
STIGAO DO 19500
STIGAO DO 20000
STIGAO DO 20500
STIGAO DO 21000
STIGAO DO 21500
STIGAO DO 22000
STIGAO DO 22500
STIGAO DO 23000
STIGAO DO 23500
STIGAO DO 24000


In [15]:
# APPROACH 2 + TFIDF

word2vec_vectors = []

counter = 0
for i, doc in enumerate(documents_list):
    if counter % 500 == 0:
        print("STIGAO DO " + str(counter))
    counter = counter + 1
    doc_parts = []
    temp = tfidf_vectorizer_vectors[i].toarray()[0]
    for part in doc:
        vectorized_sentence = np.zeros(200) 
        word_list = part.split()
        for word in word_list:
            try:
                vectorized_sentence = vectorized_sentence + ( model[word] * temp[tfidf_vectorizer.vocabulary_[word]] )
            except:
                continue
        try:
            doc_parts.append(vectorized_sentence)
        except:
            continue
    word2vec_vectors.append(np.asarray(doc_parts).flatten())


STIGAO DO 0
STIGAO DO 500
STIGAO DO 1000
STIGAO DO 1500
STIGAO DO 2000
STIGAO DO 2500
STIGAO DO 3000
STIGAO DO 3500
STIGAO DO 4000
STIGAO DO 4500
STIGAO DO 5000
STIGAO DO 5500
STIGAO DO 6000
STIGAO DO 6500
STIGAO DO 7000
STIGAO DO 7500
STIGAO DO 8000
STIGAO DO 8500
STIGAO DO 9000
STIGAO DO 9500
STIGAO DO 10000
STIGAO DO 10500
STIGAO DO 11000
STIGAO DO 11500
STIGAO DO 12000
STIGAO DO 12500
STIGAO DO 13000
STIGAO DO 13500
STIGAO DO 14000
STIGAO DO 14500
STIGAO DO 15000
STIGAO DO 15500
STIGAO DO 16000
STIGAO DO 16500
STIGAO DO 17000
STIGAO DO 17500
STIGAO DO 18000
STIGAO DO 18500
STIGAO DO 19000
STIGAO DO 19500
STIGAO DO 20000
STIGAO DO 20500
STIGAO DO 21000
STIGAO DO 21500
STIGAO DO 22000
STIGAO DO 22500
STIGAO DO 23000
STIGAO DO 23500
STIGAO DO 24000


In [8]:
# APPROACH 1: All parts of document to one vector

word2vec_vectors = []

counter = 0
for i, words in enumerate(documents_list):
    if counter % 500 == 0:
        print("STIGAO DO " + str(counter))
    counter = counter + 1
    vectorized_sentence = 0
    word_list = words.split()
    for word in word_list:
        try:
            vectorized_sentence = vectorized_sentence + ( model[word] / len(word_list) )
        except:
            continue
    word2vec_vectors.append(vectorized_sentence)

STIGAO DO 0
STIGAO DO 500
STIGAO DO 1000
STIGAO DO 1500
STIGAO DO 2000
STIGAO DO 2500
STIGAO DO 3000
STIGAO DO 3500
STIGAO DO 4000
STIGAO DO 4500
STIGAO DO 5000
STIGAO DO 5500
STIGAO DO 6000
STIGAO DO 6500
STIGAO DO 7000
STIGAO DO 7500
STIGAO DO 8000
STIGAO DO 8500
STIGAO DO 9000
STIGAO DO 9500
STIGAO DO 10000
STIGAO DO 10500
STIGAO DO 11000
STIGAO DO 11500
STIGAO DO 12000
STIGAO DO 12500
STIGAO DO 13000
STIGAO DO 13500
STIGAO DO 14000
STIGAO DO 14500
STIGAO DO 15000
STIGAO DO 15500
STIGAO DO 16000
STIGAO DO 16500
STIGAO DO 17000
STIGAO DO 17500
STIGAO DO 18000
STIGAO DO 18500
STIGAO DO 19000
STIGAO DO 19500
STIGAO DO 20000
STIGAO DO 20500
STIGAO DO 21000
STIGAO DO 21500
STIGAO DO 22000
STIGAO DO 22500
STIGAO DO 23000
STIGAO DO 23500
STIGAO DO 24000


In [18]:
# APPROACH 2: Each part of document in separate vector

word2vec_vectors = []

counter = 0
for i, doc in enumerate(documents_list):
    if counter % 500 == 0:
        print("STIGAO DO " + str(counter))
    counter = counter + 1
    doc_parts = []
    for part in doc:
        vectorized_sentence = np.zeros(200) 
        word_list = part.split()
        for word in word_list:
            try:
                vectorized_sentence = vectorized_sentence + ( model[word] / len(word_list) )
            except:
                continue
        try:
            doc_parts.append(vectorized_sentence)
        except:
            continue
    word2vec_vectors.append(np.asarray(doc_parts).flatten())

STIGAO DO 0
STIGAO DO 500
STIGAO DO 1000
STIGAO DO 1500
STIGAO DO 2000
STIGAO DO 2500
STIGAO DO 3000
STIGAO DO 3500
STIGAO DO 4000
STIGAO DO 4500
STIGAO DO 5000
STIGAO DO 5500
STIGAO DO 6000
STIGAO DO 6500
STIGAO DO 7000
STIGAO DO 7500
STIGAO DO 8000
STIGAO DO 8500
STIGAO DO 9000
STIGAO DO 9500
STIGAO DO 10000
STIGAO DO 10500
STIGAO DO 11000
STIGAO DO 11500
STIGAO DO 12000
STIGAO DO 12500
STIGAO DO 13000
STIGAO DO 13500
STIGAO DO 14000
STIGAO DO 14500
STIGAO DO 15000
STIGAO DO 15500
STIGAO DO 16000
STIGAO DO 16500
STIGAO DO 17000
STIGAO DO 17500
STIGAO DO 18000
STIGAO DO 18500
STIGAO DO 19000
STIGAO DO 19500
STIGAO DO 20000
STIGAO DO 20500
STIGAO DO 21000
STIGAO DO 21500
STIGAO DO 22000
STIGAO DO 22500
STIGAO DO 23000
STIGAO DO 23500
STIGAO DO 24000


In [19]:
print(type(word2vec_vectors))
print(type(scipy.sparse.csr_matrix(word2vec_vectors)))


<class 'list'>
<class 'scipy.sparse.csr.csr_matrix'>


In [20]:
# Saving vectors

scipy.sparse.save_npz("vectors/vectors_word2vec_app2_title_header_recitals.npz", scipy.sparse.csr_matrix(word2vec_vectors))
scipy.sparse.save_npz("vectors/concepts_word2vec_app2_title_header_recitals.npz", scipy.sparse.csr_matrix(documents_concepts))

In [2]:
# Test if keras using gpu (it does automatically if device_type: "GPU" is listed as one of devices) - CUDA drivers are needed, tensorflow_gpu also
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 4671833754815799383
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3136579175
locality {
  bus_id: 1
  links {
  }
}
incarnation: 12100942076577396788
physical_device_desc: "device: 0, name: GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


In [28]:
# Data oversampling
# Results of it are huge and impossible to use, ~20gb of graphic memory needed for training on such data

# import SMOTE module from imblearn library 
# pip install imblearn (if you don't have imblearn in your system) 
from skmultilearn.problem_transform import LabelPowerset
from imblearn.over_sampling import RandomOverSampler

# Import a dataset with X and multi-label y
lp = LabelPowerset()
ros = RandomOverSampler() # sampling_strategy='minority'

# Applies the above stated multi-label (ML) to multi-class (MC) transformation.
yt = lp.transform(y_train.todense())

X_resampled, y_resampled = ros.fit_resample(X_train.todense(), yt)

# Inverts the ML-MC transformation to recreate the ML set
y_resampled = lp.inverse_transform(y_resampled)

  
print('After OverSampling, the shape of train_X: {}'.format(X_resampled.shape)) 
print('After OverSampling, the shape of train_y: {} \n'.format(y_resampled.shape)) 


After OverSampling, the shape of train_X: (19526, 600)
After OverSampling, the shape of train_y: (19526, 1289) 



In [50]:
from sklearn.model_selection import train_test_split
import scipy

X = scipy.sparse.load_npz("vectors/vectors_word2vec_app2_title_header_recitals.npz")
y = scipy.sparse.load_npz("vectors/concepts_word2vec_app2_title_header_recitals.npz")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)


(19361, 600)


In [46]:
# NEURAL NETWORK RELATED STUFF
# Research other loss and metric functions

import keras.backend.tensorflow_backend as tfb
from tensorflow.keras import layers, metrics
import tensorflow as tf

batch_size = 64
epochs = 20

POS_WEIGHT = 1.7  # multiplier for positive targets, needs to be tuned

def weighted_binary_crossentropy(target, output):
    """
    Weighted binary crossentropy between an output tensor 
    and a target tensor. POS_WEIGHT is used as a multiplier 
    for the positive targets.

    Combination of the following functions:
    * keras.losses.binary_crossentropy
    * keras.backend.tensorflow_backend.binary_crossentropy
    * tf.nn.weighted_cross_entropy_with_logits
    """
    ## NEEDED WHEN OVERSAMPLING 
    target = tf.cast(target, tf.float32)

    # transform back to logits
    _epsilon = tfb._to_tensor(tfb.epsilon(), output.dtype.base_dtype)
    output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
    output = tf.math.log(output / (1 - output))
    # compute weighted loss
    loss = tf.compat.v1.nn.weighted_cross_entropy_with_logits(labels=target,
                                                    logits=output,
                                                    pos_weight=POS_WEIGHT)
    return tf.reduce_mean(loss, axis=-1)

def define_model_architecture(input_dimension, output_dimension):
    model = tf.keras.Sequential()
    model.add(layers.Dense(units=1024, input_shape=(input_dimension,)))
    model.add(layers.Activation('selu'))
#     model.add(layers.Dropout(0.1))
#     model.add(layers.Dense(units=516))
#     model.add(layers.BatchNormalization())
#     model.add(layers.Activation('relu'))
#     model.add(layers.Dropout(0.1))
#     model.add(layers.Dense(units=256))
#     model.add(layers.BatchNormalization())
#     model.add(layers.Activation('relu'))
#     model.add(layers.Dropout(0.05))
    model.add(layers.Dense(output_dimension, activation='sigmoid'))
    model.compile(loss=weighted_binary_crossentropy, optimizer="adam", #'binary_crossentropy'
                  metrics=[metrics.top_k_categorical_accuracy])
    return model

In [47]:
nn_model = define_model_architecture(X_train.shape[1], y_train.shape[1])
print("DEFINED")
history = nn_model.fit(X_train.todense(), y_train.todense(), validation_split=0.13, epochs=epochs)
print("TRAINED")

DEFINED
Train on 16844 samples, validate on 2517 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
TRAINED


In [48]:
import numpy as np
predict = nn_model.predict(X_test[61].todense())
t = predict > 0.5
print(np.where(t))
print(y_test[61])

(array([0, 0, 0, 0, 0], dtype=int64), array([ 391,  774,  788,  803, 1274], dtype=int64))
  (0, 788)	1.0
  (0, 1096)	1.0
  (0, 1274)	1.0


In [49]:
import numpy
from sklearn.metrics import classification_report
y_pred = nn_model.predict(X_test.todense(), batch_size=64, verbose=1)
y_pred = (y_pred > 0.5)
print("Testing documents: " + str(X_test.shape[0]))
print("Concepts in test data: " + str(len(numpy.unique(numpy.ravel(numpy.argmax((y_test.todense() > 0.5) , axis=1))))))
y_pred_bool = numpy.argmax(y_pred, axis=1)
print("Concepts predicted: " + str(len(numpy.unique(y_pred_bool))))

print(classification_report(y_test, y_pred, zero_division=0)) # , labels=numpy.unique(y_pred_bool)

Testing documents: 4841
Concepts in test data: 634
Concepts predicted: 549
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.41      0.44      0.42        68
           2       0.00      0.00      0.00         6
           3       0.00      0.00      0.00         1
           4       0.67      1.00      0.80         2
           5       0.00      0.00      0.00         4
           6       0.58      0.27      0.37        41
           7       0.50      0.14      0.22         7
           8       0.50      0.25      0.33        24
           9       0.50      0.33      0.40         6
          10       0.00      0.00      0.00         4
          11       0.93      0.96      0.94        26
          12       0.89      0.73      0.80        11
          13       0.67      0.67      0.67         3
          14       0.94      0.71      0.81        24
          15       0.73      0.59      0.65        27
      