In [None]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate

import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt

In [None]:
import nltk
# 'punkt' backs word_tokenize (used by preprocess_sentence); the perceptron
# tagger backs nltk.pos_tag (used by noun_verb_extraction). Without the second
# download, pos_tag raises LookupError on a fresh environment.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize

def noun_verb_extraction(text):
    """Return the words of ``text`` that are tagged as nouns or verbs.

    The text is split on whitespace (no further tokenization) and tagged with
    ``nltk.pos_tag``. A word is kept when its tag is one of the Penn Treebank
    noun tags (NN, NNS, NNP, NNPS) or verb tags (VB, VBD, VBG, VBN, VBP, VBZ).

    Args:
        text: Raw string to analyse.

    Returns:
        list: The matching words, in their original order and casing.
    """
    # NNPS (plural proper noun) was absent from the original tag list, so that
    # one noun category was silently dropped.
    noun_verb_tags = {
        "NN", "NNS", "NNP", "NNPS",
        "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
    }
    return [word for word, tag in nltk.pos_tag(text.split()) if tag in noun_verb_tags]

def preprocess_sentence(text):
    """Tokenize and normalise a piece of text.

    Steps, in order: drop characters that cannot be encoded as UTF-8, tokenize
    with NLTK's ``word_tokenize``, lowercase every token, then replace each run
    of digits inside a token with the placeholder ``D``.

    Args:
        text: Raw input string.

    Returns:
        list: The normalised tokens.
    """
    # errors='ignore' has to be applied while *encoding*: that is the step that
    # fails on un-encodable characters (e.g. lone surrogates). The original
    # only ignored errors while decoding, so such input still raised.
    text = text.encode('utf-8', 'ignore').decode('utf-8')
    # Lowercasing happens before the digit substitution, so the uppercase 'D'
    # placeholder is the only uppercase character left in the output.
    return [re.sub(r'\d+', 'D', token.lower()) for token in word_tokenize(text)]

In [None]:
import csv
# Load the filtered dataset into a DataFrame and preview its first rows.
df = pd.read_csv(
    '../data/dataset/dataset_filtered.csv',
    encoding='utf-8',
)
df.head()

Unnamed: 0,title,header,recitals,main_body,attachments,concepts
0,Commission Decision on a modification of the s...,COMMISSION DECISION on a modification of the ...,",\nHaving regard to the Treaty establishing th...","Article 1\nThe Italian draft law No 2395, amen...","Done at Strasbourg, 8 March 1961.\nFor the Com...","['1519', '155', '3071', '3165']"
1,EEC: Regulation No 27 of the Commission: First...,REGULATION No 27 OF THE COMMISSION First Regul...,",\nHaving regard to the provisions of the Trea...",Persons entitled to submit applications and no...,"Done at Brussels, 3 May 1962.\nFor the Commiss...","['2474', '3581', '3821', '539']"
2,EEC: Regulation No 141 of the Council exemptin...,REGULATION No 141 OF THE COUNCIL exempting tra...,",\nHaving regard to the Treaty establishing th...",Regulation No 17 shall not apply to agreements...,"Done at Paris, 26 November 1962.\nFor the Coun...","['1025', '2474', '2494', '3160', '539']"
3,63/9/EEC: Council Decision of 14 May 1962 dete...,COUNCIL DECISION of 14 May 1962 determining ...,",\nHaving regard to Article 2 of the Staff Reg...",The powers conferred by the Staff Regulations ...,"Done at Brussels, 14 May 1962.\nFor the Counci...","['4178', '4271']"
4,Regulation No 99/63/EEC of the Commission of 2...,REGULATION No 99/63/EEC OF THE COMMISSION of ...,",\nHaving regard to the Treaty establishing th...",Before consulting the Advisory Committee on Re...,"Done at Brussels, 25 July 1963.\nFor the Commi...","['2474', '2695', '3821', '5334', '539', '6050']"


In [None]:
from sklearn.preprocessing import OneHotEncoder
import pickle
def load_obj(name):
    """Unpickle and return the object stored in ``<name>.pkl``.

    Args:
        name: Path of the pickle file without the ``.pkl`` extension.

    Returns:
        The unpickled object, or ``False`` when the file does not exist.
    """
    pickle_path = name + '.pkl'
    try:
        # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
        with open(pickle_path, 'rb') as handle:
            loaded = pickle.load(handle)
    except FileNotFoundError:
        return False
    return loaded


# Load the concept mapping; its keys are the concept ids the encoder is fitted on.
# Stored under `concept_dict` rather than `dict` so the builtin is not shadowed
# for the rest of the kernel session.
concept_dict = load_obj('helper_objects/dict_concept_filtered')
if concept_dict is False:
    # load_obj signals a missing file with False; fail here with a clear message
    # instead of an AttributeError on `.keys()`.
    raise FileNotFoundError("helper_objects/dict_concept_filtered.pkl not found")

concepts = np.array(list(concept_dict.keys()))
onehot_encoder = OneHotEncoder()
# The encoder expects a 2-D (n_samples, 1) column of category values.
onehot_encoder.fit(concepts.reshape(-1, 1))

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [None]:
import ast
import csv
# APPROACH 1: All parts of document to one vector

documents_list = []      # one preprocessed, space-joined text per CSV row
documents_concepts = []  # one multi-hot label vector per CSV row

# Width of a label vector = number of categories the encoder was fitted on.
n_concepts = len(onehot_encoder.categories_[0])

with open('../data/dataset/dataset_filtered.csv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter=',')
    header = next(reader)
    print(header)

    for counter, row in enumerate(reader):
        # Model input text = the first three CSV columns joined with spaces.
        combined = " ".join([row[0], row[1], row[2]])
        preprocessed = preprocess_sentence(combined)
        documents_list.append(' '.join(preprocessed))

        # Column 5 holds a Python-literal list of concept ids. literal_eval parses
        # it without executing arbitrary code, unlike eval.
        c = np.array(ast.literal_eval(row[5]))

        if c.size == 0:
            # No concepts for this row: all-zero vector keeps every row the same width.
            concept_vector_to_process = np.zeros(n_concepts)
        else:
            # One one-hot row per concept; the column-wise max is their logical OR
            # and always yields the same (float) dtype regardless of concept count.
            temp = onehot_encoder.transform(c.reshape(-1, 1)).toarray()
            concept_vector_to_process = temp.max(axis=0)
        documents_concepts.append(concept_vector_to_process)

        # Progress message every 500 rows.
        if counter % 500 == 0:
            print("STIGAO DO " + str(counter))

In [None]:
from pandas import DataFrame

print(len(documents_concepts))

# Stack the per-document label vectors into a table: one row per document.
df_labels = DataFrame(documents_concepts)
print(df_labels.shape)

# Every column is treated as a label, so the selection below keeps all of them.
cols = df_labels.columns
label_cols = cols.tolist()
num_labels = len(label_cols)
print('Label columns: ', label_cols)
df_labels = df_labels.loc[:, label_cols]
df_labels.head()


24202
(24202, 1289)
Label columns:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1249,1250,1251,1252,1253,1254,1255,1256,1257,1258,1259,1260,1261,1262,1263,1264,1265,1266,1267,1268,1269,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279,1280,1281,1282,1283,1284,1285,1286,1287,1288
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Plain NumPy matrix of the label table (rows = documents, columns = labels).
labels = df_labels.to_numpy()

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
# Hold out 20% of the documents for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    documents_list,
    labels,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Build the vocabulary from the training texts only. Lowercasing and character
# filtering are switched off here; the texts were normalised earlier.
tokenizer = Tokenizer(num_words=32626, filters="", lower=False)
tokenizer.fit_on_texts(X_train)

# One extra slot on top of the word index leaves room for index 0,
# the default padding value of pad_sequences.
vocab_size = 1 + len(tokenizer.word_index)
maxlen = 727

# Encode each text as integer ids, then pad at the end to exactly `maxlen` ids.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=maxlen, padding='post')
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen, padding='post')

In [None]:
from gensim.models import KeyedVectors

# Load the Law2Vec vectors from their word2vec text format. The 'utf-8-sig'
# codec strips a leading BOM if the file has one. The `with` block closes the
# handle once loading finishes; the original left it open.
with open('Law2Vec/Law2Vec.200d.txt', encoding='utf-8-sig') as f:
    model = KeyedVectors.load_word2vec_format(f, binary=False)

In [None]:
from numpy import array
from numpy import asarray
from numpy import zeros

# Row i of embedding_matrix holds the pre-trained vector of the word whose
# tokenizer index is i; words without a pre-trained vector keep an all-zero row.
embeddings_dictionary = model
exceptions = []  # vocabulary words that have no pre-trained vector
embedding_matrix = zeros((vocab_size, 200))
for word, index in tokenizer.word_index.items():
  try:
    embedding_vector = embeddings_dictionary[word]
    embedding_matrix[index] = embedding_vector
  except KeyError:
    # Only a missing word is expected here. A bare `except:` would also hide
    # real errors (shape mismatch, KeyboardInterrupt, ...).
    exceptions.append(word)
# Number of out-of-vocabulary words.
cnt = len(exceptions)

In [None]:
from tensorflow.keras import layers, metrics
# Architecture: frozen pre-trained embeddings -> single LSTM -> one sigmoid unit per label.
deep_inputs = Input(shape=(maxlen,))
# NOTE(review): the inputs are zero-padded at the end (padding='post') but the
# embedding does not mask index 0, so the LSTM also reads the padding steps.
# Consider mask_zero=True — confirm the effect on results before changing.
embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], trainable=False)(deep_inputs)
LSTM_Layer_1 = LSTM(600, dropout=0.05)(embedding_layer)
dense_layer_1 = Dense(num_labels, activation='sigmoid')(LSTM_Layer_1)
lstmModel = Model(inputs=deep_inputs, outputs=dense_layer_1)
# compile() documents `metrics` as a list, so the single metric is wrapped in one.
# NOTE(review): top_k_categorical_accuracy is designed for single-label targets;
# verify it is the intended metric for these multi-hot labels.
lstmModel.compile(loss='binary_crossentropy', optimizer='adam', metrics=[metrics.top_k_categorical_accuracy])

In [None]:
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
lstmModel.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 727)]             0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 727, 200)          6525400   
_________________________________________________________________
lstm_7 (LSTM)                (None, 600)               1922400   
_________________________________________________________________
dense_6 (Dense)              (None, 1289)              774689    
Total params: 9,222,489
Trainable params: 2,697,089
Non-trainable params: 6,525,400
_________________________________________________________________
None


In [None]:
# Sanity check: show the first encoded training document and its label vector.
print(X_train[0], y_train[0], sep="\n")

In [None]:
# Train the model; validation_split=0.2 sets aside a fifth of the training data
# for validation, and the returned History object is kept in `history`.
history = lstmModel.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=512,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Evaluate on the held-out test set, then collect the raw per-label predictions.
score = lstmModel.evaluate(X_test, y_test, verbose=1)
y_pred = lstmModel.predict(X_test, verbose=1)

# score[0] is the loss; score[1] is the value of the metric the model was compiled with.
for caption, value in zip(("Test Score:", "Test Accuracy:"), score[:2]):
    print(caption, value)

Test Score: 0.021145585924386978
Test Accuracy: 0.06589547544717789


In [None]:
from sklearn.metrics import classification_report

# A label counts as predicted when its score is strictly above this cut-off.
threshold = 0.05

# Row-wise boolean masks (one array per test document) for predictions and ground truth.
pred_bools = list(y_pred > threshold)
true_bools = list(y_test == 1)

# zero_division=0 reports 0 for scores that would otherwise divide by zero.
print(classification_report(true_bools, pred_bools, zero_division=0))
