### Libraries

In [2]:
import tensorflow
from tensorflow import keras 
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.layers import LSTM, GRU, Bidirectional
from tensorflow.keras.optimizers import SGD, RMSprop, Adadelta, Adam
from tensorflow.keras.preprocessing import text, sequence
import numpy as np
import pandas as pd
import io
import csv
from sklearn import preprocessing, metrics
from sklearn.preprocessing import MultiLabelBinarizer
import re
import nltk
from nltk.corpus import stopwords
# Stopwords removed from the text by load_patents_text; stored as a set so that
# each per-word membership test is a hash lookup.
# NOTE(review): "english-v2-uspto-sklearn" does not look like one of the stock
# NLTK stopword file ids - presumably a custom list installed in the local
# nltk_data stopwords corpus directory; confirm it exists before running.
stopwords_dict=set(stopwords.words("english-v2-uspto-sklearn"))

### Helper functions

In [3]:
def load_patents_text(part, number_of_words, file1):
    """Load one textual field and the labels of the dataset from a csv file.

    Args:
        part: the textual field to expose as the 'text' column:
            1 = title (csv column 3), 2 = header (column 4),
            3 = recitals (column 5), 4 = main_body (column 6),
            5 = attachments (column 7).
        number_of_words: how many leading words of the cleaned text to keep.
        file1: path of the header-less csv file holding all textual fields
            and the labels (csv column 8).

    Returns:
        The DataFrame read from ``file1`` with column 8 renamed to 'labels'
        and the selected field renamed to 'text'. The text has every
        character outside a-z replaced by a space, the stopwords removed and
        is truncated to its first ``number_of_words`` words.

    Raises:
        ValueError: if ``part`` is not one of 1..5. Checked before the file
            is read, so a bad argument fails fast instead of surfacing later
            as ``KeyError: 'text'``.
    """
    text_column_by_part = {1: 3, 2: 4, 3: 5, 4: 6, 5: 7}
    if part not in text_column_by_part:
        raise ValueError(
            "part must be one of %s, got %r" % (sorted(text_column_by_part), part)
        )

    trainDF = pd.read_csv(file1, header=None)
    trainDF = trainDF.rename(columns={8: 'labels', text_column_by_part[part]: 'text'})

    # replace the na rows with "" otherwise the string operations below fail
    trainDF['text'] = trainDF['text'].fillna("")
    # delete all symbols except for a-z
    # NOTE(review): no lower-casing happens first, so upper-case letters are
    # blanked out as well; kept as is because published results may depend on
    # it - confirm whether the input is already lower-cased.
    trainDF['text'] = trainDF['text'].replace('[^a-z]', ' ', regex=True)

    def _drop_stopwords_and_truncate(document):
        # Stopwords are removed before truncating, so the kept words are the
        # first `number_of_words` non-stopwords.
        kept = [word for word in document.split() if word not in stopwords_dict]
        return ' '.join(kept[:number_of_words])

    trainDF['text'] = trainDF['text'].apply(_drop_stopwords_and_truncate)
    print("The data has been loaded! \n")

    return trainDF

def encode_multilabels(trainDF, trainDF_test):
    """Encode the comma-separated labels of both datasets as indicator rows.

    A single MultiLabelBinarizer is fitted on the train and test labels
    together, so both encodings share the same columns.

    Args:
        trainDF: the train dataset, with a 'labels' column of comma-separated
            label strings.
        trainDF_test: the test dataset, with the same 'labels' column.

    Returns:
        A tuple ``(encoder, encoded_train, encoded_test)``: the fitted
        MultiLabelBinarizer and the encoded rows of the train and of the test
        dataset, in their original order.
    """
    train_label_lists = trainDF['labels'].str.split(',')
    test_label_lists = trainDF_test['labels'].str.split(',')

    # train rows first, test rows after them
    all_label_lists = pd.concat([train_label_lists, test_label_lists])
    n_train = train_label_lists.shape[0]
    n_test = test_label_lists.shape[0]
    n_all = all_label_lists.shape[0]
    print("The number of train labels, test labels and all labels are:", n_train, n_test, n_all)

    multihop_encoder = MultiLabelBinarizer()
    encoded_all = multihop_encoder.fit_transform(all_label_lists)

    # undo the concatenation: the first n_train rows belong to the train set
    encoded_train = encoded_all[:n_train, :]
    encoded_test = encoded_all[n_train:n_all, :]
    print("The labels have been encoded! \n")

    return multihop_encoder, encoded_train, encoded_test

def enumarate_codes(onehot_encoded):
    """Print and return the total number of labels.

    The number of labels is the size of the second axis (the columns) of the
    encoded label matrix ``onehot_encoded``.
    """
    number_of_codes = np.shape(onehot_encoded)[1]
    print("Number of labels: ", number_of_codes, "\n")

    return number_of_codes

def tokenize_text(trainDF):
    """Fit a Keras Tokenizer on the 'text' column of ``trainDF``.

    Returns:
        A tuple ``(tokenizer, vocabulary)`` with the fitted tokenizer and its
        ``word_index`` mapping; the vocabulary size is printed as well.
    """
    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(trainDF['text'])
    vocabulary = tokenizer.word_index
    print('Number of unique words:',len(vocabulary), "\n")

    return tokenizer, vocabulary

def convert_text(number_of_words, token, x):
    """Convert texts to padded sequences of token ids.

    Args:
        number_of_words: the length every sequence is padded/truncated to.
        token: a fitted tokenizer providing ``texts_to_sequences``.
        x: the texts to convert.

    Returns:
        The result of ``pad_sequences`` on the token sequences, so that every
        row has the same length ``number_of_words``.
    """
    token_sequences = token.texts_to_sequences(x)
    seq_x = sequence.pad_sequences(token_sequences, maxlen=number_of_words)
    print("The text has been converted to tokens! \n")

    return seq_x

def load_language_model(fname):
    """Load word embeddings from a whitespace-separated text file.

    Every non-blank line is expected to hold a word followed by the
    components of its vector (the layout of e.g. the GloVe text files).

    Args:
        fname: path of the UTF-8 encoded embeddings file.

    Returns:
        A dict mapping each word to its vector as a float32 numpy array.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    embeddings_dict = {}
    with open(fname, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # blank line (e.g. a trailing newline at the end of the
                # file): nothing to parse, and values[0] would raise
                continue
            word, *components = values
            embeddings_dict[word] = np.asarray(components, "float32")
    print("The word embeddings have been loaded! \n")

    return embeddings_dict

def create_embedding_matrix(embeddings_index, word_index, x):
    """Build the token-embedding matrix for a vocabulary.

    Args:
        embeddings_index: dict mapping a word to its embedding vector.
        word_index: dict mapping a word to its integer row index.
        x: the dimension of the embedding vectors.

    Returns:
        A tuple ``(matrix, rows)``. ``matrix`` has ``len(word_index) + 1``
        rows of ``x`` columns; row ``i`` holds the embedding of the word with
        index ``i`` and stays all-zero when the word has no embedding.
        ``rows`` is that number of rows.
    """
    # one extra row - presumably because the tokenizer indices start at 1 and
    # row 0 is left for padding; verify against the tokenizer used
    vocabulary_rows = len(word_index) + 1
    matrix = np.zeros((vocabulary_rows, x))

    for token, row in word_index.items():
        vector = embeddings_index.get(token)
        if vector is None:
            # out-of-vocabulary word: keep the zero row
            continue
        matrix[row] = vector

    return matrix, vocabulary_rows


def create_bidirectional_lstm_probabilities_kullback(maxlen, num_words, number_of_codes, embedding_matrix):
    """Create and compile the Bi-LSTM classification model trained with KL loss.

    Args:
        maxlen: the length of the input token sequences.
        num_words: the number of rows of the embedding layer.
        number_of_codes: the number of labels, i.e. the size of the softmax
            output.
        embedding_matrix: the pre-trained embedding weights; its second
            dimension sets the embedding size. The weights are frozen.

    Returns:
        The compiled Keras model (its summary is printed as a side effect).
    """
    embedding_dim = embedding_matrix.shape[1]

    # Input -> frozen pre-trained Embedding -> SpatialDropout1D
    token_ids = layers.Input((maxlen, ))
    embedded = layers.Embedding(num_words, embedding_dim, weights=[embedding_matrix], trainable=False)(token_ids)
    embedded = layers.SpatialDropout1D(0.1)(embedded)

    # Bidirectional wrapper around a 100-unit LSTM
    recurrent_cell = layers.LSTM(100, recurrent_dropout=0.1, dropout=0.1)
    encoded = layers.Bidirectional(recurrent_cell)(embedded)

    # Softmax output: one probability per label
    label_distribution = layers.Dense(number_of_codes, activation="softmax")(encoded)

    # Assemble and compile the model
    model = models.Model(inputs=token_ids, outputs=label_distribution)
    model.compile(optimizer='Adam', loss='kullback_leibler_divergence', metrics=['accuracy'])

    model.summary()

    return model

def make_predictions(test_seq_x, test_y, classifier):
    """Run the classifier on the test data.

    Args:
        test_seq_x: the test inputs passed to ``classifier.predict``.
        test_y: the encoded test labels, one row per sample.
        classifier: an object with a ``predict`` method.

    Returns:
        A tuple ``(predictions, prediction, y_true)``: the raw output of
        ``predict``, the index of the highest score of each row, and the
        index of the first maximum of each row of ``test_y``.
    """
    scores = classifier.predict(test_seq_x)
    best_scored_label = np.argmax(scores, axis=-1)
    first_true_label = np.argmax(test_y, axis=-1)
    print('The predictions on test data have been calculated! \n')

    return scores, best_scored_label, first_true_label
                                                                                            
def kill_model():
    """Reset the global Keras state before a new model is built.

    The previous version referenced a name ``K`` that this file never
    imports and tried to ``del`` a local ``model`` that was never bound, so
    the ``except`` branch always ran and the session was never cleared. The
    backend is now imported here, and the ``del`` is dropped because a
    function cannot delete a variable of its caller; callers release the old
    model simply by rebinding their own variable.
    """
    # imported outside the try block so that a broken installation is not
    # reported as "No model to clear"
    from tensorflow.keras import backend as K

    try:
        K.clear_session()
    except Exception:
        # best effort, as before: failing to clear must not stop the run
        print('No model to clear \n')

### Special functions 

In [4]:
def calculate_probabilities(multihop_encoded_train):
    """Transform the multilabel encoding into probabilities.

    Every entry equal to 1 is replaced by ``1 / (sum of its row)``, every
    other entry becomes 0, e.g. ``1 1 0 0 -> 0.5 0.5 0 0``. Rows without any
    label stay all-zero.

    Args:
        multihop_encoded_train: 2-D indicator matrix (numpy array or nested
            sequence), one row per sample and one column per label.

    Returns:
        A float numpy array of the same shape holding the probabilities.
    """
    encoded = np.asarray(multihop_encoded_train)
    labels_per_row = encoded.sum(axis=1, keepdims=True)

    probabilities = np.zeros(encoded.shape)
    # Vectorised replacement of the former cell-by-cell Python loop. The
    # division is only evaluated where the entry is 1, so all-zero rows never
    # divide by zero.
    np.divide(1.0, labels_per_row, out=probabilities, where=(encoded == 1))
    print('The probabilities have been calculated! \n')

    return probabilities

In [5]:
def calculate_metrix(number_of_test_samples, number_of_codes, predictions_kull, nn, multihop_encoded_test):
    """Print the micro-averaged precision, recall and f1 score at a cut-off.

    For every test sample the ``nn`` labels with the highest predicted score
    are marked as predicted; the resulting indicator matrix is compared with
    ``multihop_encoded_test`` and the three metrics are printed as
    percentages, in the order precision, recall, f1.

    Args:
        number_of_test_samples: the number of rows to evaluate.
        number_of_codes: the number of labels (columns).
        predictions_kull: the predicted scores, one row per test sample.
        nn: how many top-scored labels count as predicted per sample.
        multihop_encoded_test: the true label indicator matrix.

    Returns:
        None.
    """
    # The reversed slice [:-stop:-1] yields the last (stop - 1) entries, so
    # the stop index has to be one larger than the number of labels wanted.
    stop = nn + 1

    pred_class_kull = np.empty((number_of_test_samples, number_of_codes))
    for sample in range(number_of_test_samples):
        ranked_ascending = np.argsort(predictions_kull[sample])
        top_labels = ranked_ascending[:-stop:-1]

        indicator_row = np.zeros(number_of_codes)
        indicator_row[top_labels] = 1
        pred_class_kull[sample, :] = indicator_row

    for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score):
        print(score(multihop_encoded_test, pred_class_kull, average='micro')*100)

    return None

### Main code

In [6]:
# Train and evaluate one Bi-LSTM classifier per textual part of the documents.
number_of_words = [100]
epochs = 30
batch_size = 128
parts=[1, 2, 3, 4, 5] #1: title, 2: header, 3: recitals, 4: main_body, 5: attachments

train_filename="/datasets/eurlex57k_train_multilabel.csv"
test_filename="/datasets/eurlex57k_test_multilabel.csv"

# load the glove language model once, before the loops: it does not depend on
# the part, and loading it only when part==1 broke any run whose `parts` list
# did not start with 1
embeddings_index = load_language_model('/embeddings/glove.6B.300d.txt')

for words in number_of_words:
    for part in parts:

        #load train data
        train_DF=load_patents_text(part, words, train_filename)
        #load test data
        test_DF=load_patents_text(part, words, test_filename)

        multihop_encoder, multihop_encoded_train, multihop_encoded_test=encode_multilabels(train_DF, test_DF)
        number_of_codes=enumarate_codes(multihop_encoded_train)
        # the tokenizer is fitted on train and test text together;
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed
        token_p1, word_index_p1=tokenize_text(pd.concat([train_DF, test_DF]))

        train_seq_x_p1 =convert_text(words, token_p1, train_DF['text'])
        test_seq_x_p1 =convert_text(words, token_p1, test_DF['text'])

        # training targets: the label indicators turned into probabilities
        a=calculate_probabilities(multihop_encoded_train)

        embedding_matrix_p1, num_words_p1 =create_embedding_matrix(embeddings_index, word_index_p1, 300)

        kill_model()
        classifier = create_bidirectional_lstm_probabilities_kullback(words, num_words_p1, number_of_codes, embedding_matrix_p1)
        history=classifier.fit(train_seq_x_p1, a, epochs=epochs, batch_size=batch_size, verbose=1)

        #Save the trained classifier
        #classifier.save("eurlex_part"+str(part)+"_30epoch")

        predictions_kull, prediction_kull, y_true_kull=make_predictions(test_seq_x_p1, multihop_encoded_test, classifier)

        number_of_test_data=np.shape(test_seq_x_p1)[0]

        print("calculate the P@1, R@1 and F1@1")
        calculate_metrix(number_of_test_data, number_of_codes, predictions_kull, 1, multihop_encoded_test)
        print("calculate the P@5, R@5 and F1@5")
        calculate_metrix(number_of_test_data, number_of_codes, predictions_kull, 5, multihop_encoded_test)

        # NOTE: the two commented-out sections below write the csv files that
        # the ensemble cells further down in this file read back.

        #Save the final predictions
        #df=pd.DataFrame(predictions_kull)
        #df.sort_values(by=0, axis=1, ascending=False)
        #file_name="eurlex_"+str(part)+"_"+str(words)+"_30Epoch.csv"
        #df.to_csv(file_name, header=False, index=False)
        
        #Save qrel
        #df=pd.DataFrame(multihop_encoded_test)
        #df.sort_values(by=0, axis=1, ascending=False)
        #file_name="eurlex_multihop_encoded_test.csv"
        #df.to_csv(file_name, header=False, index=False)

The data has been loaded! 

The data has been loaded! 

The number of train labels, test labels and all labels are: 45000 6000 51000
The labels have been encoded! 

Number of labels:  4193 

Number of unique words: 12388 

The text has been converted to tokens! 

The text has been converted to tokens! 

The probabilities have been calculated! 

The word embeddings has been loaded! 

No model to clear 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 300)          3716700   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 100, 300)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               320800    


Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
The predictions on test data have been calculated! 

(6000, 4193)
82.23333333333333
16.253788377915406
27.14269996699307
calculate_metrix-Done! 

(6000, 4193)
60.196666666666665
59.4907102385031
59.84160646828817
calculate_metrix-Done! 

The data has been loaded! 

The data has been loaded! 

The number of train labels, test labels and all labels are: 45000 6000 51000
The labels have been encoded! 

Number of labels:  4193 

Number of unique words: 48316 

The text has been converted to tokens! 

The text has been converted to tokens! 

The probabilities have been calculated! 

The word embeddings has been loaded! 

No model to clear 

_________________________________________________________________
Layer (type)               

### Load the stored predictions and the qrel, and create the ensemble of the above classifiers

In [None]:
# For each of the five base classifiers: read its stored prediction scores,
# convert them to a numpy matrix and take the top-scored label of every row.

# classifier 1
in1=pd.read_csv('eurlex_1_100_30Epoch.csv',header=None)
in1_predictions=in1.to_numpy()
in1_prediction = np.argmax(in1_predictions, axis = -1)

# classifier 2
in2=pd.read_csv('eurlex_2_100_30Epoch.csv',header=None)
in2_predictions=in2.to_numpy()
in2_prediction = np.argmax(in2_predictions, axis = -1)

# classifier 3
in3=pd.read_csv('eurlex_3_100_30Epoch.csv',header=None)
in3_predictions=in3.to_numpy()
in3_prediction = np.argmax(in3_predictions, axis = -1)

# classifier 4
in4=pd.read_csv('eurlex_4_100_30Epoch.csv',header=None)
in4_predictions=in4.to_numpy()
in4_prediction = np.argmax(in4_predictions, axis = -1)

# classifier 5
in5=pd.read_csv('eurlex_5_100_30Epoch.csv',header=None)
in5_predictions=in5.to_numpy()
in5_prediction = np.argmax(in5_predictions, axis = -1)

In [None]:
# The stored true label indicators (qrel) of the test set, as a numpy matrix.
true = pd.read_csv('eurlex_multihop_encoded_test.csv', header=None).to_numpy()

In [None]:
# Rows are the test samples, columns are the labels.
number_of_test_data_p1, number_of_codes = in1_predictions.shape[0], in1_predictions.shape[1]

In [None]:
def ensemble_predictions5(predictions_p1, predictions_p2,predictions_p3, predictions_p4, predictions_p5, number_of_test_data_p1, en):
    """Average the predictions of five classifiers for each label.

    Args:
        predictions_p1 .. predictions_p5: the prediction scores of the five
            base classifiers, one row per test sample and one column per
            label; all five must have the same number of columns.
        number_of_test_data_p1: the number of leading rows to combine.
        en: the number of combined classifiers. Unused - the function always
            averages exactly the five given inputs - and kept only so that
            existing calls keep working.

    Returns:
        A tuple ``(average_predictions, average_prediction)``: the matrix of
        averaged scores and, for every row, the index of its highest averaged
        score. With zero rows both results are empty arrays.
    """
    members = (predictions_p1, predictions_p2, predictions_p3, predictions_p4, predictions_p5)
    # Averaging the five row-aligned matrices in one call replaces the former
    # per-row loop and, unlike it, also works when there are no rows at all.
    average_predictions = np.mean(
        [np.asarray(member)[:number_of_test_data_p1] for member in members],
        axis=0,
    )
    average_prediction = np.argmax(average_predictions, axis=-1)
    print('The ensemble predictions have been calculated! \n')

    return average_predictions, average_prediction

# the number of base classifiers combined using an averaging function
ensembles=5
average_predictions, average_prediction=ensemble_predictions5(
    predictions_p1=in1_predictions,
    predictions_p2=in2_predictions,
    predictions_p3=in3_predictions,
    predictions_p4=in4_predictions,
    predictions_p5=in5_predictions,
    number_of_test_data_p1=number_of_test_data_p1,
    en=ensembles,
)

# P@1, R@1 and F1@1 for the ensemble of 5 classifiers
calculate_metrix(
    number_of_test_samples=number_of_test_data_p1,
    number_of_codes=number_of_codes,
    predictions_kull=average_predictions,
    nn=1,
    multihop_encoded_test=true,
)
# P@5, R@5 and F1@5 for the ensemble of 5 classifiers
calculate_metrix(
    number_of_test_samples=number_of_test_data_p1,
    number_of_codes=number_of_codes,
    predictions_kull=average_predictions,
    nn=5,
    multihop_encoded_test=true,
)