### Libraries

In [None]:
import tensorflow
from tensorflow import keras 
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.layers import LSTM, GRU, Bidirectional
from tensorflow.keras.optimizers import SGD, RMSprop, Adadelta, Adam
from tensorflow.keras.preprocessing import text, sequence
import numpy as np
import pandas as pd
import io
import csv
from sklearn import model_selection, preprocessing, metrics
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer
import re
import nltk
from nltk.corpus import stopwords
# Stop words removed from the patent texts in load_patents(). They are kept in a
# set so that the per-word membership test is O(1).
# NOTE(review): "english-v2-uspto-sklearn" is not one of the standard NLTK
# stop-word lists - presumably a custom file placed in the NLTK stopwords
# corpus directory; confirm it is installed before running.
stopwords_dict=set(stopwords.words("english-v2-uspto-sklearn"))

### Helper functions

In [2]:
def load_patents(number_of_words, file1):
    """Load the text and label columns of a patent csv file and clean the text.

    number_of_words: how many leading words of the cleaned text are kept
    file1: the csv file (read without a header row); column 2 is used as the
           text and column 3 as the labels

    Returns a DataFrame with the columns 'text' and 'labels'.
    """
    frame = pd.read_csv(file1, header=None, usecols=[2, 3])
    frame = frame.rename(columns={2: 'text', 3: 'labels'})

    # missing texts become "" - otherwise the string operations below fail
    cleaned = frame['text'].fillna("")
    # every character outside a-z is replaced by a blank
    cleaned = cleaned.replace('[^a-z]', ' ', regex=True)
    # drop the stop words
    cleaned = cleaned.apply(
        lambda document: ' '.join(
            word for word in document.split() if word not in stopwords_dict
        )
    )
    # keep only the first number_of_words words and glue them back together
    frame['text'] = cleaned.str.split().str[0:number_of_words].str.join(' ')

    print("The data has been loaded! \n")

    return frame

def load_main_target(file1):
    """Load the main label of every patent (needed for the evaluation).

    file1: the csv file (read without a header row); only column 1 is read

    Returns a DataFrame with the single column 'label'.
    """
    main_labels = pd.read_csv(file1, header=None, usecols=[1])
    # exactly one column was requested, so it can be named directly
    main_labels.columns = ['label']
    print("The main labels have been loaded! \n")

    return main_labels

def encode_multilabels(trainDF, trainDF_main_target, trainDF_test, trainDF_main_target_test):
    """Encode all labels of the train and test sets with one MultiLabelBinarizer.

    The comma-separated label strings of the four inputs are stacked in the
    order: train labels, train main labels, test labels, test main labels.
    A single binarizer is fitted on the whole stack so that every split shares
    the same label columns, and the encoded matrix is then cut apart again.

    trainDF / trainDF_test: frames with a 'labels' column
    trainDF_main_target / trainDF_main_target_test: frames with a 'label' column

    Returns (encoder, train labels, train main labels, test labels,
    test main labels).
    """
    def _to_label_lists(column):
        # "A,B,C" -> ["A", "B", "C"]
        return column.str.split(',')

    train_stack = pd.concat([
        _to_label_lists(trainDF['labels']),
        _to_label_lists(trainDF_main_target['label']),
    ])
    test_stack = pd.concat([
        _to_label_lists(trainDF_test['labels']),
        _to_label_lists(trainDF_main_target_test['label']),
    ])

    multihop_encoder = MultiLabelBinarizer()
    encoded = multihop_encoder.fit_transform(pd.concat([train_stack, test_stack]))

    train_rows = train_stack.shape[0]
    test_rows = test_stack.shape[0]

    # each stack is "all labels" followed by "main labels", hence the split in two
    train_all, train_main = np.array_split(encoded[0:train_rows, :], 2)
    test_all, test_main = np.array_split(encoded[train_rows:train_rows + test_rows, :], 2)
    print("The labels have been encoded! \n")

    return multihop_encoder, train_all, train_main, test_all, test_main

def enumarate_codes(onehot_encoded):
    """Print and return the total number of labels.

    onehot_encoded: the encoded label matrix; the size of its second
                    dimension is taken as the number of labels
    """
    number_of_codes = np.shape(onehot_encoded)[1]
    print("Number of labels: ", number_of_codes, "\n")

    return number_of_codes

def tokenize_text(trainDF):
    """Fit a tokenizer on the 'text' column of trainDF.

    Returns (tokenizer, word_index), where word_index is the word -> index
    mapping of the fitted tokenizer.
    """
    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(trainDF['text'])
    vocabulary = tokenizer.word_index
    print('Number of unique words:', len(vocabulary), "\n")

    return tokenizer, vocabulary

def convert_text(number_of_words, token, x):
    """Convert texts to token sequences padded to a common length.

    number_of_words: the length every sequence is padded to
    token: the fitted tokenizer
    x: the texts to convert
    """
    token_ids = token.texts_to_sequences(x)
    seq_x = sequence.pad_sequences(token_ids, maxlen=number_of_words)
    print("The text has been converted to tokens! \n")

    return seq_x

def load_language_model(fname):
    """Load word embeddings from a text file in .vec format.

    The first line of the file holds two integers (a ValueError is raised if
    it does not); every following line holds a word followed by its vector
    components, separated by single blanks.

    fname: path of the embeddings file

    Returns a dict mapping each word to a numpy array with its vector.
    """
    data = {}
    # the context manager guarantees the file is closed, also on a parse error
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # header line: the two counts are not needed, but parsing them still
        # validates that the file starts with the expected header
        _n_words, _dimension = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.array([float(value) for value in tokens[1:]])
    print("The word embeddings have been loaded! \n")

    return data

def create_embedding_matrix(embeddings_index, word_index, embedding_dim=300):
    """Create the token-embedding matrix.

    embeddings_index: mapping word -> embedding vector
    word_index: mapping word -> row index in the matrix
    embedding_dim: length of the embedding vectors; defaults to 300, the
                   value that used to be hard-coded here

    Returns (embedding_matrix, num_words). The matrix has
    num_words = len(word_index) + 1 rows; the row of a word that has no
    embedding stays all zeros.
    """
    # one extra row - presumably because the tokenizer's indices start at 1
    num_words = len(word_index) + 1
    embedding_matrix = np.zeros((num_words, embedding_dim))

    for word, row in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[row] = embedding_vector

    return embedding_matrix, num_words

def create_bidirectional_lstm_probabilities_kullback(maxlen, num_words, number_of_codes, embedding_matrix):
    """Build and compile the Bi-LSTM classifier trained with the KL-divergence loss.

    maxlen: length of the input token sequences
    num_words: number of rows of the embedding layer
    number_of_codes: number of output units (one per label)
    embedding_matrix: weights of the (frozen) embedding layer

    Returns the compiled model; its summary is printed as a side effect.
    """
    tokens_in = layers.Input(shape=(maxlen,))

    # frozen embedding layer initialised with the given matrix
    embedded = layers.Embedding(
        num_words,
        embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=False,
    )(tokens_in)
    embedded = layers.SpatialDropout1D(0.1)(embedded)

    # bidirectional LSTM with 100 units per direction
    recurrent = layers.LSTM(100, recurrent_dropout=0.1, dropout=0.1)
    encoded = layers.Bidirectional(recurrent)(embedded)

    # softmax output: one probability per label
    label_distribution = layers.Dense(number_of_codes, activation="softmax")(encoded)

    model = models.Model(inputs=tokens_in, outputs=label_distribution)
    model.compile(optimizer='Adam', loss='kullback_leibler_divergence', metrics=['accuracy'])

    model.summary()

    return model

def make_predictions(test_seq_x, test_y, classifier):
    """Predict the test data with the given classifier.

    test_seq_x: the test inputs handed to classifier.predict
    test_y: the encoded true labels of the test data
    classifier: any object with a predict method

    Returns (predictions, prediction, y_true): the raw output of predict, its
    argmax over the last axis, and the argmax of test_y over the last axis.
    """
    scores = classifier.predict(test_seq_x)
    best_class = np.argmax(scores, axis=-1)
    true_class = np.argmax(test_y, axis=-1)
    print('The predictions on test data have been calculated! \n')

    return scores, best_class, true_class

def kill_model():
    """Release the Keras session state and drop the module-level `model`.

    The previous version called `K.clear_session()` although no name `K` is
    imported, and ran `del model` on a local name that was never bound; the
    bare `except` hid both errors, so nothing was ever cleared. This version
    uses the `keras` module imported at the top of the file.

    The clean-up stays best-effort: a failure is reported with the original
    message instead of being raised.
    """
    try:
        keras.backend.clear_session()
    except Exception:
        print('No model to clear \n')
        return

    # remove the module-level reference, if one exists, so the model can be freed
    if globals().pop('model', None) is None:
        print('No model to clear \n')

### Special functions

In [3]:
def calculate_probabilities(multihop_encoded_train, multihop_encoded_train_main_target=None):
    """Turn the multilabel encoding into a target probability distribution.

    For every row, with s = (number of labels in the row) - 1 secondary labels:
      * s > 0: the main label gets 0.6 and every other label of the row 0.4 / s,
               e.g. labels 1 1 1 0 with main label 1 0 0 0 -> 0.6 0.2 0.2 0
      * otherwise: the main label gets 1.

    multihop_encoded_train: 0/1 matrix with all labels of every sample
    multihop_encoded_train_main_target: 0/1 matrix of the same shape marking
        the main label of every sample. When omitted, the module-level
        variable of the same name is used, which is what this function
        always read before the parameter existed.

    Returns a float array with the shape of multihop_encoded_train.
    Raises NameError if the main labels are neither passed nor defined at
    module level, and ValueError if the two matrices differ in shape.
    """
    if multihop_encoded_train_main_target is None:
        # backward compatibility with the one-argument call
        try:
            multihop_encoded_train_main_target = globals()['multihop_encoded_train_main_target']
        except KeyError:
            raise NameError(
                "name 'multihop_encoded_train_main_target' is not defined"
            ) from None

    labels = np.asarray(multihop_encoded_train)
    main_labels = np.asarray(multihop_encoded_train_main_target)
    if labels.shape != main_labels.shape:
        raise ValueError(
            "label matrix %s and main-label matrix %s differ in shape"
            % (labels.shape, main_labels.shape)
        )

    secondary_count = labels.sum(axis=1) - 1
    has_secondary = secondary_count > 0

    # 0.4 is shared evenly between the secondary labels; rows without any
    # secondary label keep a share of 0 (and are never divided by zero)
    secondary_share = np.zeros(labels.shape[0])
    np.divide(0.4, secondary_count, out=secondary_share, where=has_secondary)
    a = np.where(labels == 1, secondary_share[:, None], 0.0)

    # the main label overrides whatever share its position received above
    main_weight = np.where(has_secondary, 0.6, 1.0)
    a = np.where(main_labels == 1, main_weight[:, None], a)

    print('The probabilities have been calculated! \n')

    return a

In [4]:
def calculate_metrix(number_of_test_samples, number_of_codes, predictions_kull, nn, multihop_encoded_test):
    """Print micro-averaged precision, recall and F1 (in percent) at cut-off nn.

    For every test sample the nn highest-scoring labels are marked as
    predicted; the resulting 0/1 matrix is compared with multihop_encoded_test.

    number_of_test_samples: number of rows to evaluate
    number_of_codes: number of labels (columns)
    predictions_kull: the predicted scores, one row per sample
    nn: how many top-scoring labels count as predicted
    multihop_encoded_test: the 0/1 matrix of true labels

    Returns None; the results are only printed.
    """
    top_k_indicator = np.zeros((number_of_test_samples, number_of_codes))
    for sample in range(number_of_test_samples):
        # label indices ordered from the highest to the lowest score
        best_first = np.argsort(predictions_kull[sample])[::-1]
        top_k_indicator[sample, best_first[:nn]] = 1

    print(top_k_indicator.shape)
    for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score):
        print(score(multihop_encoded_test, top_k_indicator, average='micro')*100)
    print('calculate_metrix-Done! \n')

    return None

### Main code

In [6]:
# Experiment driver: for every text length in `words` and every patent part in
# `parts`, train one Bi-LSTM classifier on that part and print P/R/F1 at
# several cut-offs on the test set.
words = [60]
epochs = 30
batch_size = 128
parts=[1, 2, 3, 4] #1: title, 2: abstract, 3: description, 4: claims

file1= "/datasets/wipo_train_patents_had_level3_main_title_multilabel.csv"
file2= "/datasets/wipo_test_patents_had_level3_main_title_multilabel.csv"
file3= "/datasets/wipo_train_patents_had_level3_main_abstract_multilabel.csv"
file4= "/datasets/wipo_test_patents_had_level3_main_abstract_multilabel.csv"
file5= "/datasets/wipo_train_patents_had_level3_main_description_multilabel.csv"
file6= "/datasets/wipo_test_patents_had_level3_main_description_multilabel.csv"
file7= "/datasets/wipo_train_patents_had_level3_main_claims_multilabel.csv"
file8= "/datasets/wipo_test_patents_had_level3_main_claims_multilabel.csv"

# (train file, test file) of every part; an unknown part now fails loudly with
# a KeyError instead of silently reusing the files of the previous iteration
part_files = {
    1: (file1, file2),
    2: (file3, file4),
    3: (file5, file6),
    4: (file7, file8),
}

# The same embeddings serve every part, so they are read from disk once, on
# first use. (They used to be loaded only when part == 1, which broke as soon
# as `parts` did not start with 1.)
embeddings_index = None

for number_of_words in words:
    for part in parts:

        f1, f2 = part_files[part]

        #train data
        train_DF=load_patents(number_of_words,f1)
        train_DF_main_target = load_main_target(f1)

        #test data
        test_DF=load_patents(number_of_words,f2)
        test_DF_main_target = load_main_target(f2)

        multihop_encoder, multihop_encoded_train, multihop_encoded_train_main_target, \
        multihop_encoded_test, multihop_encoded_test_main_target=encode_multilabels\
        (train_DF, train_DF_main_target, test_DF, test_DF_main_target)

        number_of_codes=enumarate_codes(multihop_encoded_train)

        # the tokenizer is fitted on train and test texts together
        combined=pd.concat([train_DF, test_DF])
        token_p1, word_index_p1=tokenize_text(combined)

        train_seq_x_p1 =convert_text(number_of_words, token_p1, train_DF['text'])
        test_seq_x_p1 =convert_text(number_of_words, token_p1, test_DF['text'])

        if embeddings_index is None:
            embeddings_index = load_language_model('/embeddings/patent-300.vec')
        embedding_matrix_p1, num_words_p1 =create_embedding_matrix(embeddings_index, word_index_p1)

        # calculate_probabilities also reads the module-level
        # multihop_encoded_train_main_target assigned above
        a=calculate_probabilities(multihop_encoded_train)

        classifier = create_bidirectional_lstm_probabilities_kullback(number_of_words, num_words_p1, number_of_codes, embedding_matrix_p1)
        # use the batch_size configured above instead of a second hard-coded 128
        history3=classifier.fit(train_seq_x_p1, a, epochs=epochs, batch_size=batch_size, verbose=1)

        predictions_kull, prediction_kull, y_true_kull=make_predictions(test_seq_x_p1, multihop_encoded_test, classifier)

        number_of_test_data=np.shape(test_seq_x_p1)
        number_of_test_data=number_of_test_data[0]

        print("calculate the P@1, R@1 and F1@1 vs all labels")
        calculate_metrix(number_of_test_data, number_of_codes, predictions_kull, 1, multihop_encoded_test)
        print("calculate the P@1, R@1 and F1@1 vs main label")
        calculate_metrix(number_of_test_data, number_of_codes, predictions_kull, 1, multihop_encoded_test_main_target)
        print("calculate the P@3, R@3 and F1@3 vs main label")
        calculate_metrix(number_of_test_data, number_of_codes, predictions_kull, 3, multihop_encoded_test_main_target)

        # NOTE(review): the disabled snippets below are kept as they were. If
        # they are re-enabled, str(words) yields "[60]" in the file name (use
        # number_of_words instead), and the result of sort_values is discarded.

        #Save the final predictions
        #df=pd.DataFrame(predictions_kull)
        #df.sort_values(by=0, axis=1, ascending=False)
        #df.to_csv("wipo_"+str(part)+"_"+str(words)+"_30Epoch.csv", header=False, index=False)   

        #Save qrels multilabel
        #df=pd.DataFrame(multihop_encoded_test)
        #df.sort_values(by=0, axis=1, ascending=False)
        #df.to_csv('multihop_encoded_test.csv', header=False, index=False)

        #Save qrels main label
        #df=pd.DataFrame(multihop_encoded_test_main_target)
        #df.sort_values(by=0, axis=1, ascending=False)
        #df.to_csv('multihop_encoded_test_main_target.csv', header=False, index=False)

The data has been loaded! 

The main labels have been loaded! 

The data has been loaded! 

The main labels have been loaded! 

The labels have been encoded! 

Number of labels:  633 

Number of unique words: 25606 

The text has been converted to tokens! 

The text has been converted to tokens! 

The probabilities have been calculated! 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 60)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 60, 300)           7682100   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 60, 300)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 200)               320800    
_________________________________________________________________

ParserError: Error tokenizing data. C error: out of memory

### Load the stored predictions and qrels, and build the ensemble of the above classifiers

In [None]:
# Stored prediction scores of the four base classifiers (one csv file per part)
prediction_files = (
    'wipo_1_60_30Epoch.csv',
    'wipo_2_60_30Epoch.csv',
    'wipo_3_60_30Epoch.csv',
    'wipo_4_60_30Epoch.csv',
)
in1, in2, in3, in4 = [pd.read_csv(path, header=None) for path in prediction_files]

# the same scores as plain numpy arrays
in1_predictions, in2_predictions, in3_predictions, in4_predictions = [
    frame.to_numpy() for frame in (in1, in2, in3, in4)
]

# index of the best-scoring label of every sample, per classifier
in1_prediction, in2_prediction, in3_prediction, in4_prediction = [
    np.argmax(scores, axis=-1)
    for scores in (in1_predictions, in2_predictions, in3_predictions, in4_predictions)
]

In [None]:
# Stored true labels of the test set: all labels, and the main label only
true = pd.read_csv('multihop_encoded_test.csv', header=None).to_numpy()
true_main = pd.read_csv('multihop_encoded_test_main_target.csv', header=None).to_numpy()

In [None]:
# rows = test samples, columns = labels
prediction_shape = in1_predictions.shape
number_of_test_data_p1 = prediction_shape[0]
number_of_codes = prediction_shape[1]

In [None]:
def ensemble_predictions4(predictions_p1, predictions_p2,predictions_p3, predictions_p4, number_of_test_data_p1, en):
    """Average the predictions of four classifiers for each label.

    predictions_p1 .. predictions_p4: score matrices (samples x labels) of the
        four base classifiers
    number_of_test_data_p1: number of leading rows to average
    en: number of base classifiers; accepted for interface compatibility but
        not used by the computation

    Returns (average_predictions, average_prediction): the averaged score
    matrix and, per sample, the index of its best-scoring label. With zero
    rows both results are empty arrays (this used to raise).
    Raises IndexError if an input has fewer than number_of_test_data_p1 rows.
    """
    trimmed = []
    for scores in (predictions_p1, predictions_p2, predictions_p3, predictions_p4):
        scores = np.asarray(scores)
        if scores.shape[0] < number_of_test_data_p1:
            raise IndexError(
                "prediction matrix has %d rows, %d requested"
                % (scores.shape[0], number_of_test_data_p1)
            )
        trimmed.append(scores[:number_of_test_data_p1])

    # one vectorised mean over the classifier axis replaces the per-row loop
    average_predictions_2 = np.mean(trimmed, axis=0)
    average_prediction = np.argmax(average_predictions_2, axis = -1)

    print('The ensemble predictions have been calculated! \n')

    return average_predictions_2, average_prediction

ensembles=4 #the number of base classifiers combined using an averaging function
average_predictions, average_prediction = ensemble_predictions4(
    in1_predictions,
    in2_predictions,
    in3_predictions,
    in4_predictions,
    number_of_test_data_p1,
    ensembles,
)

# Evaluate the ensemble, in this order:
#   P@1, R@1 and F1@1 vs all labels
#   P@1, R@1 and F1@1 vs main label
#   P@3, R@3 and F1@3 vs main label
for cut_off, reference_labels in ((1, true), (1, true_main), (3, true_main)):
    calculate_metrix(number_of_test_data_p1, number_of_codes, average_predictions, cut_off, reference_labels)