### Import the libraries

In [None]:
import tensorflow
from tensorflow import keras 
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.layers import LSTM, GRU, Bidirectional
from tensorflow.keras.optimizers import SGD, RMSprop, Adadelta, Adam
from tensorflow.keras.preprocessing import text, sequence
import numpy as np
import pandas as pd
import io
import csv
from sklearn import model_selection, preprocessing, metrics
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer
import re
import nltk
from nltk.corpus import stopwords
# Stop-word set consulted by the text-cleaning step of the data loaders in this file.
# Stored as a set so that each per-word membership test is a hash lookup.
# NOTE(review): "english-v2-uspto-sklearn" does not look like a stock NLTK list name —
# presumably a custom word file placed in the NLTK stopwords corpus directory; confirm it is installed.
stopwords_dict=set(stopwords.words("english-v2-uspto-sklearn"))

### Helper functions

In [2]:
# Load the csv (with the 544 codes) from Arousha

def load_train_data(number_of_words, part, file):
    """Load the training CSV and prepare its ``text`` and ``labels`` columns.

    Args:
        number_of_words: Maximum number of words kept per document after
            cleaning and stop-word removal.
        part: Selects which field becomes the ``text`` column.
            0 -> ``Title`` (``text`` and ``Abstract`` are dropped),
            1 -> ``Abstract`` (``text`` and ``Title`` are dropped),
            2 -> the existing ``text`` column is kept (``Abstract`` and
            ``Title`` are dropped; per the original comment this column is
            the concatenated title and abstract).
            Any other value leaves the columns untouched.
        file: Path or buffer accepted by ``pandas.read_csv``; the first row
            is read as the header.

    Returns:
        The DataFrame with ``text`` holding cleaned, truncated strings and
        ``labels`` holding lists of label strings.

    Raises:
        KeyError: If a column required by the selected ``part`` is missing.
    """
    trainDF = pd.read_csv(file, header=0)

    if part == 0:  # Title
        del trainDF['text'], trainDF['Abstract']
        trainDF = trainDF.rename(columns={'Title': 'text'})
    elif part == 1:  # Abstract
        del trainDF['text'], trainDF['Title']
        trainDF = trainDF.rename(columns={'Abstract': 'text'})
    elif part == 2:  # Concatenated abstract and title
        # Fixed: this branch used to delete from ``testDF``, which is not
        # defined in this function (NameError, or mutation of an unrelated
        # global when one happened to exist).
        del trainDF['Abstract'], trainDF['Title']

    # Process the text column.
    trainDF['text'] = trainDF['text'].fillna("")
    # Every character that is not a lowercase ASCII letter becomes a space.
    # NOTE(review): upper-case letters are blanked too — presumably the text
    # is already lower-cased upstream; confirm.
    trainDF['text'] = trainDF['text'].replace('[^a-z]', ' ', regex=True)
    trainDF['text'] = trainDF['text'].apply(
        lambda x: ' '.join([word for word in x.split() if word not in stopwords_dict])
    )
    # Keep only the first ``number_of_words`` words of each document.
    trainDF['text'] = trainDF['text'].str.split().str[0:number_of_words]
    trainDF['text'] = trainDF['text'].str.join(' ')

    # Process the labels column: "a,b,c" -> ["a", "b", "c"].
    trainDF['labels'] = trainDF['labels'].str.split(',')

    print("Loaded")

    return trainDF

def load_test_data(number_of_words, part, file):
    """Read the test CSV and normalise its ``text`` and ``labels`` columns.

    ``part`` picks the field used as ``text``: 0 -> ``Title``,
    1 -> ``Abstract``, 2 -> the existing ``text`` column (the other two
    fields are dropped). The text is stripped of everything except lowercase
    ASCII letters, filtered against ``stopwords_dict`` and cut to the first
    ``number_of_words`` words. ``labels`` is split on commas into lists.

    Returns the prepared DataFrame.
    """
    testDF = pd.read_csv(file, header=0)

    # Columns removed for each supported ``part`` value, in deletion order.
    columns_to_drop = {
        0: ('text', 'Abstract'),   # Title
        1: ('text', 'Title'),      # Abstract
        2: ('Abstract', 'Title'),  # Concatenated abstract and title
    }
    # Field that takes over the ``text`` name once the old one is gone.
    promoted_column = {0: 'Title', 1: 'Abstract'}

    for column in columns_to_drop.get(part, ()):
        del testDF[column]
    if part in promoted_column:
        testDF = testDF.rename(columns={promoted_column[part]: 'text'})

    # Clean the text column.
    cleaned = testDF['text'].fillna("").replace('[^a-z]', ' ', regex=True)
    cleaned = cleaned.apply(
        lambda document: ' '.join(
            token for token in document.split() if token not in (stopwords_dict)
        )
    )
    testDF['text'] = cleaned.str.split().str[0:number_of_words].str.join(' ')

    # Turn the comma-separated label string into a list of labels.
    testDF['labels'] = testDF['labels'].str.split(',')

    print("Loaded")

    return testDF


def encode_multilabel(trainDF):
    """Fit a ``MultiLabelBinarizer`` on ``trainDF['labels']`` and encode that column.

    Returns:
        A ``(encoder, encoded)`` pair: the fitted binarizer and the result of
        its ``fit_transform`` on the labels column.
    """
    multihop_encoder = MultiLabelBinarizer()
    return multihop_encoder, multihop_encoder.fit_transform(trainDF['labels'])

def enumarate_codes(onehot_encoded):
    """Report and return the number of label columns in an encoded label matrix.

    The count is the size of the second axis of ``onehot_encoded``; it is
    also printed.
    """
    number_of_codes = np.shape(onehot_encoded)[1]
    print("Number of codes: ", number_of_codes, "\n")
    return number_of_codes

def tokenize_text(trainDF):
    """Fit a Keras ``Tokenizer`` on ``trainDF['text']``.

    Prints the vocabulary size and returns ``(tokenizer, word_index)`` where
    ``word_index`` is the fitted tokenizer's ``word_index`` attribute.
    """
    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(trainDF['text'])

    vocabulary = tokenizer.word_index
    print('Number of unique words:',len(vocabulary), "\n")

    return tokenizer, vocabulary

def convert_text(number_of_words, token, x):
    """Turn texts into equal-length sequences of token ids.

    ``x`` is passed through ``token.texts_to_sequences`` and the result is
    padded with ``pad_sequences`` using ``number_of_words`` as ``maxlen``.
    Returns the padded sequences.
    """
    token_ids = token.texts_to_sequences(x)
    padded = sequence.pad_sequences(token_ids, number_of_words)
    print('convert text to tokens - Done! \n')
    return padded

def load_language_model(fname):
    """Load word vectors from a text file into a dictionary.

    The first line must contain exactly two integers (it is parsed and then
    discarded). Every following line is split on single spaces: the first
    field is the token, the remaining fields are parsed as floats.

    Args:
        fname: Path of the vector file; read as UTF-8 with undecodable bytes
            ignored.

    Returns:
        dict mapping each token to a 1-D ``numpy`` float array.

    Raises:
        ValueError: If the header line is not two integers, or a vector
            component cannot be parsed as a float.
    """
    data = {}
    # ``with`` guarantees the handle is closed even when parsing fails; the
    # previous version never closed the file.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Consume (and validate) the header so it is not parsed as a vector.
        _n_vectors, _dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.array([float(value) for value in tokens[1:]])
    print("load_patentVec-Done! \n")

    return data

def create_embedding_matrix(embeddings_index, word_index, embedding_dim=300):
    """Build a token-id -> embedding-vector matrix.

    Args:
        embeddings_index: Mapping from word to its embedding vector.
        word_index: Mapping from word to integer row index. The indices are
            used directly as row numbers, so they must be smaller than
            ``len(word_index) + 1``.
        embedding_dim: Width of every embedding vector. Defaults to 300, the
            value that used to be hard-coded.

    Returns:
        ``(embedding_matrix, num_words)`` where ``embedding_matrix`` has shape
        ``(num_words, embedding_dim)`` and ``num_words`` is
        ``len(word_index) + 1``. Rows for words absent from
        ``embeddings_index`` stay all-zero.
    """
    num_words = len(word_index) + 1
    embedding_matrix = np.zeros((num_words, embedding_dim))

    for word, row in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[row] = embedding_vector

    return embedding_matrix, num_words
        
def calculate_probabilities(multihop_encoded_train):
    """Turn a multi-label indicator matrix into per-row probabilities.

    Every entry equal to 1 is replaced by ``1 / (sum of its row)``; all other
    entries become 0, e.g. ``1 1 0 0 -> 0.5 0.5 0 0``.

    Args:
        multihop_encoded_train: 2-D array-like of label indicators
            (one row per sample).

    Returns:
        Float ``numpy`` array with the same shape as the input.
    """
    encoded = np.asarray(multihop_encoded_train)
    probabilities = np.zeros(encoded.shape)

    # Vectorised replacement for the former cell-by-cell Python loop: only
    # the positions holding a 1 are written, so rows without any label never
    # trigger a division.
    row_totals = encoded.sum(axis=1)
    rows, cols = np.nonzero(encoded == 1)
    probabilities[rows, cols] = 1.0 / row_totals[rows]

    print('calculate_probabilities-Done! \n')

    return probabilities

def create_bidirectional_lstm_probabilities_kullback(maxlen, num_words, number_of_codes, embedding_matrix):
    """Build and compile the bidirectional-LSTM label-distribution classifier.

    Layers, in order: input of length ``maxlen``; a non-trainable embedding
    initialised from ``embedding_matrix``; spatial dropout (0.1); a
    bidirectional LSTM with 100 units per direction; a softmax dense layer
    with ``number_of_codes`` outputs. The model is compiled with the Adam
    optimizer and the Kullback-Leibler divergence loss, its summary is
    printed, and the compiled model is returned.
    """
    embedding_dim = embedding_matrix.shape[1]

    # Token-id input.
    token_ids = layers.Input((maxlen, ))

    # Frozen pre-trained word embeddings followed by spatial dropout.
    embedded = layers.Embedding(
        num_words,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
    )(token_ids)
    embedded = layers.SpatialDropout1D(0.1)(embedded)

    # Bidirectional recurrent encoder.
    recurrent_cell = layers.LSTM(100, recurrent_dropout=0.1, dropout=0.1)
    encoded = layers.Bidirectional(recurrent_cell)(embedded)

    # Softmax over all codes, trained against the target distribution.
    label_distribution = layers.Dense(number_of_codes, activation="softmax")(encoded)

    model = models.Model(inputs=token_ids, outputs=label_distribution)
    model.compile(optimizer='Adam', loss='kullback_leibler_divergence', metrics=['accuracy'])
    model.summary()

    return model

def make_predictions(test_seq_x, test_y, classifier):
    """Run ``classifier.predict`` on the test inputs and pick the top class.

    Returns:
        ``(predictions, prediction, y_true)``: the raw output of
        ``classifier.predict``, the arg-max index of each prediction row, and
        the arg-max index of each row of ``test_y``.
    """
    raw_scores = classifier.predict(test_seq_x)
    top_predicted = np.argmax(raw_scores, axis=-1)
    top_reference = np.argmax(test_y, axis=-1)
    print('make_predictions-Done! \n')

    return raw_scores, top_predicted, top_reference

def calculate_metrix(number_of_test_samples, number_of_codes, predictions_kull, nn, multihop_encoded_test):
    """Print micro-averaged precision, recall and F1 (in percent) at cut-off ``nn``.

    For each of the first ``number_of_test_samples`` rows of
    ``predictions_kull`` the ``nn`` highest-scoring classes are marked with 1
    in an indicator matrix of width ``number_of_codes``; that matrix is then
    scored against ``multihop_encoded_test``. Returns ``None``.
    """
    # Reversed slice [:-(nn + 1):-1] yields the last ``nn`` argsort entries,
    # i.e. the indices of the ``nn`` largest scores, best first.
    slice_stop = -(nn + 1)

    pred_class_kull = np.zeros((number_of_test_samples, number_of_codes))
    for row in range(number_of_test_samples):
        top_classes = np.argsort(predictions_kull[row])[:slice_stop:-1]
        pred_class_kull[row, top_classes] = 1

    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score):
        print(scorer(multihop_encoded_test, pred_class_kull, average='micro')*100)

    return None

def kill_model():
    """Clear the Keras backend session before a new model is built.

    Best-effort: any failure is reported with a short message instead of
    being raised, so the caller's loop keeps running.
    """
    try:
        # The file never binds ``K`` at module level, so the original call
        # always raised NameError and the session was never cleared. Import
        # the backend locally to make the call work.
        from tensorflow.keras import backend as K
        K.clear_session()
        # The former ``del model`` was removed: it targeted a local name that
        # was never assigned, so it could only ever raise.
    except Exception:
        print('No model to clear \n')

### Load the dataset and run the main code

In [3]:
number_of_words = 100
epochs = 15
batch_size = 128
parts = [0, 1, 2] #0: Title, 1: Abstract, 2: Concatenated title and abstract 

# Load the pre-trained vectors once, up front. They used to be loaded only
# when ``part == 0``, which raised NameError whenever ``parts`` did not start
# with 0.
embeddings_index = load_language_model('/embeddings/patent-300.vec')

for part in parts:

    trainDF = load_train_data(number_of_words, part, '/datasets/USPTO_train_data_544.csv')
    testDF = load_test_data(number_of_words, part, '/datasets/USPTO_test_data_544.csv')

    multihop_encoder, encoded_trainDF = encode_multilabel(trainDF)

    # Encode the test labels with the encoder fitted on the train data.
    # ``transform`` (not ``fit_transform``) keeps the train column order, so
    # column i means the same label in the targets, the model output and the
    # test matrix. Refitting on the test labels could silently change the
    # number or order of columns.
    labels_val_2 = testDF['labels']
    encoded_testDF = multihop_encoder.transform(labels_val_2)

    # The model's output width must match the training targets.
    number_of_codes = enumarate_codes(encoded_trainDF)

    # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` is the
    # supported equivalent. Only the ``text`` column is read by the tokenizer.
    token_p1, word_index_p1 = tokenize_text(pd.concat([trainDF, testDF]))

    train_seq_x_p1 = convert_text(number_of_words, token_p1, trainDF['text'])
    test_seq_x_p1 = convert_text(number_of_words, token_p1, testDF['text'])

    embedding_matrix_p1, num_words_p1 = create_embedding_matrix(embeddings_index, word_index_p1)

    # Drop the previous iteration's Keras state before building a new model.
    kill_model()

    # Training targets: label indicators rescaled to a per-sample distribution.
    a = calculate_probabilities(encoded_trainDF)

    classifier = create_bidirectional_lstm_probabilities_kullback(number_of_words, num_words_p1, number_of_codes, embedding_matrix_p1)
    history = classifier.fit(train_seq_x_p1, a, epochs=epochs, batch_size=batch_size, verbose=1)

    predictions_kull, prediction_kull, y_true_kull = make_predictions(test_seq_x_p1, encoded_testDF, classifier)

    number_of_test_data_p1 = np.shape(test_seq_x_p1)[0]

    # Calculate P@k, R@k and F1@k vs all labels for k = 1, 3 and 5.
    for top_k in (1, 3, 5):
        calculate_metrix(number_of_test_data_p1, number_of_codes, predictions_kull, top_k, encoded_testDF)

    # NOTE: the ensemble cell further down reads the CSV files written by the
    # lines below; uncomment them to (re)generate those files.
    #Save the final predictions
    #df=pd.DataFrame(predictions_kull)
    #df.sort_values(by=0, axis=1, ascending=False)
    #df.to_csv("uspto_arousha_part"+str(part)+".csv", header=False, index=False)
    #Save qrels
    #df=pd.DataFrame(encoded_testDF)
    #df.sort_values(by=0, axis=1, ascending=False)
    #df.to_csv('uspto_arousha_reference.csv', header=False, index=False)

Loaded
Loaded
Number of codes:  544 

Number of unique words: 93923 

convert text to tokens - Done! 

convert text to tokens - Done! 

load_patentVec-Done! 

No model to clear 

calculate_probabilities-Done! 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 300)          28177200  
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 100, 300)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               320800    
_________________________________________________________________
dense (Dense)                (None, 544)               109344    
Total params: 28,607,344
Trainable params: 430,144
Non-trainabl

  _warn_prf(average, modifier, msg_start, len(result))


47.38131941871751
19.11951662485183
24.88365475187849


### Load the stored predictions and qrels and create the ensemble of the above classifiers

In [None]:
# Stored score matrices of the three base classifiers (one CSV per text part).
in1 = pd.read_csv('uspto_arousha_part0.csv', header=None)
in2 = pd.read_csv('uspto_arousha_part1.csv', header=None)
in3 = pd.read_csv('uspto_arousha_part2.csv', header=None)

# Same data as plain NumPy arrays.
in1_predictions, in2_predictions, in3_predictions = (
    frame.to_numpy() for frame in (in1, in2, in3)
)

# Index of the highest-scoring column in every row, per classifier.
in1_prediction, in2_prediction, in3_prediction = (
    np.argmax(scores, axis=-1)
    for scores in (in1_predictions, in2_predictions, in3_predictions)
)

In [None]:
# Reference label matrix, read without a header row and converted to NumPy.
true = pd.read_csv('uspto_arousha_reference.csv', header=None).to_numpy()

In [None]:
# Rows = test samples, columns = codes, taken from the first stored prediction matrix.
prediction_shape = in1_predictions.shape
number_of_test_data_p1, number_of_codes = prediction_shape[0], prediction_shape[1]

In [None]:
def ensemble_predictions3(predictions_p1, predictions_p2, predictions_p3, number_of_test_data_p1, en):
    """Average the per-label scores of three classifiers, row by row.

    Only the first ``number_of_test_data_p1`` rows of each input are used.
    ``en`` is accepted but not used by the computation.

    Returns:
        ``(average_predictions, average_prediction)``: the array of averaged
        score rows and the arg-max index of every averaged row.
    """
    averaged_rows = [
        np.mean([predictions_p1[row], predictions_p2[row], predictions_p3[row]], axis=0)
        for row in range(number_of_test_data_p1)
    ]

    top_label = np.argmax(averaged_rows, axis=-1)

    print('The ensemble predictions have been calculated! \n')

    return np.array(averaged_rows), top_label

# Number of base classifiers combined by the averaging function.
ensembles = 3
average_predictions, average_prediction = ensemble_predictions3(
    in1_predictions,
    in2_predictions,
    in3_predictions,
    number_of_test_data_p1,
    ensembles,
)

# Calculate P@k, R@k and F1@k vs all labels for k = 1, 3 and 5.
for top_k in (1, 3, 5):
    calculate_metrix(number_of_test_data_p1, number_of_codes, average_predictions, top_k, true)