### Libraries

In [1]:
import tensorflow
from tensorflow import keras 
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.layers import LSTM, GRU, Bidirectional
from tensorflow.keras.optimizers import SGD, RMSprop, Adadelta, Adam
from tensorflow.keras.preprocessing import text, sequence
import numpy as np
import pandas as pd
import io
import csv
from sklearn import model_selection, preprocessing, metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, StratifiedKFold

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Helper functions

In [2]:
def load_patents_single_label(file1):
    """Read a header-less patent CSV and keep its first two columns.

    Column 0 is exposed as ``label`` and column 1 as ``text``.

    Args:
        file1: Path or file-like object handed to ``pandas.read_csv``.

    Returns:
        DataFrame with the columns ``label`` and ``text``.
    """
    column_names = {0: 'label', 1: 'text'}
    raw_frame = pd.read_csv(file1, header=None, usecols=[0, 1])
    return raw_frame.rename(columns=column_names)

def load_patents_multi_label(file1):
    """Read a header-less patent CSV and keep its second and third columns.

    Column 1 is exposed as ``text`` and column 2 as ``labels``.

    Args:
        file1: Path or file-like object handed to ``pandas.read_csv``.

    Returns:
        DataFrame with the columns ``text`` and ``labels``.
    """
    column_names = {1: 'text', 2: 'labels'}
    raw_frame = pd.read_csv(file1, header=None, usecols=[1, 2])
    return raw_frame.rename(columns=column_names)

def encode_single_labels(trainDF):
    """One-hot encode the ``label`` column of ``trainDF``.

    Args:
        trainDF: DataFrame with a ``label`` column holding one class per row.

    Returns:
        Tuple ``(onehot_encoded, onehot_encoder)``: the dense encoded array
        (one row per sample, one column per distinct label) and the fitted
        ``OneHotEncoder``.
    """
    labels_val = trainDF['label'].values

    # Newer scikit-learn releases spell the dense-output switch
    # ``sparse_output``; older ones only know ``sparse``. Try the new name
    # first and fall back so the notebook runs on either.
    try:
        onehot_encoder = preprocessing.OneHotEncoder(sparse_output=False)
    except TypeError:
        onehot_encoder = preprocessing.OneHotEncoder(sparse=False)

    # The encoder expects a 2-D column, hence the reshape.
    onehot_encoded = onehot_encoder.fit_transform(labels_val.reshape(-1, 1))

    return onehot_encoded, onehot_encoder

def encode_multi_labels(trainDF, trainDF_targets):
    """Multi-hot encode main labels and full label sets with one shared vocabulary.

    Both label columns are split on ``,`` and a single ``MultiLabelBinarizer``
    is fitted on their concatenation, so the two encodings share the same
    column layout.

    Args:
        trainDF: DataFrame with a ``label`` column (main label string).
        trainDF_targets: DataFrame with a ``labels`` column (comma-separated
            label string).

    Returns:
        Tuple ``(multihop_encoder, encoded_all, encoded_main)`` where
        ``encoded_all`` holds the rows of ``trainDF_targets['labels']`` and
        ``encoded_main`` the rows of ``trainDF['label']``.
    """
    labels_val_main_target = trainDF['label'].str.split(',')
    labels_val = trainDF_targets['labels'].str.split(',')

    labels_all = pd.concat([labels_val, labels_val_main_target])

    multihop_encoder = MultiLabelBinarizer()
    multihop_encoded = multihop_encoder.fit_transform(labels_all)

    # Cut at the real boundary between the two inputs. Halving the matrix
    # would hand rows to the wrong output whenever the frames differ in length.
    boundary = len(labels_val)

    return multihop_encoder, multihop_encoded[:boundary], multihop_encoded[boundary:]

def enumarate_codes(onehot_encoded):
    """Print and return the number of label columns in an encoded matrix.

    Args:
        onehot_encoded: 2-D array-like with one column per label code.

    Returns:
        Size of the second dimension of ``onehot_encoded``.
    """
    number_of_codes = np.shape(onehot_encoded)[1]
    print("Number of codes: ", number_of_codes, "\n")
    return number_of_codes

def split_dataset(trainDF, onehot_encoded):
    """Split texts and targets into train / validation / test (80:10:10).

    Both splits use fixed seeds and are not stratified.

    Args:
        trainDF: DataFrame whose ``text`` column holds the inputs.
        onehot_encoded: Target matrix aligned row-by-row with ``trainDF``.

    Returns:
        ``(train_x, train_y, valid_x, valid_y, test_x, test_y,
        number_of_test_data)``.
    """
    # Hold out 20 % of the data first ...
    train_x, holdout_x, train_y, holdout_y = train_test_split(
        trainDF['text'], onehot_encoded, test_size=0.2, random_state=42)
    # ... then cut the hold-out in half: first half is test, second is validation.
    test_x, valid_x, test_y, valid_y = train_test_split(
        holdout_x, holdout_y, test_size=0.5, random_state=41)

    # Report the size of every split.
    number_of_train_data = np.shape(train_x)[0]
    print("Number of train data:", number_of_train_data)

    number_of_valid_data = np.shape(valid_x)[0]
    print("Number of validation data:",number_of_valid_data)

    number_of_test_data = np.shape(test_x)[0]
    print("Number of test data:",number_of_test_data, "\n")

    return train_x, train_y, valid_x, valid_y, test_x, test_y, number_of_test_data

def tokenize_text(trainDF):
    """Fit a Keras ``Tokenizer`` on the ``text`` column and report its vocabulary size.

    Args:
        trainDF: DataFrame with a ``text`` column.

    Returns:
        Tuple ``(token, word_index)``: the fitted tokenizer and its
        word-to-index mapping.
    """
    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(trainDF['text'])

    vocabulary = tokenizer.word_index
    print('Number of unique words:',len(vocabulary), "\n")

    return tokenizer, vocabulary

def convert_text(number_of_words, token, train_x, valid_x, test_x):
    """Turn each text split into fixed-length sequences of token ids.

    Args:
        number_of_words: Target sequence length passed to ``pad_sequences``.
        token: Fitted tokenizer providing ``texts_to_sequences``.
        train_x, valid_x, test_x: Text collections of the three splits.

    Returns:
        ``(train_seq_x, valid_seq_x, test_seq_x)`` as padded id arrays.
    """
    def _to_padded_ids(texts):
        # Tokenise, then pad/truncate to the common length.
        return sequence.pad_sequences(token.texts_to_sequences(texts), number_of_words)

    train_seq_x = _to_padded_ids(train_x)
    valid_seq_x = _to_padded_ids(valid_x)
    test_seq_x = _to_padded_ids(test_x)
    print('convert text to tokens - Done! \n')

    return train_seq_x, valid_seq_x, test_seq_x

def load_language_model(fname):
    """Load word vectors from a text embedding file.

    The first line must consist of two integers (it is parsed and then
    ignored). Every following line is ``<token> <float> <float> ...``,
    separated by single spaces.

    Args:
        fname: Path of the embedding file.

    Returns:
        Dict mapping each token to a 1-D ``numpy`` array of its vector.

    Raises:
        ValueError: If the header line is not exactly two integers or a
            vector component is not a float.
    """
    data = {}
    # The context manager closes the file even if parsing fails part-way.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header values are not needed, but parsing them validates the format.
        _n_vectors, _dimension = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.array(list(map(float, tokens[1:])))

    print("load_patentVec-Done! \n")

    return data

def create_embedding_matrix(embeddings_index, word_index, embedding_dim=300):
    """Build the token-id -> embedding-vector lookup matrix.

    Row ``i`` holds the vector of the word whose index is ``i``. Words that
    are missing from ``embeddings_index`` keep an all-zero row, as does row 0,
    which no word in ``word_index`` is expected to use.

    Args:
        embeddings_index: Mapping word -> vector of length ``embedding_dim``.
        word_index: Mapping word -> integer row index.
        embedding_dim: Length of each embedding vector (default 300).

    Returns:
        Tuple ``(embedding_matrix, num_words)`` where ``num_words`` is
        ``len(word_index) + 1`` and the matrix has shape
        ``(num_words, embedding_dim)``.
    """
    num_words = len(word_index) + 1
    embedding_matrix = np.zeros((num_words, embedding_dim))

    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix, num_words

def calculate_probabilities(multihop_encoded_train):
    """Turn a multi-hot encoding into per-row probabilities.

    Every entry equal to 1 is replaced by ``1 / (sum of its row)``, e.g.
    ``1 1 0 0 -> 0.5 0.5 0 0``. All other entries, and rows without any 1,
    become 0.

    Args:
        multihop_encoded_train: 2-D array with one multi-hot row per sample.

    Returns:
        Float array of the same shape as the input.
    """
    encoded = np.asarray(multihop_encoded_train)
    a = np.zeros(encoded.shape)

    # Vectorised replacement for a Python loop over every (sample, label) cell:
    # locate the active cells once and fill them from the per-row totals.
    rows, cols = np.nonzero(encoded == 1)
    row_totals = encoded.sum(axis=1)
    a[rows, cols] = 1.0 / row_totals[rows]

    print('calculate_probabilities-Done! \n')

    return a

def make_predictions(test_seq_x, test_y, classifier):
    """Score the test inputs and derive predicted and true class indices.

    Args:
        test_seq_x: Inputs passed to ``classifier.predict``.
        test_y: Encoded targets; the arg-max of each row is taken as the
            true class.
        classifier: Object exposing ``predict``.

    Returns:
        ``(predictions, prediction, y_true)``: the raw scores, the arg-max
        class per sample, and the arg-max of ``test_y`` per sample.
    """
    class_scores = classifier.predict(test_seq_x)

    predicted_class = np.argmax(class_scores, axis=-1)
    true_class = np.argmax(test_y, axis=-1)
    print('make_predictions-Done! \n')

    return class_scores, predicted_class, true_class

def calculate_metrics_single_label(predictions, prediction, y_true, number_of_test_data):
    """Print and return accuracy, MRR and the top-3 / top-5 / top-10 hit rates.

    Args:
        predictions: Per-sample class scores, one row per sample.
        prediction: Predicted class index per sample.
        y_true: True class index per sample.
        number_of_test_data: Number of leading rows to rank.

    Returns:
        ``(accuracy_total, MRR, P3, P5, P10)``. All values except MRR are
        percentages; each P@k is the share of samples whose true class is
        ranked within the top k scores.
    """
    accuracy_total = metrics.accuracy_score(prediction, y_true) * 100
    print("Accuracy:", accuracy_total)

    # Class indices of every row, ordered from highest to lowest score.
    ranked_classes = np.fliplr(predictions.argsort())

    reciprocal_ranks = []
    hits_within = {3: 0, 5: 0, 10: 0}
    for row in range(0, number_of_test_data):
        # 1-based position of the true class in this row's ranking.
        rank = ranked_classes[row, :].tolist().index(y_true[row]) + 1
        reciprocal_ranks.append(1 / rank)
        for cutoff in hits_within:
            if rank <= cutoff:
                hits_within[cutoff] += 1

    MRR = np.mean(reciprocal_ranks)
    print("MRR:", MRR)
    P3 = hits_within[3] / number_of_test_data * 100
    print("P@3:", P3)
    P5 = hits_within[5] / number_of_test_data * 100
    print("P@5:", P5)
    P10 = hits_within[10] / number_of_test_data * 100
    print("P@10:", P10)

    return accuracy_total, MRR, P3, P5, P10

def calculate_metrics_multi_label(number_of_test_samples, number_of_codes, predictions_kull, nn, multihop_encoded_test):
    """Print micro-averaged precision, recall and F1 for top-``nn`` predictions.

    For every sample the ``nn`` highest-scoring classes are marked as
    predicted, and the resulting multi-hot matrix is compared with
    ``multihop_encoded_test``.

    Args:
        number_of_test_samples: Number of rows to evaluate.
        number_of_codes: Number of label columns.
        predictions_kull: Per-sample class scores.
        nn: How many top classes to mark per sample.
        multihop_encoded_test: True multi-hot targets.

    Returns:
        None; the three scores are printed as percentages.
    """
    # A reversed slice ``[:-k:-1]`` yields the last k-1 items, hence the +1.
    slice_stop = nn + 1

    predicted_multi_hot = np.zeros((number_of_test_samples, number_of_codes))
    for sample in range(number_of_test_samples):
        top_classes = np.argsort(predictions_kull[sample])[:-slice_stop:-1]
        predicted_multi_hot[sample, top_classes] = 1

    for score_fn in (metrics.precision_score, metrics.recall_score, metrics.f1_score):
        print(score_fn(multihop_encoded_test, predicted_multi_hot, average='micro')*100)

    return None

def kill_model():
    """Reset the global Keras state so the next model starts from a clean session.

    Only Keras' own global state is cleared. Model objects held by the caller
    (e.g. ``classifier1``) are not deleted here; rebinding the variable
    releases them.
    """
    try:
        # ``keras`` is the module imported at the top of the notebook.
        keras.backend.clear_session()
    except Exception:
        # Best effort: a failed reset must not abort the experiment loop.
        print('No model to clear \n')

### Classification models

In [3]:
def _build_softmax_classifier(maxlen, num_words, number_of_codes, embedding_matrix, make_recurrent_layer):
    """Assemble and compile the network shared by every ``create_*`` builder.

    Pipeline: token ids -> frozen pre-trained embedding -> spatial dropout
    (rate 0.1) -> recurrent encoder -> softmax over the label codes.

    Args:
        maxlen: Length of the input token sequences.
        num_words: Number of rows in the embedding matrix.
        number_of_codes: Number of output classes.
        embedding_matrix: Pre-trained embedding weights; its second
            dimension sets the embedding size.
        make_recurrent_layer: Zero-argument callable returning the recurrent
            layer. It is invoked after the embedding layers are built.

    Returns:
        The compiled Keras model (its summary is printed as a side effect).
    """
    # Add an input layer
    input_layer = layers.Input((maxlen, ))

    # Add the word embedding layer; the pre-trained weights stay frozen
    embedding_layer = layers.Embedding(num_words, embedding_matrix.shape[1], weights=[embedding_matrix], trainable=False)(input_layer)

    # Add the spatial dropout layer
    embedding_layer = layers.SpatialDropout1D(0.1)(embedding_layer)

    # Add the recurrent encoder chosen by the caller
    recurrent_output = make_recurrent_layer()(embedding_layer)

    # Add the output layer
    output_layer2 = layers.Dense(number_of_codes, activation="softmax")(recurrent_output)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])

    model.summary()

    return model

def create_bidirectional_lstm(maxlen, num_words, number_of_codes, embedding_matrix):
    """Classifier with a bidirectional LSTM (100 units per direction, dropout 0.1)."""
    return _build_softmax_classifier(
        maxlen, num_words, number_of_codes, embedding_matrix,
        lambda: layers.Bidirectional(layers.LSTM(100, recurrent_dropout=0.1, dropout=0.1)))

def create_bidirectional_lstm_200(maxlen, num_words, number_of_codes, embedding_matrix):
    """Classifier with a bidirectional LSTM (200 units per direction, dropout 0.1)."""
    return _build_softmax_classifier(
        maxlen, num_words, number_of_codes, embedding_matrix,
        lambda: layers.Bidirectional(layers.LSTM(200, recurrent_dropout=0.1, dropout=0.1)))

def create_bidirectional_gru(maxlen, num_words, number_of_codes, embedding_matrix):
    """Classifier with a bidirectional GRU (100 units per direction)."""
    return _build_softmax_classifier(
        maxlen, num_words, number_of_codes, embedding_matrix,
        lambda: layers.Bidirectional(layers.GRU(100)))

def create_lstm(maxlen, num_words, number_of_codes, embedding_matrix):
    """Classifier with a single unidirectional LSTM (100 units, dropout 0.1)."""
    return _build_softmax_classifier(
        maxlen, num_words, number_of_codes, embedding_matrix,
        lambda: layers.LSTM(100, recurrent_dropout=0.1, dropout=0.1))

def create_gru(maxlen, num_words, number_of_codes, embedding_matrix):
    """Classifier with a single unidirectional GRU (100 units)."""
    return _build_softmax_classifier(
        maxlen, num_words, number_of_codes, embedding_matrix,
        lambda: layers.GRU(100))

def create_bidirectional_lstm_probabilities_kullback(maxlen, num_words, number_of_codes, embedding_matrix):
    """Bidirectional LSTM classifier used for training on probability targets.

    Despite the name, the model is compiled with categorical cross-entropy and
    has the same architecture as ``create_bidirectional_lstm``.
    """
    return _build_softmax_classifier(
        maxlen, num_words, number_of_codes, embedding_matrix,
        lambda: layers.Bidirectional(layers.LSTM(100, recurrent_dropout=0.1, dropout=0.1)))

### Main 1: Different sections

In [12]:
number_of_words=60
epochs = 15
batch_size = 128
parts=[1, 2, 3, 4]

file1= "/datasets/title.csv"
file2= "/datasets/abstract.csv"
file3 = "/datasets/description.csv"
file4 = "/datasets/claims.csv"

# Patent section (input file) that each part number trains on.
part_files = {1: file1, 2: file2, 3: file3, 4: file4}

# The language model is loaded on the first iteration and reused afterwards,
# so the loop works for any selection or order of `parts`.
embeddings_index = None

for part in parts:

    print("Part: ", part)

    trainDF_p1 = load_patents_single_label(part_files[part])

    onehot_encoded, onehot_encoder = encode_single_labels(trainDF_p1)
    number_of_codes = enumarate_codes(onehot_encoded)

    # NOTE: the validation split is produced but not passed to fit() below.
    train_x_p1, train_y_p1, valid_x_p1, valid_y_p1, test_x_p1, test_y_p1, number_of_test_data_p1 = split_dataset(
        trainDF_p1, onehot_encoded)

    token_p1, word_index_p1 = tokenize_text(trainDF_p1)

    train_seq_x_p1, valid_seq_x_p1, test_seq_x_p1 = convert_text(
        number_of_words, token_p1, train_x_p1, valid_x_p1, test_x_p1)

    if embeddings_index is None:
        embeddings_index = load_language_model('/embeddings/patent-300.vec')
    embedding_matrix_p1, num_words_p1 = create_embedding_matrix(embeddings_index, word_index_p1)

    kill_model()
    classifier1 = create_bidirectional_lstm(number_of_words, num_words_p1, number_of_codes, embedding_matrix_p1)
    history1 = classifier1.fit(train_seq_x_p1, train_y_p1, epochs=epochs, batch_size=batch_size, verbose=1)

    predictions_p1, prediction_p1, y_true1 = make_predictions(test_seq_x_p1, test_y_p1, classifier1)
    accuracy_total1, MRR1, P3_1, P5_1, P10_1 = calculate_metrics_single_label(
        predictions_p1, prediction_p1, y_true1, number_of_test_data_p1)

    #Save the final predictions
    #df=pd.DataFrame(predictions_p1)
    #file_name="clefip_part"+str(part)+".csv"
    #df.to_csv(file_name, header=False, index=False)

    #Save qrel
    #df=pd.DataFrame(y_true1)
    #file_name="clefip_qrel.csv"
    #df.to_csv(file_name, header=False, index=False)

1
Number of codes:  731 

Number of train data: 432904
Number of validation data: 54114
Number of test data: 54113 

Number of unique words: 66352 

convert text to tokens - Done! 

No model to clear 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 60)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 60, 300)           19905900  
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 60, 300)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 200)               320800    
_________________________________________________________________
dense_2 (Dense)              (None, 731)               146931    
Total params: 20,373,631
Trainable params: 467,731
Non-trainable params:

### Main 2: Different classification models

In [None]:
number_of_words=60
epochs = 15
batch_size = 128
cl_models=[1, 2, 3, 4]

file1= "/datasets/abstract.csv"

trainDF_p1 = load_patents_single_label(file1)

onehot_encoded, onehot_encoder = encode_single_labels(trainDF_p1)
number_of_codes = enumarate_codes(onehot_encoded)

# NOTE: the validation split is produced but not passed to fit() below.
train_x_p1, train_y_p1, valid_x_p1, valid_y_p1, test_x_p1, test_y_p1, number_of_test_data_p1 = split_dataset(
    trainDF_p1, onehot_encoded)

token_p1, word_index_p1 = tokenize_text(trainDF_p1)

train_seq_x_p1, valid_seq_x_p1, test_seq_x_p1 = convert_text(
    number_of_words, token_p1, train_x_p1, valid_x_p1, test_x_p1)

embeddings_index = load_language_model('/embeddings/patent-300.vec')
embedding_matrix_p1, num_words_p1 = create_embedding_matrix(embeddings_index, word_index_p1)

#for releasing resources we can delete the language model after creating the embedding matrix
#del embeddings_index

# Builder function for each model number in `cl_models`.
model_builders = {
    1: create_bidirectional_lstm,
    2: create_bidirectional_gru,
    3: create_lstm,
    4: create_gru,
}

for cl_model in cl_models:

    # Reset Keras state before every model, as the per-part loop in Main 1 does.
    kill_model()
    classifier1 = model_builders[cl_model](number_of_words, num_words_p1, number_of_codes, embedding_matrix_p1)

    history1 = classifier1.fit(train_seq_x_p1, train_y_p1, epochs=epochs, batch_size=batch_size, verbose=1)
    predictions_p1, prediction_p1, y_true1 = make_predictions(test_seq_x_p1, test_y_p1, classifier1)
    accuracy_total1, MRR1, P3_1, P5_1, P10_1 = calculate_metrics_single_label(
        predictions_p1, prediction_p1, y_true1, number_of_test_data_p1)

    #Save the final predictions
    #df=pd.DataFrame(predictions_p1)
    #file_name="clefip_model"+str(cl_model)+".csv"
    #df.to_csv(file_name, header=False, index=False)

### Main 3: Different architecture

In [None]:
number_of_words=60
epochs = 15
batch_size = 128
networks=[1, 2]

file1= "/datasets/abstract.csv"

trainDF_p1 = load_patents_single_label(file1)

onehot_encoded, onehot_encoder = encode_single_labels(trainDF_p1)
number_of_codes = enumarate_codes(onehot_encoded)

# NOTE: the validation split is produced but not passed to fit() below.
train_x_p1, train_y_p1, valid_x_p1, valid_y_p1, test_x_p1, test_y_p1, number_of_test_data_p1 = split_dataset(
    trainDF_p1, onehot_encoded)

token_p1, word_index_p1 = tokenize_text(trainDF_p1)

train_seq_x_p1, valid_seq_x_p1, test_seq_x_p1 = convert_text(
    number_of_words, token_p1, train_x_p1, valid_x_p1, test_x_p1)

embeddings_index = load_language_model('/embeddings/patent-300.vec')
embedding_matrix_p1, num_words_p1 = create_embedding_matrix(embeddings_index, word_index_p1)
#del embeddings_index

# Builder function for each network number in `networks`
# (100 vs. 200 LSTM units per direction).
network_builders = {
    1: create_bidirectional_lstm,
    2: create_bidirectional_lstm_200,
}

for network in networks:

    # Reset Keras state before every model, as the per-part loop in Main 1 does.
    kill_model()
    classifier1 = network_builders[network](number_of_words, num_words_p1, number_of_codes, embedding_matrix_p1)

    history1 = classifier1.fit(train_seq_x_p1, train_y_p1, epochs=epochs, batch_size=batch_size, verbose=1)
    predictions_p1, prediction_p1, y_true1 = make_predictions(test_seq_x_p1, test_y_p1, classifier1)
    accuracy_total1, MRR1, P3_1, P5_1, P10_1 = calculate_metrics_single_label(
        predictions_p1, prediction_p1, y_true1, number_of_test_data_p1)

    #Save the final predictions
    #df=pd.DataFrame(predictions_p1)
    #file_name="clefip_network"+str(network)+".csv"
    #df.to_csv(file_name, header=False, index=False)

### Main 4: Different training models

In [None]:
number_of_words=60
epochs = 1
batch_size = 128
training_models=[1, 2]
#1: encode the main and all labels with one MultiLabelBinarizer, then train on the main labels only
#2: same encoding, but train on the per-row probabilities derived from all labels

# NOTE(review): machine-specific absolute paths; adjust before running elsewhere.
file1= "F:/PhD/Datasets-Results/clefip/Datasets/I3_dataset_multilabel/abstract.csv"

trainDF_p1 = load_patents_single_label(file1)
trainDF_p1_multi = load_patents_multi_label(file1)

multihop_encoder, multihop_encoded_all, multihop_encoded_main = encode_multi_labels(trainDF_p1, trainDF_p1_multi)
number_of_codes = enumarate_codes(multihop_encoded_all)

# The three calls use the same fixed seeds inside split_dataset.
train_x_main, train_y_main, valid_x_main, valid_y_main, test_x_main, test_y_main, number_of_test_data_main = split_dataset(
    trainDF_p1, multihop_encoded_main)
train_x_all, train_y_all, valid_x_all, valid_y_all, test_x_all, test_y_all, number_of_test_data_all = split_dataset(
    trainDF_p1_multi, multihop_encoded_all)

a = calculate_probabilities(multihop_encoded_all)

train_x_a, train_y_a, valid_x_a, valid_y_a, test_x_a, test_y_a, number_of_test_data_a = split_dataset(trainDF_p1, a)

token_p1, word_index_p1 = tokenize_text(trainDF_p1)

train_seq_x_p1, valid_seq_x_p1, test_seq_x_p1 = convert_text(
    number_of_words, token_p1, train_x_main, valid_x_main, test_x_main)

embeddings_index = load_language_model('F:/PhD/Datasets-Results/embeddings/patent-300.vec')
embedding_matrix_p1, num_words_p1 = create_embedding_matrix(embeddings_index, word_index_p1)

del embeddings_index

# Model builder and training targets for each training model. Both variants
# are evaluated against the main labels of the test split.
training_setups = {
    1: (create_bidirectional_lstm, train_y_main),
    2: (create_bidirectional_lstm_probabilities_kullback, train_y_a),
}

for training_model in training_models:

    # Reset Keras state before every model, as the per-part loop in Main 1 does.
    kill_model()
    build_classifier, train_targets = training_setups[training_model]

    classifier1 = build_classifier(number_of_words, num_words_p1, number_of_codes, embedding_matrix_p1)
    history1 = classifier1.fit(train_seq_x_p1, train_targets, epochs=epochs, batch_size=batch_size, verbose=1)
    predictions_p1, prediction_p1, y_true1 = make_predictions(test_seq_x_p1, test_y_main, classifier1)
    accuracy_total1, MRR1, P3_1, P5_1, P10_1 = calculate_metrics_single_label(
        predictions_p1, prediction_p1, y_true1, number_of_test_data_main)

    #Save the final predictions
    #df=pd.DataFrame(predictions_p1)
    #file_name="clefip_training_model"+str(training_model)+".csv"
    #df.to_csv(file_name, header=False, index=False)

### Load the stored predictions and qrel and create the ensemble of above classifiers

In [4]:
# Load the class-score tables stored by three different classifiers
# (header-less CSV files, one row per test sample).
in1, in2, in3 = [
    pd.read_csv(csv_path, header=None)
    for csv_path in ('predictions_p1.csv', 'predictions_p2.csv', 'predictions_p3.csv')
]

# Raw score matrices.
in1_predictions, in2_predictions, in3_predictions = [
    frame.to_numpy() for frame in (in1, in2, in3)
]

# Highest-scoring class per sample for each classifier.
in1_prediction, in2_prediction, in3_prediction = [
    np.argmax(scores, axis = -1)
    for scores in (in1_predictions, in2_predictions, in3_predictions)
]

In [5]:
# True classes of the test samples, read from a header-less CSV into an array.
true = pd.read_csv('qrel_numbers.csv', header=None).to_numpy()

In [6]:
# Rows are test samples, columns are label codes.
number_of_test_data, number_of_codes = in1_predictions.shape[:2]

In [None]:
def ensemble_predictions3(predictions_p1, predictions_p2, predictions_p3, number_of_test_data_p1, en):
    """Average the class scores of three classifiers and pick the best class.

    Args:
        predictions_p1, predictions_p2, predictions_p3: Score matrices of the
            three classifiers, one row per sample and one column per class.
        number_of_test_data_p1: Number of leading rows to combine.
        en: Number of combined classifiers. Unused; kept so existing calls
            keep working.

    Returns:
        Tuple ``(average_predictions, average_prediction)``: the element-wise
        mean of the three score matrices and the arg-max class per sample.
    """
    # Stack the first `number_of_test_data_p1` rows of each classifier and
    # average across classifiers in a single vectorised step.
    stacked_scores = [
        np.asarray(scores)[:number_of_test_data_p1]
        for scores in (predictions_p1, predictions_p2, predictions_p3)
    ]
    average_predictions_2 = np.mean(stacked_scores, axis=0)

    average_prediction = np.argmax(average_predictions_2, axis = -1)

    print('The ensemble predictions have been calculated! \n')

    return average_predictions_2, average_prediction

# Number of base classifiers combined by the averaging function.
ensembles = 3

average_predictions, average_prediction = ensemble_predictions3(
    in1_predictions, in2_predictions, in3_predictions, number_of_test_data, ensembles)
accuracy_total1, MRR1, P3_1, P5_1, P10_1 = calculate_metrics_single_label(
    average_predictions, average_prediction, true, number_of_test_data)
