### Libraries

In [7]:
import tensorflow
from tensorflow import keras 
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.layers import LSTM, GRU, Bidirectional
from tensorflow.keras.optimizers import SGD, RMSprop, Adadelta, Adam
from tensorflow.keras.preprocessing import text, sequence
import numpy as np
import pandas as pd
import io
import csv
from sklearn import model_selection, preprocessing, metrics
from sklearn.model_selection import train_test_split, StratifiedKFold
import re
import nltk
from nltk.corpus import stopwords
stopwords_dict=set(stopwords.words("english-v2-uspto-sklearn"))

### Help functions

In [8]:
def process_data(part, trainDF):

#This function presesses the text and the labels that have been loaded in a dataframe

# part: the textual field that we will use 
# trainDF: the dataframe

    #text
    if part==1:
        trainDF=trainDF.rename(columns={'keywords': 'text'})
    elif part==2:
        trainDF=trainDF.rename(columns={'Abstract': 'text'})

    #labels    
    trainDF=trainDF.rename(columns={'Y': 'label'})
   
    #replace the na rows with "" otherwise it returns an error
    trainDF['text']=trainDF['text'].fillna("")
    #lowercase
    trainDF['text']=trainDF['text'].str.lower()
    #delete all symbols except for a-z
    trainDF['text']=trainDF['text'].replace('[^a-z]', ' ', regex=True)       
    #delete the stopwords
    trainDF['text']=trainDF['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords_dict)]))
    #keep the first #number of words
    trainDF['text']=trainDF['text'].str.split().str[0:number_of_words]
    trainDF['text']=trainDF['text'].str.join(' ')
    
    print("The data has been loaded! \n")
    
    return trainDF

def encode_labels(trainDF):

#This function encodes the labels with OneHotEncoder

# trainDF: the dataframe that we will use 

    labels_val=trainDF['label'].values

    onehot_encoder = preprocessing.OneHotEncoder(sparse=False)
    onehot_encoded = onehot_encoder.fit_transform(labels_val.reshape(-1, 1))
    print("The labels have been encoded! \n")

    return onehot_encoder, onehot_encoded

def enumarate_codes(onehot_encoded):

#This function encounters the total number of labels

    number_of_codes=np.shape(onehot_encoded)
    number_of_codes=number_of_codes[1]
    print("Number of labels: ", number_of_codes, "\n")
    
    return number_of_codes

def split_dataset(trainDF, onehot_encoded):

#This function splits the dataset into training and rtesting set

    train_x, test_x, train_y, test_y = train_test_split(trainDF['text'], onehot_encoded, test_size=0.1, random_state=42) #stratify=onehot_encoded
    print("The dataset has been splitted into training and testing set! \n")
    
    return train_x, train_y,  test_x, test_y

def tokenize_text(trainDF):

#This function tokenizes the text

    token = text.Tokenizer()
    token.fit_on_texts(trainDF['text'])
    word_index = token.word_index
    print('Number of unique words:',len(word_index), "\n")
    
    return token, word_index

def convert_text(number_of_words, token, x):

# This function converts the text to sequence of tokens and pad them till maxlen to ensure equal length vectors

    maxlen=number_of_words

    seq_x = sequence.pad_sequences(token.texts_to_sequences(x), maxlen)
    print("The text has been converted to tokens! \n")

    return seq_x

def load_language_model(fname):

# This function loads the language model

    embeddings_dict = {}
    with open(fname, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector
    print("The word embeddings has been loaded! \n")

    return embeddings_dict

def create_embedding_matrix(embeddings_index, word_index, x):

# This function creates a token-embedding matrix

    num_words=len(word_index) + 1
    embedding_matrix = np.zeros((num_words, x))
    
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)         
    return embedding_matrix, num_words

def create_bidirectional_lstm(maxlen, num_words, number_of_codes, embedding_matrix):
    
# This function creates the classification model based on Bi-LSTM and categorical loss

    # Add an Input Layer
    input_layer = layers.Input((maxlen, ))
    # Add the Embedding Layer
    embedding_layer = layers.Embedding(num_words, embedding_matrix.shape[1], weights=[embedding_matrix], trainable=False)(input_layer)
    # Add the SpatialDropout1D Layer
    embedding_layer = layers.SpatialDropout1D(0.1)(embedding_layer)
    # Add a Bidirectional Layer
    lstm_layer = layers.Bidirectional(layers.LSTM(100, recurrent_dropout=0.1, dropout=0.1))(embedding_layer)                                                                                        
    # Add the Output Layer
    output_layer2 = layers.Dense(number_of_codes, activation="softmax")(lstm_layer)
 
    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
                                                                                            
    model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])

    model.summary()
    
    return model

def make_predictions(test_seq_x, test_y, classifier):

# This function makes the predictions in the test data

    predictions = classifier.predict(test_seq_x)
    prediction = np.argmax(predictions, axis = -1) 
    y_true = np.argmax(test_y,axis = -1)
    print('The predictions on test data have been calculated! \n')

    return predictions, prediction, y_true                                                                                           

def calculate_metrics(predictions, prediction, y_true, number_of_test_data):

# This function calculates for single-label classification the accuracy, MRR, the accuracy/precision @3, @5 and @10
    
    #Accuracy 
    accuracy_total=metrics.accuracy_score(prediction, y_true)*100
    print("Accuracy:", accuracy_total)

    #MRR, P@3, P@5, P@10
    all_rr=[]
    number_of_top_three=0
    number_of_top_five=0
    number_of_top_ten=0

    predictions_2=predictions.argsort()
    predictions_3=np.fliplr(predictions_2)
    for i in range (0, number_of_test_data):
        specific_prediction=predictions_3[i,:]
        list1 = specific_prediction.tolist()
        target=y_true[i]
        prediction_rank=list1.index(target)+1 
        #MRR
        RR=1/prediction_rank
        all_rr.append(RR)
        #P@3
        if prediction_rank<= 3:
            number_of_top_three=number_of_top_three+1
        #P@5
        if prediction_rank<= 5:
            number_of_top_five=number_of_top_five+1     
        #P@10
        if prediction_rank<= 10:
            number_of_top_ten=number_of_top_ten+1 
    MRR=np.mean(all_rr)
    print("MRR:", MRR)
    P3=number_of_top_three/number_of_test_data*100
    print("P@3:", P3)
    P5=number_of_top_five/number_of_test_data*100
    print("P@5:", P5)
    P10=number_of_top_ten/number_of_test_data*100
    print("P@10:", P10)
    
    return accuracy_total, MRR, P3, P5, P10

def kill_model():
    try:
        K.clear_session()
        del model
    except:
        print('No model to clear \n')   

### Load the dataset and run the main code

In [10]:
#The below instalations are necessary to read the excel file of the dataset
#!pip install openpyxl 
#!pip install xlrd 

number_of_words = 180
epochs = 40
batch_size = 128
datasets=[1, 2, 3] #1: WOS46985, 2: WOS11967, 3: WOS5736 
parts=[1, 2] #1: keywords, 2: Abstract

for j, dataset in enumerate(datasets):
    
    print(dataset)
    
    file="/WebOfScience/Meta-data/Data.xlsx"
    initial_DF=pd.read_excel(file, sheet_name='abstracts', engine='openpyxl')

    if dataset == 2 or dataset == 3: 
        if dataset == 2: 
            file_part="/WebOfScience/WOS11967/X.txt"    
        if dataset == 3: 
            file_part="/WebOfScience/WOS5736/X.txt"    

        DF_part = pd.read_csv(file_part, header=None, sep="\t")
        DF_part=DF_part.rename(columns={0: 'Abstract'})

        DF = pd.merge(DF_part, initial_DF, how='left', on='Abstract')
    else:
        DF=initial_DF.copy()

    for i, part in enumerate(parts):
        
        print("DF shape:", DF.shape)       
        print(part)
        
        DF_processed=process_data(part, DF)

        onehot_encoder, onehot_encoded=encode_labels(DF_processed)
        number_of_codes=enumarate_codes(onehot_encoded)
        train_x_p1, train_y_p1, test_x_p1, test_y_p1 =split_dataset(DF_processed, onehot_encoded)
        token_p1, word_index_p1=tokenize_text(DF_processed)

        train_seq_x_p1 =convert_text(number_of_words, token_p1, train_x_p1)    
        test_seq_x_p1 =convert_text(number_of_words, token_p1, test_x_p1)    

        embeddings_index = load_language_model('/embeddings/glove.6B.300d.txt')
        embedding_matrix_p1, num_words_p1 =create_embedding_matrix(embeddings_index, word_index_p1, 300)

        kill_model()
        classifier1 = create_bidirectional_lstm(number_of_words, num_words_p1, number_of_codes, embedding_matrix_p1) 
        history1=classifier1.fit(train_seq_x_p1, train_y_p1, epochs=epochs, batch_size=batch_size, verbose=1)           

        #Save the trained classifier
        #classifier1.save("wos_11967_part"+str(part)+"_40epoch")

        number_of_test_data=np.shape(test_y_p1)
        number_of_test_data=number_of_test_data[0]

        predictions_p1, prediction_p1, y_true1=make_predictions(test_seq_x_p1, test_y_p1, classifier1)            
        accuracy_total1, MRR1, P3_1, P5_1, P10_1 = calculate_metrics(predictions_p1, prediction_p1, y_true1, number_of_test_data)

        #Save the final predictions
        #df=pd.DataFrame(predictions_p1)
        #df.sort_values(by=0, axis=1, ascending=False)
        #file_name="wos_"+str(dataset)+"_part"+str(part)+"_40epoch.csv"
        #df.to_csv(file_name, header=False, index=False)
        
    #Save qrel
    #df=pd.DataFrame(y_true1)
    #df.sort_values(by=0, axis=1, ascending=False)
    #file_name="wos_+str(dataset)+_rel_numbers.csv"
    #df.to_csv(file_name, header=False, index=False)

1
DF shape: (46985, 7)
1
The data has been loaded! 

The labels have been encoded! 

Number of labels:  134 

The dataset has been splitted into training and testing set! 

Number of unique words: 38976 

The text has been converted to tokens! 

The text has been converted to tokens! 

The word embeddings has been loaded! 

No model to clear 

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 180)]             0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 180, 300)          11693100  
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 180, 300)          0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 200)               320800    
___________________________________________

The data has been loaded! 

The labels have been encoded! 

Number of labels:  11 

The dataset has been splitted into training and testing set! 

Number of unique words: 36672 

The text has been converted to tokens! 

The text has been converted to tokens! 

The word embeddings has been loaded! 

No model to clear 

Model: "model_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, 180)]             0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 180, 300)          11001900  
_________________________________________________________________
spatial_dropout1d_8 (Spatial (None, 180, 300)          0         
_________________________________________________________________
bidirectional_8 (Bidirection (None, 200)               320800    
_________________________________________________________________
den

### Load the stored predictions and qrel and create the ensemble of above classifiers

In [None]:
#repeat for the other two datasets
in1=pd.read_csv('wos_11967_part1_40epoch.csv',header=None)
in2=pd.read_csv('wos_11967_part2_40epoch.csv',header=None)

in1_predictions=in1.to_numpy()
in2_predictions=in2.to_numpy()

in1_prediction = np.argmax(in1_predictions, axis = -1) 
in2_prediction = np.argmax(in2_predictions, axis = -1) 

In [None]:
true=pd.read_csv('wos_11967_rel_numbers.csv',header=None)
true=true.to_numpy()

In [None]:
number_of_test_data_p1=in1_prediction.shape[0] 

In [None]:
def ensemble_predictions2(predictions_p1, predictions_p2, number_of_test_data_p1, en):

#This functions calculates the averaging of predictions for each label
    
    average_predictions = []
    i=0
    
    for i in range(number_of_test_data_p1):
        a=np.mean([predictions_p1[i], predictions_p2[i]], axis=0)
        average_predictions.append(a)
    
    average_predictions_2 = np.array(average_predictions)      
    
    average_prediction = np.argmax(average_predictions, axis = -1) 
    print(type(average_predictions_2), type(average_prediction))

    print('The ensemble predictions have been calculated! \n')

    return average_predictions_2, average_prediction

ensembles=2 #the number of base classifiers combined using an avaraging function
average_predictions, average_prediction=ensemble_predictions2(in1_predictions, in2_predictions, number_of_test_data_p1, ensembles)
accuracy_total, MRR, P3, P5, P10 = calculate_metrics(average_predictions, average_prediction, true, number_of_test_data_p1)