### Libraries

In [2]:
import tensorflow
from tensorflow import keras 
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.layers import LSTM, GRU, Bidirectional, Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import SGD, RMSprop, Adadelta, Adam
from tensorflow.keras.preprocessing import text, sequence
import numpy as np
import pandas as pd
import io
import csv
from sklearn import model_selection, preprocessing, metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, StratifiedKFold

### Help functions

In [10]:
def load_patents(file1):
    
# This function loads the data

    trainDF = pd.read_csv(file1, header=None, usecols=[0,1])
    trainDF=trainDF.rename(columns={0: 'label'})
    trainDF=trainDF.rename(columns={1: 'text'})

    return trainDF

def encode_labels(trainDF):
    
#This function encodes the labels with OneHotEncoder

    labels_val=trainDF['label'].values

    onehot_encoder = preprocessing.OneHotEncoder(sparse=False)
    onehot_encoded = onehot_encoder.fit_transform(labels_val.reshape(-1, 1))
    
    return onehot_encoded, onehot_encoder

def enumarate_codes(onehot_encoded):

#This function encounters the total number of labels

    number_of_codes=np.shape(onehot_encoded)
    number_of_codes=number_of_codes[1]
    print("Number of codes: ", number_of_codes, "\n")
    
    return number_of_codes

def split_dataset(trainDF, onehot_encoded):

# This function splits the data into train, validation and test set (80:10:10)
    
    train_x, valid_x, train_y, valid_y = train_test_split(trainDF['text'], onehot_encoded, test_size=0.2, random_state=42) #stratify=onehot_encoded
    test_x, valid_x, test_y, valid_y = train_test_split(valid_x, valid_y, test_size=0.5, random_state=41)
        
    #Number of data per split
    
    number_of_train_data=np.shape(train_x)
    number_of_train_data=number_of_train_data[0]
    print("Number of train data:", number_of_train_data)

    number_of_valid_data=np.shape(valid_x)
    number_of_valid_data=number_of_valid_data[0]
    print("Number of validation data:",number_of_valid_data)

    number_of_test_data=np.shape(test_x)
    number_of_test_data=number_of_test_data[0]
    print("Number of test data:",number_of_test_data, "\n")
    
    return train_x, train_y,  valid_x, valid_y, test_x, test_y, number_of_test_data

def tokenize_text(trainDF):

#This function tokenizes the text
        
    token = text.Tokenizer()
    token.fit_on_texts(trainDF['text'])
    word_index = token.word_index
    print('Number of unique words:',len(word_index), "\n")
    
    return token, word_index

def convert_text(number_of_words, token, train_x, valid_x, test_x):

# This function converts the text to sequence of tokens and pad them till maxlen to ensure equal length vectors
    
    maxlen=number_of_words

    train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen)
    valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen)
    test_seq_x = sequence.pad_sequences(token.texts_to_sequences(test_x), maxlen)
    print('convert text to tokens - Done! \n')

    return train_seq_x, valid_seq_x, test_seq_x

def load_language_model(fname):

# This function loads the language model

    data = {}
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())    
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.array(list(map(float, tokens[1:])))
    
    print("load_patentVec-Done! \n")
    
    return data

def create_embedding_matrix(embeddings_index, word_index):

# This function creates a token-embedding matrix
    
    num_words=len(word_index) + 1
    embedding_matrix = np.zeros((num_words, 300))
    
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            
    return embedding_matrix, num_words

def create_bidirectional_lstm(maxlen, num_words, number_of_codes, embedding_matrix):
    
    # Add an input layer
    input_layer = layers.Input((maxlen, ))

    # Add the word embedding layer
    embedding_layer = layers.Embedding(num_words, embedding_matrix.shape[1], weights=[embedding_matrix], trainable=False)(input_layer)
    
    # Add the spatial dropout layer
    embedding_layer = layers.SpatialDropout1D(0.1)(embedding_layer)

    # Add a bi-directional layer
    lstm_layer = layers.Bidirectional(layers.LSTM(100, recurrent_dropout=0.1, dropout=0.1))(embedding_layer)

    # Add the output layer
    output_layer2 = layers.Dense(number_of_codes, activation="softmax")(lstm_layer)
 
    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])

    model.summary()
    
    return model

def kill_model():
    try:
        K.clear_session()
        del model
    except:
        print('No model to clear \n')    

In [4]:
def train(x_train, y_train, x_test, y_test, model, batch_size, epochs):
    history = model.fit(x_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(x_test, y_test),
            shuffle=True)
    
    return model, history

def evaluate(model, x_test, y_test):
    scores = model.evaluate(x_test, y_test, verbose=1)
    # print('Test loss:', scores[0])
    # print('Test accuracy:', scores[1])
    return scores

def predict(model, x_test):
    test_classes = model.predict(x_test, verbose=0)
    test_classes = np.argmax(test_classes, axis=-1)
    # print(test_classes.shape)
    return test_classes

def mostcommon(array):
    '''return the most common value of an array'''
    return np.bincount(array).argmax()

In [15]:
def adaboost(n_learners, epochs_lst, batch_size, sample_ratio=3, number_of_codes=731):

#This function creates the base classifiers and calculates their predictions
#Then, the function creates the adaboost ensemble classifier by keeping the probabilities from each base classifier
#and calculating the final predictions based on alphas and predictions from base classifiers

    num_classes = number_of_codes
    K = float(num_classes)

    #transform one-hot encoded codes to code numbers
    train_y_p1_help=np.argmax(train_y_p1,axis = -1)
    y_train_old = train_y_p1_help[:]
    test_y_p1_help=np.argmax(test_y_p1,axis = -1)
    y_test_old = test_y_p1_help[:] # save for error calculation
    
    #change the names of train_seq_x_p1, test_seq_x_p1, train_y_p1, test_y_p1
    y_train=train_y_p1
    y_test=test_y_p1
    x_train=train_seq_x_p1
    x_test=test_seq_x_p1
    
    n_trains = x_train.shape[0]
    n_tests = x_test.shape[0]
    
    #initislize needed elements
    weights = [1.0/n_trains for k in range(n_trains)]
    M = sample_ratio*n_trains # >> sample a large (>> m) unweighted set of instance according to p
    alphas = []
    test_accuracy_records = []
    probs = np.zeros((n_tests, num_classes))
    probs_2 = np.zeros((n_tests, n_learners), dtype="int64")
    
    for i in range(n_learners):
        
        #normalize deviding with sum_weights
        sum_weights = sum(weights)
        weights = [weight/sum_weights for weight in weights]
        
        if i ==0:
            # use the original dataset
            train_picks = np.arange(n_trains)
            x_train_i = x_train
            y_train_i = y_train
        else:
            # use the re-weighted train dataset
            train_picks = np.random.choice(n_trains, M, weights)
            x_train_i = x_train[train_picks, :]
            y_train_i = y_train[train_picks, :]

        epochs = epochs_lst[i]
        
        kill_model()
        model = create_bidirectional_lstm(number_of_words, num_words_p1, number_of_codes, embedding_matrix_p1) 
        model, history = train(x_train_i, y_train_i, x_test, y_test, model, batch_size, epochs)

        #changes based on prediction results

        print("model " + str(i))
        predicts = predict(model, x_train_i)    
        y_ref = y_train_old[train_picks].reshape((M, ))        
        num_error = np.count_nonzero(predicts - y_ref)        
        error = float(num_error)/M

        alpha = np.log((1 - error)/error) + np.log(K - 1)
               
        w_changed = np.zeros(n_trains)
        for j in range(M):
            index = train_picks[j]
            if predicts[j] != y_ref[j] and w_changed[index] == 0:
                w_changed[index] = 1
                weights[index] = weights[index] * np.exp(alpha)

        sum_weights = sum(weights)
        weights = [weight/sum_weights for weight in weights]

        train_picks = np.random.choice(n_trains, M, weights)
        x_train_i = x_train[train_picks, :]
        y_train_i = y_train[train_picks, :]
        
        #save the alphas
        alphas.append(alpha)
        print("alpha = " + str(alpha))

        #save the evaluation score
        scores = evaluate(model, x_test, y_test)
        test_accuracy_records.append(scores[1])        
        print("accuracy evaluate= " + str(scores[1]))
        
        ''' return final_predict based on weighted_vote of all the learners in models
        weight is the the accuracy of each learner'''
    
        pred=model.predict(x_test)
        probs = probs + alpha*pred
           
        ''' return final_predict based on majority vote of all the learners in models'''

        probs_2[:, i] = predict(model, x_test) # each column stores one learner's prediction
    
    print("Final scores:")
    #final predict weighted_vote     
    final_predict = np.argmax(probs, axis=-1)
    errors = np.count_nonzero(final_predict.reshape((n_tests, )) - y_test_old.reshape((n_tests,)))
    
    #final predict majority vote     
    final_predict_2 = np.zeros((n_tests, 1), dtype="int64")
    for i in range(n_tests):
        final_predict_2[i] = mostcommon(probs_2[i, :])
    errors_2 = np.count_nonzero(final_predict_2.reshape((n_tests, )) - y_test_old.reshape((n_tests,)))
    
    print('Adaboost ensemble - Accuracy based on weighted vote: %f' % ((n_tests - errors)/float(n_tests)))
    print('Adaboost ensemble - Accuracy based on majority vote: %f' % ((n_tests - errors_2)/float(n_tests)))

    for i in range(n_learners):
        print("Base classifiers/learners")
        print("learner %d (epochs = %d): %0.6f" % (i, epochs_lst[i], test_accuracy_records[i]))

    #Store final predictions
    #df=pd.DataFrame(probs)
    #df.sort_values(by=0, axis=1, ascending=False)
    #df.to_csv("F:/PhD/Datasets/ensemble/adaboost/predictions_adaboost_abstract_learners"+str(n_learners)+".csv", header=False, index=False)

    # Store rel
    #q_rel=y_test_old.reshape((n_tests,))
    #df_q_rel=pd.DataFrame(q_rel)
    #df_q_rel.to_csv('F:/PhD/Datasets/ensemble/adaboost/qrel_numbers_ada_abstract_5.csv', header=False, index=False, sep=',')


### Main code

In [16]:
number_of_words = 60
epochs_lst = [1, 1, 1]
batch_size = 128
n_learners = [3]#, 5, 7]
sample_ratio = 1

for i, learners in enumerate(n_learners):
    
    print(learners)
    
    '''trainDF_p1 = load_patents("F:/PhD/Datasets-Results/clefip/Datasets/I3_dataset_multilabel/abstract.csv")

    onehot_encoded, onehot_encoder=encode_labels(trainDF_p1)    
   
    number_of_codes=enumarate_codes(onehot_encoded)
    
    train_x_p1, train_y_p1, valid_x_p1, valid_y_p1, test_x_p1, test_y_p1, number_of_test_data_p1=split_dataset(trainDF_p1, onehot_encoded)

    token_p1, word_index_p1=tokenize_text(trainDF_p1)

    train_seq_x_p1, valid_seq_x_p1, test_seq_x_p1 =convert_text(number_of_words, token_p1, train_x_p1, valid_x_p1, test_x_p1)

    embeddings_index = load_language_model('F:/PhD/Datasets-Results/embeddings/patent-300.vec')
    embedding_matrix_p1, num_words_p1 =create_embedding_matrix(embeddings_index, word_index_p1)
    #del embeddings_index'''
    
    adaboost(learners, epochs_lst, batch_size, sample_ratio, number_of_codes)

3
No model to clear 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 60)                0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 60, 300)           52227000  
_________________________________________________________________
spatial_dropout1d_6 (Spatial (None, 60, 300)           0         
_________________________________________________________________
bidirectional_6 (Bidirection (None, 200)               320800    
_________________________________________________________________
dense_6 (Dense)              (None, 731)               146931    
Total params: 52,694,731
Trainable params: 467,731
Non-trainable params: 52,227,000
_________________________________________________________________
Train on 432904 samples, validate on 54113 samples
Epoch 1/1
model 0
alpha = 6.837898882534359
accurac