In [2]:
#tensorflow 1.11.0
#keras 2.3.1
#python 3.6.13

### Import libraries

In [3]:
import csv
import io
import math
from collections import Counter
from statistics import mean, stdev, median

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils import shuffle
from tensorflow import keras
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, LearningRateScheduler, EarlyStopping
from tensorflow.keras.layers import LSTM, GRU, Bidirectional, Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import SGD, RMSprop, Adadelta, Adam
from tensorflow.keras.preprocessing import text, sequence


### Create some helper functions

### Load csv file

In [4]:
def load_patents_with_header(file1):
    """Read the first two columns of a CSV file whose first row is a header.

    Parameters
    ----------
    file1 : str or file-like
        Handed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The first two columns of the file, named after the header row.
    """
    first_two_columns = [0, 1]
    # Missing values are returned as read; no fill is applied here.
    return pd.read_csv(file1, header=0, usecols=first_two_columns)

def load_patents(file1):
    """Read the first two columns of a header-less CSV file.

    Column 0 is renamed to ``label`` and column 1 to ``text``.

    Parameters
    ----------
    file1 : str or file-like
        Handed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``label`` and ``text``.
    """
    column_names = {0: 'label', 1: 'text'}
    raw_frame = pd.read_csv(file1, header=None, usecols=[0, 1])
    return raw_frame.rename(columns=column_names)

### Remove patents based on a condition

In [6]:
#remove codes used in patents over a threshold
def remove_rare_codes1(trainDF, down_threshold, up_threshold):
    """Keep only the patents whose code occurs at most ``down_threshold`` times.

    Codes whose frequency is strictly greater than ``down_threshold`` are
    dropped together with all of their rows.

    Rows are selected by position, so the input frame may carry any index
    (the previous implementation looked texts up by label ``mm-1`` and
    therefore only worked on a default ``RangeIndex``).  Selection is a single
    vectorised pass instead of one full scan of the frame per kept code.

    Parameters
    ----------
    trainDF : pandas.DataFrame
        Must contain the columns ``label`` and ``text``.
    down_threshold : int
        Largest frequency a code may have and still be kept.
    up_threshold : object
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``label`` (in that order) with a fresh 0..n-1
        index.  Rows are grouped by code, codes appear in order of first
        occurrence in the input, and rows within one code keep their input
        order.  The rows are NOT shuffled: the earlier ``shuffle(trainDF)``
        call discarded its result and so never had an effect; callers that
        need a random order must shuffle the returned frame themselves.
    """
    code_counts = Counter(trainDF['label'])
    kept_counts = Counter({code: freq for code, freq in code_counts.items()
                           if freq <= down_threshold})
    print(kept_counts)

    # Position of every kept code in first-occurrence order; used to group rows.
    code_rank = {code: position for position, code in enumerate(kept_counts)}

    labels = trainDF['label']
    # NaN never compares equal to itself, so rows with a missing label were
    # never selected before; exclude them explicitly because isin() matches NaN.
    keep_mask = (labels.isin(list(code_rank)) & labels.notna()).values
    kept_rows = trainDF.loc[keep_mask, ['text', 'label']]

    # Stable sort: group by code while preserving the input order inside a code.
    row_ranks = kept_rows['label'].map(code_rank).values.astype(np.int64)
    grouped_order = np.argsort(row_ranks, kind='mergesort')
    trainDF = kept_rows.iloc[grouped_order].reset_index(drop=True)

    print(trainDF.shape)

    my_list_validation = Counter(trainDF['label'])
    print(my_list_validation)

    return trainDF

In [8]:
#remove codes used in patents under a threshold
def remove_rare_codes_without_up(trainDF, down_threshold, up_threshold):
    """Keep only the patents whose code occurs at least ``down_threshold`` times.

    Codes whose frequency is strictly lower than ``down_threshold`` are
    dropped together with all of their rows.

    Rows are selected by position, so the input frame may carry any index
    (the previous implementation looked texts up by label ``mm-1`` and
    therefore only worked on a default ``RangeIndex``).  Selection is a single
    vectorised pass instead of one full scan of the frame per kept code.

    Parameters
    ----------
    trainDF : pandas.DataFrame
        Must contain the columns ``label`` and ``text``.
    down_threshold : int
        Smallest frequency a code must have to be kept.
    up_threshold : object
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``label`` (in that order) with a fresh 0..n-1
        index.  Rows are grouped by code, codes appear in order of first
        occurrence in the input, and rows within one code keep their input
        order.  The rows are NOT shuffled: the earlier ``shuffle(trainDF)``
        call discarded its result and so never had an effect; callers that
        need a random order must shuffle the returned frame themselves.
    """
    code_counts = Counter(trainDF['label'])
    kept_counts = Counter({code: freq for code, freq in code_counts.items()
                           if freq >= down_threshold})
    print(kept_counts)

    # Position of every kept code in first-occurrence order; used to group rows.
    code_rank = {code: position for position, code in enumerate(kept_counts)}

    labels = trainDF['label']
    # NaN never compares equal to itself, so rows with a missing label were
    # never selected before; exclude them explicitly because isin() matches NaN.
    keep_mask = (labels.isin(list(code_rank)) & labels.notna()).values
    kept_rows = trainDF.loc[keep_mask, ['text', 'label']]

    # Stable sort: group by code while preserving the input order inside a code.
    row_ranks = kept_rows['label'].map(code_rank).values.astype(np.int64)
    grouped_order = np.argsort(row_ranks, kind='mergesort')
    trainDF = kept_rows.iloc[grouped_order].reset_index(drop=True)

    print(trainDF.shape)

    my_list_validation = Counter(trainDF['label'])
    print(my_list_validation)

    return trainDF

### Encode the labels

In [9]:
def encode_labels(trainDF):
    """One-hot encode the ``label`` column of ``trainDF``.

    A new ``OneHotEncoder(sparse=False)`` is fitted on the labels and the
    first label is printed next to its encoding as an example.

    Returns
    -------
    tuple
        ``(onehot_encoded, onehot_encoder)``: the encoded labels and the
        fitted encoder.
    """
    raw_labels = trainDF['label'].values
    # The encoder is fed a single column, one label per row.
    label_column = raw_labels.reshape(-1, 1)

    onehot_encoder = preprocessing.OneHotEncoder(sparse=False)
    onehot_encoded = onehot_encoder.fit_transform(label_column)

    print("Example of an encoded label/target \n IPC code: ", raw_labels[0], "\n", "One-hot encoding:", onehot_encoded[0], "\n")

    return onehot_encoded, onehot_encoder

In [10]:
def encode_labels_new(trainDF, trainDF_test):
    """One-hot encode the labels of two frames with one shared encoder.

    The ``label`` columns of both frames are stacked (``trainDF`` first),
    encoded together with a single ``OneHotEncoder(sparse=False)``, and the
    result is cut back into the two original parts, so both parts have the
    same number of columns.

    Returns
    -------
    tuple
        ``(onehot_encoder, onehot_encoded, onehot_encoded_test)``: the fitted
        encoder, the rows belonging to ``trainDF`` and the rows belonging to
        ``trainDF_test``.
    """
    first_labels = trainDF['label'].values
    second_labels = trainDF_test['label'].values
    stacked_labels = np.hstack((first_labels, second_labels))

    onehot_encoder = preprocessing.OneHotEncoder(sparse=False)
    encoded_all = onehot_encoder.fit_transform(stacked_labels.reshape(-1, 1))

    # Rows [0, boundary) came from trainDF, the remainder from trainDF_test.
    boundary = len(first_labels)
    total = len(stacked_labels)
    onehot_encoded = encoded_all[0:boundary, :]
    onehot_encoded_test = encoded_all[boundary:total, :]

    return onehot_encoder, onehot_encoded, onehot_encoded_test

### Find the number of IPC codes

In [11]:
def enumarate_codes(onehot_encoded):
    """Return (and print) the size of the second axis of ``onehot_encoded``.

    For a one-hot matrix with one row per patent this is the number of
    distinct codes.
    """
    number_of_codes = np.shape(onehot_encoded)[1]
    print("Number of ipc codes: ", number_of_codes, "\n")

    return number_of_codes

### Split the dataset to train, validation and test data (80/10/10)

In [12]:
def split_dataset(trainDF, onehot_encoded):
    """Split texts and targets into train, validation and test parts.

    ``trainDF['text']`` and ``onehot_encoded`` are first split with
    ``test_size=0.2`` (``random_state=42``); the held-out part is then split
    again with ``test_size=0.5`` (``random_state=41``).  The first half of the
    second split becomes the test set and the second half the validation set.
    The size of every part is printed.

    Returns
    -------
    tuple
        ``(train_x, train_y, valid_x, valid_y, test_x, test_y,
        number_of_test_data)``.
    """
    # First cut: training data versus everything that is held out.
    train_x, holdout_x, train_y, holdout_y = train_test_split(
        trainDF['text'], onehot_encoded, test_size=0.2, random_state=42)
    # Second cut: the held-out data is halved into test and validation.
    test_x, valid_x, test_y, valid_y = train_test_split(
        holdout_x, holdout_y, test_size=0.5, random_state=41)
    print("split_abstract_dataset-Done! \n")

    # Number of data per split
    number_of_train_data = np.shape(train_x)[0]
    print("Number of train data:", number_of_train_data)

    number_of_valid_data = np.shape(valid_x)[0]
    print("Number of validation data:",number_of_valid_data)

    number_of_test_data = np.shape(test_x)[0]
    print("Number of test data:",number_of_test_data, "\n")

    return train_x, train_y,  valid_x, valid_y, test_x, test_y, number_of_test_data

### Text tokenization

In [14]:
def tokenize_text(trainDF):
    """Fit a Keras ``Tokenizer`` on the ``text`` column of ``trainDF``.

    The number of entries in the fitted tokenizer's ``word_index`` is
    printed.

    Returns
    -------
    tuple
        ``(token, word_index)``: the fitted tokenizer and its ``word_index``.
    """
    token = text.Tokenizer()
    token.fit_on_texts(trainDF['text'])

    word_index = token.word_index
    vocabulary_size = len(word_index)
    print('Number of unique words:',vocabulary_size, "\n")

    return token, word_index

### Convert text to sequence of tokens and pad them to ensure equal length vectors

In [15]:
def convert_text(number_of_words, token, train_x, valid_x, test_x):
    """Turn the three text splits into padded integer sequences.

    Every split is passed through ``token.texts_to_sequences`` and then
    through ``sequence.pad_sequences`` with ``number_of_words`` as the
    sequence length.

    Returns
    -------
    tuple
        ``(train_seq_x, valid_seq_x, test_seq_x)``.
    """
    maxlen = number_of_words

    def _to_padded(texts):
        # Tokenise first, then pad to the common length.
        return sequence.pad_sequences(token.texts_to_sequences(texts), maxlen)

    train_seq_x = _to_padded(train_x)
    valid_seq_x = _to_padded(valid_x)
    test_seq_x = _to_padded(test_x)
    print('convert text to tokens - Done! \n')

    return train_seq_x, valid_seq_x, test_seq_x

### Text representation - Load a pre-trained word embedding

We used the FastText and Patent-300 [1] pre-trained word embeddings.
The original loading function is provided by [fasttext.cc](https://fasttext.cc/docs/en/english-vectors.html) and was corrected based on https://github.com/facebookresearch/fastText/issues/882

[1] Risch, J., & Krestel, R. (2019). Domain-specific word embeddings for patent classification. Data Technologies and Applications, 53(1), 108-122. (The Patent-300 embedding was provided by the authors)

In [16]:
def load_fasttext(fname):
    """Load word vectors from a text file in the fastText ``.vec`` layout.

    The first line must hold two integers; it is parsed (so a malformed
    header still raises) but the values are not used.  Every following line
    is split on single spaces: the first field is the word, the remaining
    fields are converted to floats and stored as a NumPy array.

    The file is opened in a ``with`` block so the handle is closed even when
    parsing fails; previously it was never closed.

    Parameters
    ----------
    fname : str
        Path of the vector file (read as UTF-8, undecodable bytes ignored).

    Returns
    -------
    dict
        Maps each word to its vector.

    Raises
    ------
    ValueError
        If the header line or a vector component cannot be parsed.
    """
    data = {}
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header line; consumed so the loop below starts at the first vector.
        _header_first, _header_second = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.array(list(map(float, tokens[1:])))
    print("load_patentVec-Done! \n")

    return data

### Create a token-embedding mapping / an embedding matrix

In [17]:
def create_embedding_matrix(embeddings_index, word_index, embedding_dim=None):
    """Build the matrix that maps token ids to pre-trained word vectors.

    Row ``i`` holds the vector of the word whose id in ``word_index`` is
    ``i``; rows of words without a pre-trained vector (and row 0, which no
    word id in ``word_index`` is expected to use) stay all-zero.

    Parameters
    ----------
    embeddings_index : dict
        Maps words to their vectors.
    word_index : dict
        Maps words to integer ids; the matrix gets ``len(word_index) + 1``
        rows.
    embedding_dim : int, optional
        Width of the matrix.  When omitted it is taken from the first vector
        in ``embeddings_index`` and falls back to 300 (the previously
        hard-coded value) if the index is empty, so 300-dimensional
        embeddings behave exactly as before while other sizes now work too.

    Returns
    -------
    tuple
        ``(embedding_matrix, num_words)``.
    """
    if embedding_dim is None:
        first_vector = next(iter(embeddings_index.values()), None)
        embedding_dim = 300 if first_vector is None else len(first_vector)

    num_words = len(word_index) + 1
    embedding_matrix = np.zeros((num_words, embedding_dim))

    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix, num_words

### Kill the model

In [18]:
def kill_model():
    """Best-effort attempt to clear the Keras backend session.

    Calls ``K.clear_session()`` and prints ``'No model to clear'`` when that
    fails.  Only ``Exception`` subclasses are swallowed, so Ctrl-C
    (``KeyboardInterrupt``) and ``SystemExit`` are no longer silenced by a
    bare ``except``.

    The former ``del model`` statement was removed: ``model`` was a local name
    that is never bound inside this function, so the statement always raised
    and could never release a model owned by the caller.

    NOTE(review): no import visible in this notebook binds ``K``.  Unless it
    is defined elsewhere in the kernel, the call raises ``NameError`` and this
    function only prints the message.  Before importing the Keras backend as
    ``K``, check the call sites: clearing the session between training
    ``classifier1`` and using it again may invalidate that model.
    """
    try:
        K.clear_session()
    except Exception:
        print('No model to clear \n')

### Define a Bi-LSTM classifier

In [20]:
def create_bidirectional_lstm(maxlen, num_words, number_of_codes, embedding_matrix):
    """Build and compile the Bi-LSTM classifier.

    Layers, in order: an input of length ``maxlen``; a frozen embedding
    initialised with ``embedding_matrix``; spatial dropout (0.1); a
    bidirectional LSTM with 100 units per direction (dropout and recurrent
    dropout 0.1); a softmax layer with ``number_of_codes`` outputs.  The model
    is compiled with the Adam optimizer, categorical cross-entropy and the
    accuracy metric, and its summary is printed.

    Returns
    -------
    The compiled Keras model.
    """
    embedding_dim = embedding_matrix.shape[1]

    # Token ids -> frozen pre-trained vectors
    token_ids = layers.Input((maxlen, ))
    embedded = layers.Embedding(
        num_words,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
    )(token_ids)
    embedded = layers.SpatialDropout1D(0.1)(embedded)

    # Sequence encoder
    recurrent_cell = layers.LSTM(100, recurrent_dropout=0.1, dropout=0.1)
    encoded = layers.Bidirectional(recurrent_cell)(embedded)

    # One probability per code
    code_probabilities = layers.Dense(number_of_codes, activation="softmax")(encoded)

    model = models.Model(inputs=token_ids, outputs=code_probabilities)
    model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])

    model.summary()

    return model

### Predictions on test data

In [24]:
def make_predictions(test_seq_x, test_y, classifier):
    """Run ``classifier`` on ``test_seq_x`` and decode predictions and targets.

    Returns
    -------
    tuple
        ``(predictions, prediction, y_true)``: the raw output of
        ``classifier.predict``, the argmax of that output along the last
        axis, and the argmax of ``test_y`` along the last axis.
    """
    predictions = classifier.predict(test_seq_x)

    last_axis = -1
    prediction = np.argmax(predictions, axis=last_axis)
    y_true = np.argmax(test_y, axis=last_axis)
    print('make_predictions-Done! \n')

    return predictions, prediction, y_true

### Calculate the metrics

In [31]:
def calculate_metrics(predictions, prediction, y_true, number_of_test_data):
    """Compute and print accuracy, MRR and the top-3/5/10 hit rates.

    Parameters
    ----------
    predictions : numpy.ndarray
        Per-class scores, one row per test item.
    prediction : array-like
        Predicted class index per test item (used for the accuracy only).
    y_true : array-like
        True class index per test item.
    number_of_test_data : int
        Number of leading rows that are ranked; also the denominator of the
        hit rates.

    Returns
    -------
    tuple
        ``(accuracy_total, MRR, P3, P5, P10)``; accuracy and hit rates are
        percentages, MRR is the mean of ``1 / rank`` of the true class.
    """
    # Accuracy
    accuracy_total = metrics.accuracy_score(prediction, y_true) * 100
    print("Accuracy:", accuracy_total)

    # Class indices per row, highest score first.
    ranked_classes = np.fliplr(predictions.argsort())

    # 1-based position of the true class in every ranked row.
    true_class_ranks = []
    for row in range(0, number_of_test_data):
        ordered_classes = ranked_classes[row, :].tolist()
        true_class_ranks.append(ordered_classes.index(y_true[row]) + 1)

    def _hits_within(cutoff):
        # Number of items whose true class is ranked at or above `cutoff`.
        return sum(1 for rank in true_class_ranks if rank <= cutoff)

    # MRR
    MRR = np.mean([1 / rank for rank in true_class_ranks])
    print("MRR:", MRR)
    # P@3, P@5, P@10
    P3 = _hits_within(3) / number_of_test_data * 100
    print("P@3:", P3)
    P5 = _hits_within(5) / number_of_test_data * 100
    print("P@5:", P5)
    P10 = _hits_within(10) / number_of_test_data * 100
    print("P@10:", P10)

    return accuracy_total, MRR, P3, P5, P10

# Main code - Loop over different numbers of words and code-removal thresholds

In [None]:
# Sequence lengths to try; only one value is active at the moment.
#words = [61, 101, 201, 301, 401, 501]
words = [60]
epochs = 15
batch_size = 128
# 1: train on the codes with at least 500 patents, 2: train on the codes with at most 500 patents
number_of_dataset = [1,2]

# Load the pre-trained word vectors once, before the loops.  This call used to
# be commented out inside the loop, which left `embeddings_index` undefined on
# a fresh kernel (NameError in create_embedding_matrix).
embeddings_index = load_fasttext('patent-300.vec')

# The loop indices of the former enumerate() calls were unused, and the inner
# one shadowed the outer one, so the collections are iterated directly.
for number_of_words in words:
    for dataset in number_of_dataset:
        if dataset==1:

            #load the file and remove the codes used in less than 500 patents
            trainDF = load_patents("abstract.csv")  #https://github.com/ekamater/CLEFIP-0.54M
            trainDF_p1=remove_rare_codes_without_up(trainDF, 500, 0)

            #load the file with the removed codes
            #trainDF_p1 = load_patents_with_header('500_clefip.csv')

            #shuffle and reset the index
            idx = np.random.permutation(trainDF_p1.index)
            trainDF_p1=trainDF_p1.reindex(idx)
            trainDF_p1=trainDF_p1.reset_index(drop=True)

            #store the file with the removed codes
            #trainDF_p1.to_csv('500_clefip.csv', index=False)

            #encode the labels (full set and filtered set share one encoder)
            onehot_encoder, onehot_encoded_initial, onehot_encoded =encode_labels_new(trainDF, trainDF_p1)
            number_of_codes=enumarate_codes(onehot_encoded)

            #split, tokenize, convert and pad the text of patents with the removed codes
            train_x_p1, train_y_p1, valid_x_p1, valid_y_p1, test_x_p1, test_y_p1, number_of_test_data_p1=split_dataset(trainDF_p1, onehot_encoded)
            #the tokenizer is fitted on ALL patents and reused for dataset 2
            token_p1, word_index_p1=tokenize_text(trainDF)
            train_seq_x_p1, valid_seq_x_p1, test_seq_x_p1 =convert_text(number_of_words, token_p1, train_x_p1, valid_x_p1, test_x_p1)

            #split, convert and pad the text of all patents
            train_x, train_y, valid_x, valid_y, test_x, test_y, number_of_test_data=split_dataset(trainDF, onehot_encoded_initial)
            train_seq_x, valid_seq_x, test_seq_x =convert_text(number_of_words, token_p1, train_x, valid_x, test_x)

            #create the embedding matrix (also reused by dataset 2)
            embedding_matrix_p1, num_words_p1 =create_embedding_matrix(embeddings_index, word_index_p1)

            #create the classifier
            kill_model()
            classifier1 = create_bidirectional_lstm(number_of_words, num_words_p1, number_of_codes, embedding_matrix_p1)

            #train the classifier
            earlystop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
            #NOTE(review): both classifiers checkpoint to the same file, so the
            #second run overwrites the weights saved by the first one
            filepath="weights.best.hdf5"
            checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

            history1=classifier1.fit(train_seq_x_p1, train_y_p1, \
                                     validation_data = (valid_seq_x_p1, valid_y_p1), \
                                     callbacks=[earlystop, checkpoint], \
                                     epochs=epochs, \
                                     batch_size=batch_size, \
                                     verbose=1)

            #save the trained classifier
            #classifier1.save("abstract_500_clefip")

            #predictions for testing patents with the removed codes
            predictions_p1, prediction_p1, y_true1=make_predictions(test_seq_x_p1, test_y_p1, classifier1)
            accuracy_total1, MRR1, P3_1, P5_1, P10_1 = calculate_metrics(predictions_p1, prediction_p1, y_true1, number_of_test_data_p1)

            #predictions for all patents
            predictions, prediction, y_true=make_predictions(test_seq_x, test_y, classifier1)
            accuracy_total, MRR, P3, P5, P10 = calculate_metrics(predictions, prediction, y_true, number_of_test_data)

            #store the results
            #df=pd.DataFrame(predictions)
            #df.sort_values(by=0, axis=1, ascending=False)
            #df.to_csv('predictions_500_all.csv', header=False, index=False)

            #df_p1=pd.DataFrame(predictions_p1)
            #df_p1.sort_values(by=0, axis=1, ascending=False)
            #df_p1.to_csv('predictions_500_part.csv', header=False, index=False)

            #df_q_rel=pd.DataFrame(y_true)
            #df_q_rel.to_csv('qrel_500_all.csv', header=False, index=False, sep=',')

        if dataset==2:
            #This branch reuses token_p1, num_words_p1 and embedding_matrix_p1
            #created by the dataset==1 branch, so dataset 1 must run first.

            #load the file and remove the codes used in more than 500 patents
            trainDF = load_patents("abstract.csv")
            trainDF_p2=remove_rare_codes1(trainDF, 500, 0)

            #load the file with the removed codes
            #trainDF_p2 = load_patents_with_header('0-500_clefip.csv')

            #shuffle and reset the index
            idx = np.random.permutation(trainDF_p2.index)
            trainDF_p2=trainDF_p2.reindex(idx)
            trainDF_p2=trainDF_p2.reset_index(drop=True)

            #store the file with the removed codes
            #trainDF_p2.to_csv('0-500_clefip.csv', index=False)

            #encode the labels (full set and filtered set share one encoder)
            onehot_encoder, onehot_encoded_initial, onehot_encoded =encode_labels_new(trainDF, trainDF_p2)
            number_of_codes=enumarate_codes(onehot_encoded)

            #split, tokenize (use the same tokenizer), convert and pad the text of patents with the removed codes
            train_x_p2, train_y_p2, valid_x_p2, valid_y_p2, test_x_p2, test_y_p2, number_of_test_data_p2=split_dataset(trainDF_p2, onehot_encoded)
            train_seq_x_p2, valid_seq_x_p2, test_seq_x_p2 =convert_text(number_of_words, token_p1, train_x_p2, valid_x_p2, test_x_p2)

            #split, convert and pad the text of all patents
            train_x, train_y, valid_x, valid_y, test_x, test_y, number_of_test_data=split_dataset(trainDF, onehot_encoded_initial)
            train_seq_x, valid_seq_x, test_seq_x =convert_text(number_of_words, token_p1, train_x, valid_x, test_x)

            #the embedding matrix built for dataset 1 is reused here
            #embedding_matrix_p1, num_words_p1 =create_embedding_matrix(embeddings_index, word_index_p1)

            #create the classifier
            kill_model()
            classifier2 = create_bidirectional_lstm(number_of_words, num_words_p1, number_of_codes, embedding_matrix_p1)

            #train the classifier
            earlystop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
            filepath="weights.best.hdf5"
            checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

            history2=classifier2.fit(train_seq_x_p2, train_y_p2, \
                                     validation_data = (valid_seq_x_p2, valid_y_p2), \
                                     callbacks=[earlystop, checkpoint], \
                                     epochs=epochs, \
                                     batch_size=batch_size, \
                                     verbose=1)

            #save the trained classifier
            #classifier2.save("abstract_0-500_clefip")

            #predictions for testing patents with the removed codes
            predictions_p2, prediction_p2, y_true2=make_predictions(test_seq_x_p2, test_y_p2, classifier2)
            accuracy_total2, MRR2, P3_2, P5_2, P10_2 = calculate_metrics(predictions_p2, prediction_p2, y_true2, number_of_test_data_p2)

            #predictions for all patents
            predictions_, prediction_, y_true_=make_predictions(test_seq_x, test_y, classifier2)
            accuracy_total_, MRR_, P3_, P5_, P10_ = calculate_metrics(predictions_, prediction_, y_true_, number_of_test_data)

            #store the results
            #df=pd.DataFrame(predictions_)
            #df.sort_values(by=0, axis=1, ascending=False)
            #df.to_csv('predictions_0-500_all.csv', header=False, index=False)

            #df_p1=pd.DataFrame(predictions_p2)
            #df_p1.sort_values(by=0, axis=1, ascending=False)
            #df_p1.to_csv('predictions_0-500_part.csv', header=False, index=False)

            #df_q_rel=pd.DataFrame(y_true_)
            #df_q_rel.to_csv('qrel_0-500_all.csv', header=False, index=False, sep=',')


In [35]:
            # Evaluate classifier2 on the test split of the patents that
            # remained after the codes were removed.
            predictions_p2, prediction_p2, y_true2 = make_predictions(
                test_seq_x_p2, test_y_p2, classifier2)
            accuracy_total2, MRR2, P3_2, P5_2, P10_2 = calculate_metrics(
                predictions_p2, prediction_p2, y_true2, number_of_test_data_p2)

            # Evaluate the same classifier on the test split of all patents.
            predictions_, prediction_, y_true_ = make_predictions(
                test_seq_x, test_y, classifier2)
            accuracy_total_, MRR_, P3_, P5_, P10_ = calculate_metrics(
                predictions_, prediction_, y_true_, number_of_test_data)
        


make_predictions-Done! 

Accuracy: 2.0764864057364805
MRR: 0.07450322711977107
P@3: 6.76725425754407
P@5: 10.531819539886465
P@10: 16.821033761577535
make_predictions-Done! 

Accuracy: 0.20512630975920756
MRR: 0.010233329694180957
P@3: 0.7484338329052168
P@5: 1.2307578585552454
P@10: 2.0457191432742596


In [36]:
# Stacking with probabilities:
#   meta_x_train / meta_x_test hold, side by side, the class probabilities
#   predicted by every base learner (one block of num_classes columns each);
#   meta_y_train / meta_y_test hold the one-hot encoded labels.

n_learners = 2
num_classes = number_of_codes # from the dataset

n_trains = train_seq_x.shape[0]
n_tests = test_seq_x.shape[0]

test_accuracy_records = []

meta_x_train = np.zeros((n_trains, n_learners*num_classes), dtype="float32")
meta_x_test = np.zeros((n_tests, n_learners*num_classes), dtype="float32")

# Base learner i fills the column block [i*num_classes, (i+1)*num_classes).
base_learners = (classifier1, classifier2)
for i, base_learner in zip(range(n_learners), base_learners):
    learner_columns = slice(i*num_classes, (i + 1)*num_classes)
    meta_x_train[:, learner_columns] = base_learner.predict(train_seq_x, verbose=0)
    meta_x_test[:, learner_columns] = base_learner.predict(test_seq_x, verbose=0)

# construct meta learning problem
meta_y_train = train_y # use one hot encode
meta_y_test = test_y

In [None]:
#meta_model_softmax

def meta_model_softmax(n_learners, num_classes):
    '''Create and compile the feed-forward meta model with a softmax output.

    The network takes the concatenated outputs of the base learners
    (``n_learners * num_classes`` values), passes them through one ReLU layer
    of the same width and ends in a softmax layer with ``num_classes`` units.
    It is compiled with categorical cross-entropy, the Adam optimizer and the
    accuracy metric, and its summary is printed.

    ``models.Sequential`` is used because the bare name ``Sequential`` is not
    imported by this notebook and raised NameError on a fresh kernel.

    Returns the compiled model.
    '''
    # create model
    in_dim = n_learners * num_classes
    model = models.Sequential()
    model.add(Dense(in_dim, input_dim=in_dim, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    # compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

meta_epochs = 20

kill_model()
super_model = meta_model_softmax(n_learners, num_classes)
# NOTE(review): the test split doubles as validation data here, so the
# validation curves are not an independent estimate.
super_model.fit(meta_x_train, meta_y_train, \
                batch_size=128, \
                epochs=meta_epochs, \
                validation_data=(meta_x_test, meta_y_test), \
                shuffle=True)

scores_softmax = super_model.evaluate(meta_x_test, meta_y_test, verbose=1)
print('Stack test accuracy: ', scores_softmax[1])

probs_softmax=super_model.predict(meta_x_test)
predict_softmax = np.argmax(probs_softmax, axis=-1)
y_true= np.argmax(meta_y_test, axis=-1)
accuracy_total, MRR, P3, P5, P10 = calculate_metrics(probs_softmax, predict_softmax, y_true, number_of_test_data)

#store the predictions
# Columns stay in class-index order.  The former
# df.sort_values(by=0, axis=1, ascending=False) call returned a new frame that
# was discarded, so it never changed the written file and has been removed.
df=pd.DataFrame(probs_softmax)
df.to_csv('predictions_stack_abstract_clefip_softmax.csv', header=False, index=False)

No model to clear 

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 1462)              2138906   
_________________________________________________________________
dense_8 (Dense)              (None, 731)               1069453   
Total params: 3,208,359
Trainable params: 3,208,359
Non-trainable params: 0
_________________________________________________________________
Train on 432904 samples, validate on 54113 samples
Epoch 1/20

In [None]:
#meta_model_sigmoid

def meta_model_sigmoid(n_learners, num_classes):
    '''Create and compile the feed-forward meta model with a sigmoid output.

    The network takes the concatenated outputs of the base learners
    (``n_learners * num_classes`` values), passes them through one ReLU layer
    of the same width and ends in a sigmoid layer with ``num_classes`` units.
    It is compiled with binary cross-entropy and the Adam optimizer, and its
    summary is printed.

    Two fixes compared with the previous version:

    * ``models.Sequential`` is used because the bare name ``Sequential`` is
      not imported by this notebook and raised NameError on a fresh kernel.
    * The metric is ``categorical_accuracy``.  With a binary cross-entropy
      loss Keras resolves the string ``'accuracy'`` to binary accuracy, which
      scores every output unit separately and is therefore close to 1 for
      one-hot targets regardless of whether the top class is right.

    Returns the compiled model.
    '''
    # create model
    in_dim = n_learners * num_classes
    model = models.Sequential()
    model.add(Dense(in_dim, input_dim=in_dim, activation='relu'))
    model.add(Dense(num_classes, activation='sigmoid'))
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
    model.summary()
    return model

meta_epochs = 20

kill_model()
super_model = meta_model_sigmoid(n_learners, num_classes)
# NOTE(review): the test split doubles as validation data here, so the
# validation curves are not an independent estimate.
super_model.fit(meta_x_train, meta_y_train, batch_size=128, epochs=meta_epochs, validation_data=(meta_x_test, meta_y_test), shuffle=True)

scores_sigmoid = super_model.evaluate(meta_x_test, meta_y_test, verbose=1)
print('Stack test accuracy: ', scores_sigmoid[1])

probs_sigmoid=super_model.predict(meta_x_test)
predict_sigmoid = np.argmax(probs_sigmoid, axis=-1)
y_true= np.argmax(meta_y_test, axis=-1)

accuracy_total, MRR, P3, P5, P10 = calculate_metrics(probs_sigmoid, predict_sigmoid, y_true, number_of_test_data)

#store the predictions
# Columns stay in class-index order.  The former
# df.sort_values(by=0, axis=1, ascending=False) call returned a new frame that
# was discarded, so it never changed the written file and has been removed.
# NOTE(review): the file name says "title" although this notebook loads
# abstract.csv and the softmax run writes "..._abstract_..."; confirm the
# intended name before renaming.
df=pd.DataFrame(probs_sigmoid)
df.to_csv('predictions_stack_title_clefip_sigmoid.csv', header=False, index=False)
