In [None]:
from sklearn.preprocessing import *
from sklearn.feature_extraction.text import *
import numpy as np
import pandas as pd
from tensorflow import keras
import scipy.sparse as sps
import tensorflow as tf
from tensorflow.keras.mixed_precision import experimental as mixed_precision

In [None]:
import os

# Set TensorFlow's automatic mixed-precision graph-rewrite flag for this process.
os.environ.update(TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE="1")

In [None]:
# policy = mixed_precision.Policy('mixed_float16')
# mixed_precision.set_policy(policy)

In [None]:
# Report how many GPUs TensorFlow can see before any heavy work starts.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

In [None]:
from util import *

In [None]:
# Both splits are gzip-compressed JSON-lines files with one record per line.
training_set, testing_set = (
    pd.read_json(path, lines=True, orient="records")
    for path in ("training_set.json.gz", "testing_set.json.gz")
)

In [None]:
# Each `cpc_codes` entry is used as-is as its own token sequence (identity
# analyzer); codes seen in fewer than 5 training documents are dropped.
code_vectorizer = CountVectorizer(analyzer=lambda codes: codes, min_df=5)

# Vocabulary is learned on the training split only and then reused for the test split.
code_vectors = code_vectorizer.fit_transform(training_set.cpc_codes)
test_code_vectors = code_vectorizer.transform(testing_set.cpc_codes)

# text_vectorizer = TfidfVectorizer(min_df=5)

# abstract_vectors = text_vectorizer.fit_transform(training_set.abstract)
# test_abstract_vectors = text_vectorizer.transform(testing_set.abstract)

# text_vectorizer = TfidfVectorizer(min_df=5)

# claim_vectors = text_vectorizer.fit_transform(training_set.claims)
# test_claim_vectors = text_vectorizer.transform(testing_set.claims)

# text_vectorizer = TfidfVectorizer(min_df=2, max_df=0.5)

# desc_vectors = text_vectorizer.fit_transform(training_set.description)
# test_desc_vectors = text_vectorizer.transform(testing_set.description)

In [None]:
# Label columns to model: every entry of `all_tiers_100` except "PersonalizedProduct".
# The names are sorted so the column order -- and therefore the meaning of each
# model output unit -- is the same on every kernel restart. Iterating a bare set
# of strings is subject to hash randomisation and can change between runs.
subset = sorted(set(all_tiers_100) - {"PersonalizedProduct"})
subset

In [None]:
# Integer label matrices, one column per entry of `subset`, for each split.
labels, test_labels = (
    frame[subset].values.astype(int) for frame in (training_set, testing_set)
)

In [None]:
# Sanity check: (rows of training_set, number of label columns in `subset`).
labels.shape

In [None]:
from transformers import *
from tensorflow import keras

In [None]:
from tensorflow.keras import layers, models

In [None]:
from transformers import TFAutoModel, AutoTokenizer
from tqdm.auto import tqdm


# # def tokenize(sentences, tokenizer):
# #     input_ids, input_masks, input_segments = [],[],[]
# #     for sentence in tqdm(sentences):
# #         inputs = tokenizer.encode_plus(sentence,
# #                                        truncation=True,
# #                                        add_special_tokens=True, 
# #                                        max_length=256,
# #                                        padding=True,
# #                                        return_tensors='tf',
# #                                        #pad_to_max_length=True,
# #                                        return_attention_mask=False, 
# #                                        return_token_type_ids=False)
# #         input_ids.append(inputs['input_ids'])
# #         input_masks.append(inputs['attention_mask'])
# #         input_segments.append(inputs['token_type_ids'])        
        
#     return np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32'), np.asarray(input_segments, dtype='int32')


In [None]:
#tokenizer = AutoTokenizer.from_pretrained("/var/patentmark/patentBERT/")
#transformer = TFAutoModel.from_pretrained("/var/patentmark/patentBERT/", from_pt=True)
#config = AutoConfig.from_pretrained("/var/patentmark/patentBERT/")

#tokenizer = AutoTokenizer.from_pretrained("./bertForPatents/")
#transformer = TFAutoModel.from_pretrained("./bertForPatents/")
#config = AutoConfig.from_pretrained("./bertForPatents/")
import os

# Location (local directory or hub id) of the pretrained checkpoint. Defaults to
# the original local path, but can be overridden without editing the notebook by
# setting the PATENT_BERT_PATH environment variable.
model_name = os.environ.get("PATENT_BERT_PATH", "/home/martin/IdeaProjects/phenetics/bertForPatents") #"johngiorgi/declutr-sci-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# from_pt=True: the checkpoint is loaded from PyTorch weights into the TF model class.
transformer = TFAutoModel.from_pretrained(model_name, from_pt=True)
config = AutoConfig.from_pretrained(model_name)

In [None]:
# Token budget per text; also the fixed length of the model's input layers.
max_length = 128

In [None]:
# Tokenise the training claims. padding="max_length" (rather than padding=True,
# which only pads to the longest sequence present) guarantees every row is
# exactly `max_length` ids wide, matching the fixed-size model input.
claim_input_ids = tokenizer(
    text=training_set.claims.to_list(),
    truncation=True,
    add_special_tokens=True,
    max_length=max_length,
    padding="max_length",
    return_tensors='tf',
    return_attention_mask=False,
    return_token_type_ids=False,
)

In [None]:
# Tokenise the test claims. padding="max_length" (rather than padding=True,
# which only pads to the longest sequence present) guarantees every row is
# exactly `max_length` ids wide, matching the fixed-size model input.
test_claim_input_ids = tokenizer(
    text=testing_set.claims.to_list(),
    truncation=True,
    add_special_tokens=True,
    max_length=max_length,
    padding="max_length",
    return_tensors='tf',
    return_attention_mask=False,
    return_token_type_ids=False,
)

In [None]:
# Tokenise the training abstracts. padding="max_length" (rather than
# padding=True, which only pads to the longest sequence present) guarantees
# every row is exactly `max_length` ids wide, matching the fixed-size model input.
abstract_input_ids = tokenizer(
    text=training_set.abstract.to_list(),
    truncation=True,
    add_special_tokens=True,
    max_length=max_length,
    padding="max_length",
    return_tensors='tf',
    return_attention_mask=False,
    return_token_type_ids=False,
)

In [None]:
# Tokenise the test abstracts. padding="max_length" (rather than padding=True,
# which only pads to the longest sequence present) guarantees every row is
# exactly `max_length` ids wide, matching the fixed-size model input.
test_abstract_input_ids = tokenizer(
    text=testing_set.abstract.to_list(),
    truncation=True,
    add_special_tokens=True,
    max_length=max_length,
    padding="max_length",
    return_tensors='tf',
    return_attention_mask=False,
    return_token_type_ids=False,
)

In [None]:
import tensorflow_addons as tfa

In [None]:
# claim_input_ids, claim_masks, _ = tokenize(training_set.claims, tokenizer)
# test_claim_input_ids, test_claim_masks, _ = tokenize(testing_set.claims, tokenizer)

In [None]:
def create_model(dropout_rate=0.5, hidden_units=64, learning_rate=1e-03, weight_decay=5e-5):
    """Build and compile the two-input multi-label classifier.

    Two int32 token-id inputs named "claim_tokens" and "abstract_tokens", each
    of length ``max_length``, are run through the same module-level
    ``transformer``. Element ``[1]`` of each transformer result is taken
    (presumably the pooled sequence representation -- confirm for the loaded
    checkpoint). The two vectors are concatenated and passed through a
    dropout -> dense(ELU) -> dense(sigmoid) head that emits one independent
    probability per entry of ``subset``.

    Args:
        dropout_rate: Dropout rate applied to the concatenated embeddings.
        hidden_units: Width of the hidden ELU layer.
        learning_rate: Learning rate of the AdamW optimizer.
        weight_decay: Weight decay of the AdamW optimizer.

    Returns:
        A compiled Keras model whose inputs are ordered
        [abstract_tokens, claim_tokens]. It is trained with binary
        cross-entropy and reports Hamming loss at a 0.5 threshold.

    NOTE(review): no attention mask is passed to the transformer, so padding
    positions are fed through like ordinary tokens -- confirm this is intended.
    """
    claim_ids_in = layers.Input(shape=(max_length,), dtype='int32', name="claim_tokens")
    abstract_ids_in = layers.Input(shape=(max_length,), dtype='int32', name="abstract_tokens")

    # One transformer instance encodes both texts, so its weights are shared.
    claim_embedding = transformer(claim_ids_in)[1]
    abstract_embedding = transformer(abstract_ids_in)[1]

    # Classification head on top of the concatenated text representations.
    features = layers.Concatenate()([abstract_embedding, claim_embedding])
    features = layers.Dropout(dropout_rate)(features)
    hidden = layers.Dense(hidden_units, activation='elu')(features)
    # Sigmoid (not softmax): each label is predicted independently.
    output = layers.Dense(units=len(subset), activation='sigmoid')(hidden)

    net = models.Model([abstract_ids_in, claim_ids_in], output)
    net.compile(
        loss=keras.losses.BinaryCrossentropy(),
        metrics=[tfa.metrics.HammingLoss(mode='multilabel', threshold=0.5)],
        optimizer=tfa.optimizers.AdamW(
            weight_decay=weight_decay,
            learning_rate=learning_rate,
            epsilon=1e-06,
            beta_1=0.9,
            beta_2=0.999,
            amsgrad=True,
        ),
    )
    return net

In [None]:
# Instantiate the compiled network defined by create_model().
model = create_model()

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
from datetime import datetime
from tensorflow.keras.callbacks import * 

# Timestamped TensorBoard directory so successive runs do not overwrite each other.
logdir = "logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
callbacks = [
    ReduceLROnPlateau(monitor='loss'),
    EarlyStopping(patience=5, monitor='val_loss'),
    ModelCheckpoint(filepath="keras-model", save_best_only=True),
    keras.callbacks.TensorBoard(log_dir=logdir),
    tfa.callbacks.TQDMProgressBar(),
]

# Feed only the inputs the model declares ("abstract_tokens" and "claim_tokens").
# The model has no "code_vector" input, so passing code_vectors.todense() merely
# materialised the whole sparse matrix in memory for a tensor that is never used.
model.fit(
    x={
        "abstract_tokens": abstract_input_ids['input_ids'],
        "claim_tokens": claim_input_ids['input_ids'],
    },
    y=labels,
    verbose=2,
    epochs=100,
    validation_split=0.2,
    batch_size=4,
    callbacks=callbacks,
)

In [None]:
# Peek at the first training label row (one integer per column in `subset`).
labels[0]

In [None]:
# Predicted probabilities on the training data. Only the two inputs the model
# declares are supplied; the unused dense "code_vector" matrix is no longer built.
preds = model.predict({
    "abstract_tokens": abstract_input_ids["input_ids"],
    "claim_tokens": claim_input_ids["input_ids"],
})

In [None]:
# Per-label decision threshold: the mean predicted score of each output column.
thresholds = preds.mean(axis=0)

In [None]:
# Binarise in a single comparison against the per-label thresholds. The previous
# two-step in-place version evaluated its second mask (`<=`) on values the first
# step had already overwritten, and disagreed with the `<` used for the test set.
preds = (preds >= thresholds).astype(preds.dtype)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-label classification metrics on the training data.
train_report = classification_report(labels, preds, target_names=subset)
print(train_report)

In [None]:
# Predicted probabilities on the held-out data. Only the two inputs the model
# declares are supplied; the raw sparse "code_vector" matrix matched no model input.
test_preds = model.predict({
    "abstract_tokens": test_abstract_input_ids["input_ids"],
    "claim_tokens": test_claim_input_ids["input_ids"],
})

In [None]:
# Binarise with the thresholds derived from the training predictions, using one
# comparison instead of two sequential in-place masked writes.
test_preds = (test_preds >= thresholds).astype(test_preds.dtype)

In [None]:
# Per-label classification metrics on the held-out data.
test_report = classification_report(test_labels, test_preds, target_names=subset)
print(test_report)