In [1]:
import pandas as pd
import re
import sys
sys.path.append("../")
from clean import TextCleaner

from transformers import BertPreTrainedModel, BertConfig, BertModel
from transformers import BertTokenizer
import torch
import math
import numpy as np
from patent_utils import *
import gc
from encode import BERTEncoder
from BERTSimilarity import DocumentBert, BERTSimilarityTrainer
from torchviz import make_dot

In [2]:
# Read the patent/TSD match table from CSV into a DataFrame.
df = pd.read_csv("../../data/patent-tsd-matches-11102020.csv")

# Split each column of interest into a (train, test) pair.
# NOTE(review): the three columns are split by three independent calls; rows
# stay aligned only if split_train_test partitions deterministically by
# position — confirm in patent_utils.
patent_documents, patent_test = split_train_test(df["patentClaim1"].values)
tsd_documents, tsd_test = split_train_test(df["TSDSection"].values)
labels, labels_test = split_train_test(df["match"].values)

In [3]:
# Build the encoder over the training split, handing it a fresh PatentCleaner.
# NOTE(review): PatentCleaner is not imported by name in this notebook;
# presumably it arrives through `from patent_utils import *` — confirm.
bertEncoder = BERTEncoder(
    patent_documents,
    tsd_documents,
    labels,
    PatentCleaner(),
)

# Tokenize the training data; the call returns four values.
(
    patent_representations,
    tsd_representations,
    correct_output,
    tsd_dict,
) = bertEncoder.tokenize_train_data()

# Show the shapes of the three returned tensors as the cell's output.
tuple(
    tensor.shape
    for tensor in (patent_representations, tsd_representations, correct_output)
)

(torch.Size([2660, 3, 3, 512]),
 torch.Size([2660, 1, 3, 512]),
 torch.Size([2660]))

In [4]:
# Construct the similarity trainer from the training-split documents and labels.
# The tokenized tensors produced by BERTEncoder in the previous cell are not
# passed in here; the trainer receives the same raw inputs the encoder did.
# NOTE(review): threshold=0.8 is presumably the similarity cutoff used to call
# a pair a match — confirm against BERTSimilarityTrainer.
bertSimilarityTrainer = BERTSimilarityTrainer(patent_documents, tsd_documents, labels, threshold=0.8)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing DocumentBert: ['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.1.attention.self.query.weight', 'bert.encoder.layer.1.attention.self.query.bias', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encoder.l

Some weights of DocumentBert were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.bert_patent.embeddings.word_embeddings.weight', 'bert.bert_patent.embeddings.position_embeddings.weight', 'bert.bert_patent.embeddings.token_type_embeddings.weight', 'bert.bert_patent.embeddings.LayerNorm.weight', 'bert.bert_patent.embeddings.LayerNorm.bias', 'bert.bert_patent.encoder.layer.0.attention.self.query.weight', 'bert.bert_patent.encoder.layer.0.attention.self.query.bias', 'bert.bert_patent.encoder.layer.0.attention.self.key.weight', 'bert.bert_patent.encoder.layer.0.attention.self.key.bias', 'bert.bert_patent.encoder.layer.0.attention.self.value.weight', 'bert.bert_patent.encoder.layer.0.attention.self.value.bias', 'bert.bert_patent.encoder.layer.0.attention.output.dense.weight', 'bert.bert_patent.encoder.layer.0.attention.output.dense.bias', 'bert.bert_patent.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.bert_patent.encoder.layer.0.a

In [5]:
# Run training. In the recorded run each epoch took roughly three minutes
# (333 iterations) and logged loss plus train/test accuracy; a checkpoint was
# written under models/ at epochs 0, 1 and 5 (each a new best test accuracy),
# and the run ended with "Early stopping" after epoch 9.
# NOTE(review): the third number on the bare log lines looks like a count of
# epochs since the last improvement — confirm in BERTSimilarityTrainer.train.
bertSimilarityTrainer.train()

     100%|██████████| 333/333 [02:56<00:00,  1.89it/s]


Train: Epoch 0, Loss=0.655116, Train accuracy=37.56%, Test accuracy=37.23%


       0%|          | 0/333 [00:00<?, ?it/s]

Current best model saved as models/22-11-2020.12.38.14_BERTsimilaritymodel.pth at epoch 0.
37.234042553191486 37.234042553191486 0


     100%|██████████| 333/333 [02:56<00:00,  1.88it/s]


Train: Epoch 1, Loss=0.493375, Train accuracy=41.82%, Test accuracy=85.64%


       0%|          | 0/333 [00:00<?, ?it/s]

Current best model saved as models/22-11-2020.12.38.14_BERTsimilaritymodel.pth at epoch 1.
85.63829787234043 85.63829787234043 0


     100%|██████████| 333/333 [02:56<00:00,  1.88it/s]
       0%|          | 0/333 [00:00<?, ?it/s]

Train: Epoch 2, Loss=0.385283, Train accuracy=53.45%, Test accuracy=82.98%
82.97872340425532 85.63829787234043 1


     100%|██████████| 333/333 [02:56<00:00,  1.88it/s]
       0%|          | 0/333 [00:00<?, ?it/s]

Train: Epoch 3, Loss=0.335095, Train accuracy=59.48%, Test accuracy=84.04%
84.04255319148936 85.63829787234043 2


     100%|██████████| 333/333 [02:56<00:00,  1.88it/s]
       0%|          | 0/333 [00:00<?, ?it/s]

Train: Epoch 4, Loss=0.307702, Train accuracy=63.50%, Test accuracy=82.98%
82.97872340425532 85.63829787234043 3


     100%|██████████| 333/333 [02:56<00:00,  1.88it/s]


Train: Epoch 5, Loss=0.296020, Train accuracy=66.22%, Test accuracy=86.17%


       0%|          | 0/333 [00:00<?, ?it/s]

Current best model saved as models/22-11-2020.12.38.14_BERTsimilaritymodel.pth at epoch 5.
86.17021276595744 86.17021276595744 0


     100%|██████████| 333/333 [02:56<00:00,  1.88it/s]
       0%|          | 0/333 [00:00<?, ?it/s]

Train: Epoch 6, Loss=0.276879, Train accuracy=68.59%, Test accuracy=76.60%
76.59574468085107 86.17021276595744 1


     100%|██████████| 333/333 [02:56<00:00,  1.88it/s]
       0%|          | 0/333 [00:00<?, ?it/s]

Train: Epoch 7, Loss=0.250396, Train accuracy=70.46%, Test accuracy=79.26%
79.25531914893617 86.17021276595744 2


     100%|██████████| 333/333 [02:56<00:00,  1.88it/s]
       0%|          | 0/333 [00:00<?, ?it/s]

Train: Epoch 8, Loss=0.230765, Train accuracy=72.10%, Test accuracy=75.00%
75.0 86.17021276595744 3


     100%|██████████| 333/333 [02:56<00:00,  1.88it/s]


Train: Epoch 9, Loss=0.223563, Train accuracy=73.58%, Test accuracy=79.79%
79.7872340425532 86.17021276595744 4
Early stopping


In [7]:
# Display the trainer's model; the cell output is the module-tree repr of the
# DocumentBert instance.
# NOTE(review): the execution count jumps from In [5] to In [7], so a cell was
# run and later removed — verify the notebook still passes Restart & Run All.
bertSimilarityTrainer.model

DocumentBert(
  (bert_patent): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin