In [1]:
import pandas as pd
import yaml

In [2]:
# Load the patent dataset (number, title, claim text and pre-extracted keywords)
# from a CSV file located in the current working directory.
df = pd.read_csv("mechanism_nlp_complete_withkeywords.csv")
# Preview the first rows to sanity-check the columns.
df.head()


Unnamed: 0,patent_number,patent_title,patent_text,patent_keywords
0,RE30525,Extended range hydraulic transmission,Claims (1)\nHide Dependent \nI claim. 1. An ex...,"['said', 'means', 'hydraulic', 'shaft', 'outpu..."
1,RE30135,Electric fail-safe actuator,Claims (13)\nHide Dependent \nI claim: .[.1. A...,"['electric', 'said', 'motor', 'valve', 'actuat..."
2,RE29872,Differential gear mechanism,Claims (13)\nHide Dependent \nWhat is claimed ...,"['said', 'cam', 'cam member', 'differential ge..."
3,RE30334,Pressure compensated hermetically sealed trans...,Claims (15)\nHide Dependent \nWhat is claimed ...,"['transmission means', 'said', 'environment', ..."
4,RE30120,Lobe type pump adjustment,Claims (2)\nHide Dependent \nI claim: .[.1. A ...,"['hub', 'said', 'abutment', 'shaft', 'spring',..."


In [3]:
# Every cell of "patent_keywords" stores a list written out as text; parse each
# one into a real Python list so `keywords` becomes a list of keyword lists.
keywords = [yaml.safe_load(raw_cell) for raw_cell in df["patent_keywords"]]

In [4]:
# Show which GPU is attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Tue Aug  3 04:00:25 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0    35W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
%%time
%%capture
!pip install transformers

CPU times: user 26.1 ms, sys: 7.4 ms, total: 33.5 ms
Wall time: 2.63 s


In [6]:
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         AdamW, get_linear_schedule_with_warmup, \
                         TrainingArguments, BeamScorer, Trainer

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler

from IPython.display import clear_output

# Report the installed PyTorch build.
print("PyTorch version: {}".format(torch.__version__))

PyTorch version: 1.9.0+cu102


In [7]:
# Configurations
DEBUG           = False

INPUT_DIR       = 'articles'

# USE_APEX selects the batch-size / gradient-accumulation pair below;
# APEX_OPT_LEVEL is forwarded to TrainingArguments(fp16_opt_level=...).
USE_APEX        = True
APEX_OPT_LEVEL  = 'O1'

MODEL           = 'gpt2' #{gpt2, gpt2-medium, gpt2-large, gpt2-xl}

UNFREEZE_LAST_N = 6 #The last N layers to unfreeze for training

# Extra tokens registered with the tokenizer. BOS/SEP/EOS delimit the
# title, keyword and text segments of every training sample.
SPECIAL_TOKENS  = { "bos_token": "<|BOS|>",
                    "eos_token": "<|EOS|>",
                    "unk_token": "<|UNK|>",
                    "pad_token": "<|PAD|>",
                    "sep_token": "<|SEP|>"}

MAXLEN          = 768  #{768, 1024, 1280, 1600}

# Fraction of the records used for training; the rest is used for validation.
TRAIN_SIZE      = 0.8

# Per-device batch size and number of gradient-accumulation steps.
if USE_APEX:
    TRAIN_BATCHSIZE = 4
    BATCH_UPDATE    = 8
else:
    TRAIN_BATCHSIZE = 2
    BATCH_UPDATE    = 32

EPOCHS          = 20
LR              = 5e-3
EPS             = 1e-8
# Integer number of scheduler warm-up steps (was the float 1e2).
# NOTE(review): the recorded training log reports only 20 optimization steps
# in total, so a 100-step warm-up never completes — consider lowering it.
WARMUP_STEPS    = 100

SEED            = 2020

In [8]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes the convolution algorithm per run, which can
    # select different kernels between runs; it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False

# Apply the configured seed before any of the randomised steps below
# (id shuffling in split_data, keyword sampling in myDataset).
seed_everything(SEED)

In [9]:
# Index the patents by number:
#   patent_number -> [title, claim text, parsed keyword list]
data = {
    number: [heading, body, yaml.safe_load(raw_keywords)]
    for number, heading, body, raw_keywords in df.itertuples(index=False)
}

print(f"Number of articles: {len(data) :,}")

Number of articles: 50


In [10]:
# Inspect the parsed keyword list (element 2 of the record) of one sample patent.
( data.get('RE30525')[2])

['said',
 'means',
 'hydraulic',
 'shaft',
 'output',
 'intermediate shaft',
 'input',
 'planetary gearing',
 'planetary',
 'hydraulic elements',
 'gearing',
 'input shaft',
 'disengageable']

In [11]:
%%time

all_keywords = set()
for v in keywords:
    for w in v:
        all_keywords.add(w)


print(f"Number of unique keywords: {len(all_keywords) :,}")  

Number of unique keywords: 440
CPU times: user 624 µs, sys: 0 ns, total: 624 µs
Wall time: 542 µs


In [12]:
class myDataset(Dataset):
    """Torch dataset that turns patent records into language-model samples.

    Every sample is the string
    ``<BOS> title <SEP> comma-joined keywords <SEP> text <EOS>`` (using the
    tokens in SPECIAL_TOKENS), encoded by the tokenizer to MAXLEN tokens.
    """

    def __init__(self, data, tokenizer, randomize=True):
        """Store the titles, texts and keyword lists found in ``data``.

        Args:
            data: Mapping whose values are ``[title, text, keywords]``
                records; the keys are ignored.
            tokenizer: Callable used by ``__getitem__`` to encode a sample.
                It is called with the sample string plus ``truncation``,
                ``max_length`` and ``padding`` keyword arguments and must
                return a mapping with ``input_ids`` and ``attention_mask``.
            randomize: When true, the keywords of a sample are randomly
                truncated and shuffled every time the sample is fetched.
        """
        records = list(data.values())

        self.randomize = randomize
        self.tokenizer = tokenizer
        self.title     = [record[0] for record in records]
        self.text      = [record[1] for record in records]
        self.keywords  = [record[2] for record in records]

    #---------------------------------------------#

    @staticmethod
    def join_keywords(keywords, randomize=True):
        """Return ``keywords`` joined by commas.

        With ``randomize`` a prefix of random length (0..N keywords) is kept
        and shuffled. The caller's sequence is never modified.
        """
        if randomize:
            keep = random.choice(range(len(keywords) + 1))
            # The slice is a fresh list, so shuffling leaves the input intact.
            keywords = list(keywords[:keep])
            random.shuffle(keywords)

        return ','.join(keywords)

    #---------------------------------------------#

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.text)

    #---------------------------------------------#

    def __getitem__(self, i):
        """Build and encode sample ``i``.

        Returns:
            Dict with tensors ``label``, ``input_ids`` and ``attention_mask``.
        """
        kw = self.join_keywords(self.keywords[i], self.randomize)

        sample = SPECIAL_TOKENS['bos_token'] + self.title[i] + \
                 SPECIAL_TOKENS['sep_token'] + kw + SPECIAL_TOKENS['sep_token'] + \
                 self.text[i] + SPECIAL_TOKENS['eos_token']

        # Encode with the tokenizer handed to the constructor; relying on a
        # notebook-global `tokenizer` silently ignored the constructor argument.
        encodings_dict = self.tokenizer(sample,
                                        truncation=True,
                                        max_length=MAXLEN,
                                        padding="max_length")

        input_ids = encodings_dict['input_ids']
        attention_mask = encodings_dict['attention_mask']

        # NOTE(review): the labels are a plain copy of input_ids, so padded
        # positions are presumably included in the loss — confirm this is intended.
        return {'label': torch.tensor(input_ids),
                'input_ids': torch.tensor(input_ids),
                'attention_mask': torch.tensor(attention_mask)}
# A tensor is a multi-dimensional array of numbers: the tokenizer converts words to token ids, which are then wrapped in tensors.

In [13]:
def split_data(data, S=TRAIN_SIZE):
    """Randomly partition ``data`` into a training and a validation dict.

    Args:
        data: Dict of records keyed by id.
        S: Fraction of the records that goes into the training split.

    Returns:
        Tuple ``(train_data, val_data)``; together the two dicts hold every
        entry of ``data`` exactly once.
    """
    shuffled_ids = list(data.keys())
    random.shuffle(shuffled_ids)

    # The first `cut` shuffled ids form the training split, the rest validation.
    cut = int(S * len(data))

    train_data = {key: data[key] for key in shuffled_ids[:cut]}
    val_data = {key: data[key] for key in shuffled_ids[cut:]}

    return train_data, val_data

In [14]:
# Load the tokenizer, config and model from the transformers library. The model plays the same role as an estimator (e.g. a random forest) in scikit-learn.

def get_tokenier(special_tokens=None):
    """Load the pretrained tokenizer for MODEL.

    Args:
        special_tokens: Optional dict of special tokens to register with the
            tokenizer; when given, a confirmation line is printed.

    Returns:
        The tokenizer, with the special tokens added if any were supplied.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL) #GPT2Tokenizer

    if not special_tokens:
        return tokenizer

    tokenizer.add_special_tokens(special_tokens)
    print("Special tokens added")
    return tokenizer

def get_model(tokenizer, special_tokens=None, load_model_path=None):
    """Build the pretrained MODEL and place it on the best available device.

    Args:
        tokenizer: Tokenizer whose special-token ids (and vocabulary size,
            when special tokens are used) configure the model.
        special_tokens: Truthy when custom special tokens were added to the
            tokenizer; the config then receives the tokenizer's bos/eos/sep/pad
            ids and the embedding matrix is resized to ``len(tokenizer)``.
        load_model_path: Optional path to a saved state dict to restore.

    Returns:
        The model, moved to CUDA when available and otherwise left on the CPU.
    """
    #GPT2LMHeadModel
    if special_tokens:
        config = AutoConfig.from_pretrained(MODEL,
                                            bos_token_id=tokenizer.bos_token_id,
                                            eos_token_id=tokenizer.eos_token_id,
                                            sep_token_id=tokenizer.sep_token_id,
                                            pad_token_id=tokenizer.pad_token_id,
                                            output_hidden_states=False)
    else:
        # Without a dedicated pad token, padding reuses the EOS id.
        config = AutoConfig.from_pretrained(MODEL,
                                            pad_token_id=tokenizer.eos_token_id,
                                            output_hidden_states=False)

    #----------------------------------------------------------------#
    model = AutoModelForPreTraining.from_pretrained(MODEL, config=config)

    if special_tokens:
        #Special tokens added, model needs to be resized accordingly
        model.resize_token_embeddings(len(tokenizer))

    if load_model_path:
        # Deserialize onto the CPU first so a checkpoint written on a GPU can
        # be restored on any machine; the model is moved to its device below.
        model.load_state_dict(torch.load(load_model_path, map_location="cpu"))

    # Fall back to the CPU instead of crashing when no GPU is present.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    return model

In [15]:
%%time

tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                #   load_model_path='pytorch_model.bin'
                 )

Special tokens added
CPU times: user 4.24 s, sys: 1.9 s, total: 6.13 s
Wall time: 9.47 s


In [16]:
# Freeze every parameter, then re-enable training only for:
#   - the last UNFREEZE_LAST_N transformer blocks,
#   - the final layer norm (ln_f),
#   - the language-model head (lm_head).
for parameter in model.parameters():
    parameter.requires_grad = False

# Take the block count from the model itself: 12 is only the block count of
# the base 'gpt2' config, while MODEL may name one of the larger checkpoints.
n_blocks = len(model.transformer.h)
first_trainable = n_blocks - UNFREEZE_LAST_N

for i, m in enumerate(model.transformer.h):
    #Only un-freeze the last n transformer blocks
    if i >= first_trainable:
        for parameter in m.parameters():
            parameter.requires_grad = True

for parameter in model.transformer.ln_f.parameters():
    parameter.requires_grad = True

for parameter in model.lm_head.parameters():
    parameter.requires_grad = True

In [17]:
# Hold out part of the patents for validation, then wrap both splits as datasets.
train_data, val_data = split_data(data)

# Keyword randomisation stays on for training and is switched off for validation.
train_dataset = myDataset(data=train_data, tokenizer=tokenizer, randomize=True)
val_dataset = myDataset(data=val_data, tokenizer=tokenizer, randomize=False)

f'There are {len(train_dataset) :,} samples for training, and {len(val_dataset) :,} samples for validation testing'

'There are 40 samples for training, and 10 samples for validation testing'

In [18]:
%%time

training_args = TrainingArguments(
    output_dir="/content/",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    fp16=True,
    fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,    
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,        
    save_total_limit=1,
    load_best_model_at_end=True,     
)

#---------------------------------------------------#
trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

#---------------------------------------------------#
trainer.train()
trainer.save_model()    

# model.fit in sklearn !!!!

Using amp fp16 backend
***** Running training *****
  Num examples = 40
  Num Epochs = 20
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 8
  Total optimization steps = 20
  args.max_grad_norm,


Epoch,Training Loss,Validation Loss
0,No log,78.889557
1,No log,78.889557
2,No log,78.889557
3,No log,78.889557
4,No log,78.889557
5,No log,64.905167
6,No log,46.910324
7,No log,28.566193
8,No log,6.270707
9,No log,3.492936


***** Running Evaluation *****
  Num examples = 10
  Batch size = 4
Saving model checkpoint to /content/checkpoint-1
Configuration saved in /content/checkpoint-1/config.json
Model weights saved in /content/checkpoint-1/pytorch_model.bin
tokenizer config file saved in /content/checkpoint-1/tokenizer_config.json
Special tokens file saved in /content/checkpoint-1/special_tokens_map.json
  args.max_grad_norm,
***** Running Evaluation *****
  Num examples = 10
  Batch size = 4
Saving model checkpoint to /content/checkpoint-2
Configuration saved in /content/checkpoint-2/config.json
Model weights saved in /content/checkpoint-2/pytorch_model.bin
tokenizer config file saved in /content/checkpoint-2/tokenizer_config.json
Special tokens file saved in /content/checkpoint-2/special_tokens_map.json
Deleting older checkpoint [/content/checkpoint-4] due to args.save_total_limit
  args.max_grad_norm,
***** Running Evaluation *****
  Num examples = 10
  Batch size = 4
Saving model checkpoint to /content

CPU times: user 2min 28s, sys: 34.3 s, total: 3min 2s
Wall time: 8min 25s


In [19]:
# Rebuild the tokenizer and the model, this time restoring the fine-tuned weights.
# NOTE(review): the relative path presumably relies on the working directory
# being the Trainer's output_dir ("/content/") — confirm.
tokenizer = get_tokenier(SPECIAL_TOKENS)
model = get_model(tokenizer, SPECIAL_TOKENS, 'pytorch_model.bin')

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task

Special tokens added


loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50257,
  "embd_pdrop": 0.1,
  "eos_token_id": 50258,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50260,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "sep_token_id": 50261,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation":

In [20]:
# Prompt for generation: <BOS> title <SEP> keywords <SEP>, i.e. a training
# sample without its text, which the model is asked to continue.
title = "Pipe join in deap sea"
# NOTE: this rebinds `keywords`, which earlier held the parsed keyword lists of
# the whole dataset; re-run the loading cell before reusing that list.
keywords = ['joint', 'force', 'flexible', 'rotate', 'connecting', 'stress']
kw = myDataset.join_keywords(keywords, randomize=False)

prompt = SPECIAL_TOKENS['bos_token'] + title + \
         SPECIAL_TOKENS['sep_token'] + kw + SPECIAL_TOKENS['sep_token']

# Encode the prompt as a batch of one sequence.
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
# Put the prompt on whatever device the model lives on instead of assuming CUDA.
device = model.device
generated = generated.to(device)

# Switch the model to inference mode before generating (this does not train it).
model.eval();

In [21]:
# Sampled generation with combined top-k / top-p (nucleus) filtering; 10 samples.
# Plays the role that estimator.predict(...) plays in scikit-learn.
sample_outputs = model.generate(generated,
                                do_sample=True,
                                min_length=50,
                                max_length=MAXLEN,
                                top_k=30,
                                top_p=0.7,
                                temperature=0.9,
                                repetition_penalty=2.0,
                                num_return_sequences=10
                                )

# Every returned sequence begins with the prompt tokens. Drop them by token
# count: slicing the decoded string by character count breaks as soon as
# decoding does not reproduce the prompt text character for character.
prompt_length = generated.shape[-1]

for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output[prompt_length:], skip_special_tokens=True)
    print("{}: {}\n\n".format(i+1, text))

1: .
I. A small-scale engagement with the environment is not enough to. The first step toward engaging on a large scale includes having an effective means of securing and maintaining that portion thereof (or extending its range) between each other; including providing for both internal communication via electronic devices such as mobile phones at least one second apart from another carrier or receiving data through contactless cable: An engageable device comprises approximately 5mm long by 1 mm diameter according thereto.[6] This extends along two surfaces when engaged therewith[7]. In accordance solyaially said surface can include multiple elements comprising either external support components which are disposed together within common space,[8], independent control mechanisms wherein input into their respective portions constitutes substantially equal weight relative generally towards fixed positions around adjacent spaces respectively—including transmitting signals directly upon move