In [4]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import torch
import torchtext
from torch import nn
from torchtext.data import Field, LabelField
from torchtext.data import BucketIterator
from torchtext.datasets import IMDB
from transformers import BertTokenizer
from transformers import BertModel
from skorch import NeuralNetClassifier
from skorch.callbacks import Freezer
from skorch.callbacks import ProgressBar
import datasets
import nlp
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [5]:
training_set = pd.read_json("training_set.json.gz", lines=True, orient="records")
testing_set = pd.read_json("testing_set.json.gz", lines=True, orient="records")

In [7]:
training_set.all_tiers_100

0      SpecificationofUse_JointReplacement, Specifica...
1                          PersonalizedProduct_Guide/Jig
2      SpecificationofUse_JointReplacement, Specifica...
3      AnatomicalTarget_UpperExtremity, AnatomicalTar...
4      AnatomicalTarget_LowerExtremity_Knee, Anatomic...
                             ...                        
967    AnatomicalTarget_UpperExtremity, AnatomicalTar...
968    Manufacturing_AdditiveManufacturing, Manufactu...
969    PersonalizedProduct_Guide/Jig, Manufacturing_A...
970    AnatomicalTarget_LowerExtremity_Hip, Anatomica...
971    AnatomicalTarget_Torso, AnatomicalTarget, Manu...
Name: all_tiers_100, Length: 972, dtype: object

In [3]:
training_set['label'] = training_set.all_tiers_100
testing_set['label'] = testing_set.all_tiers_100

In [4]:
training_set.label

0      0
1      0
2      0
3      1
4      0
      ..
967    1
968    0
969    0
970    0
971    0
Name: label, Length: 972, dtype: int64

In [5]:
training_data = Dataset.from_pandas(training_set, split="training")
testing_data = Dataset.from_pandas(testing_set, split="testing")

In [6]:
#model_name = "google/reformer-enwik8"
#model_name = "/var/patentmark/transformer-training/patent-electra-v4"
#model_name = "allenai/scibert_scivocab_uncased"
model_name = ""
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1279.0, style=ProgressStyle(description…




OSError: Can't load tokenizer for 'google/reformer-enwik8'. Make sure that:

- 'google/reformer-enwik8' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'google/reformer-enwik8' is the correct path to a directory containing relevant tokenizer files



In [None]:
#training_data = training_data.map(lambda e: tokenizer(e['abstract'], truncation=True, padding='max_length'), batched=True)
training_data = training_data.map(lambda e: tokenizer(e['claims'], truncation=True, max_length=256, padding='max_length'), batched=True)

In [None]:
testing_data = testing_data.map(lambda e: tokenizer(e['claims'], max_length=256, truncation=True, padding='max_length'), batched=True)

In [None]:
training_data.set_format(type='torch')

In [None]:
testing_data.set_format(type='torch')

In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=1000,
    weight_decay=0.01,
    evaluation_strategy="steps",
    fp16=True,
    eval_steps=100,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=training_data,
    eval_dataset=testing_data
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()