In [24]:
import pandas as pd
import datasets
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, Trainer, TrainingArguments, TextClassificationPipeline
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import wandb
import os
import json

# Switch off the `tokenizers` library's internal parallelism via its environment flag.
os.environ.update(TOKENIZERS_PARALLELISM="false")
# Absolute path of the current directory; later cells build output paths from it.
basedir = os.path.abspath(os.curdir)

In [2]:
# CSV file holding the labelled texts; loaded through the generic "csv" builder
# of the Hugging Face `datasets` library.
file = 'roberta_data.csv'
dataset = datasets.load_dataset(
    "csv",
    data_files=file,
)

Using custom data configuration default-c3a4ebf2c9a3143f


Downloading and preparing dataset csv/default to /Users/christian/.cache/huggingface/datasets/csv/default-c3a4ebf2c9a3143f/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /Users/christian/.cache/huggingface/datasets/csv/default-c3a4ebf2c9a3143f/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
# Load the same CSV into pandas for quick inspection. Reuse `file` (defined in
# the dataset-loading cell) instead of repeating the literal path, so the
# Hugging Face dataset and this frame can never point at different files.
data = pd.read_csv(file)

In [4]:
# Preview the rows whose `label` is 1 (displayed as the cell result).
positive_mask = data["label"] == 1
data[positive_mask]

Unnamed: 0,text,label
25000,"Missense mutations in TUBB3, the gene that enc...",1
25001,"For resource-constrained IoT systems, data col...",1
25002,Objectives\r\nPrevious attempts at meta-analys...,1
25003,Increasing attention has been paid to the role...,1
25004,OBJECTIVES: The severe forms of hypertriglycer...,1
...,...,...
49995,Strong-gravitational lens systems with quadrup...,1
49996,The terminal complement-inhibitor eculizumab h...,1
49997,"Electrical winding faults, namely stator short...",1
49998,The advent of the clinically approved drug cis...,1


In [5]:
# Hold out 20% of the rows for evaluation. The split shuffles the data, so the
# seed is pinned: without it every kernel restart produces a different
# train/test partition and the reported metrics cannot be reproduced.
split_dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_data = split_dataset['train']
test_data = split_dataset['test']

In [6]:
# Load the pretrained `roberta-base` checkpoint with a 2-label sequence
# classification head, plus the matching fast tokenizer.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
# `model_max_length` is the documented tokenizer setting that `truncation=True`
# falls back to when no explicit length is given. The previous `max_length`
# keyword is a per-call encoding argument rather than a tokenizer setting.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', model_max_length=512)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [7]:
def tokenization(batched_text):
    """Tokenize one batch of texts with the module-level `tokenizer`.

    Args:
        batched_text: mapping with a ``'text'`` entry holding the texts to encode.

    Returns:
        Whatever `tokenizer` returns for those texts when called with
        ``padding=True`` and ``truncation=True``.
    """
    texts = batched_text['text']
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded


def _tokenize_whole_split(split):
    """Apply `tokenization` to `split` as one single batch (batch_size == number of rows)."""
    return split.map(tokenization, batched=True, batch_size=len(split))


# One batch per split means `padding=True` pads every example of a split to
# that split's longest sequence.
train_data = _tokenize_whole_split(train_data)
test_data = _tokenize_whole_split(test_data)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [8]:
def compute_metrics(pred):
    """Turn a prediction bundle into accuracy / F1 / precision / recall.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; the last three use ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth element of the result is not used here, so drop it.
    precision, recall, f1 = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )[:3]
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f1,
        precision=precision,
        recall=recall,
    )

# Checkpoints and logs are both written to <basedir>/results.
results_dir = basedir + '/results'

training_args = TrainingArguments(
    # --- where things are written ---
    output_dir=results_dir,
    logging_dir=results_dir,
    run_name='roberta-classification',
    # --- optimisation schedule ---
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,  # 4 * 16 = 64 examples per optimizer step per device
    warmup_steps=500,
    weight_decay=0.01,
    # --- evaluation and checkpointing (both once per epoch) ---
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # --- progress reporting and data loading ---
    logging_steps=8,
    disable_tqdm=False,
    dataloader_num_workers=8,
)

In [9]:
# Wire the model, the training configuration, both tokenized splits and the
# metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
)
# Record which device type is available: 'cuda' when torch sees a GPU, else 'cpu'.
has_gpu = torch.cuda.is_available()
device = 'cuda' if has_gpu else 'cpu'

In [10]:
# Run fine-tuning; the value returned by `train()` is displayed as the cell result.
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40000
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 1875
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mcrodriguezmer[0m ([33mology_ml[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3561,0.362228,0.8471,0.848749,0.831718,0.866492
2,0.2937,0.339273,0.8625,0.870393,0.816013,0.932539
3,0.1877,0.348097,0.8656,0.870943,0.83013,0.915977


wandb: Network error (ReadTimeout), entering retry loop.
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8
Saving model checkpoint to /Users/christian/Documents/openalex-snapshot/results/checkpoint-625
Configuration saved in /Users/christian/Documents/openalex-snapshot/results/checkpoint-625/config.json
Model weights saved in /Users/christian/Documents/openalex-snapshot/results/checkpoint-625/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num 

TrainOutput(global_step=1875, training_loss=0.3177709925969442, metrics={'train_runtime': 128769.8853, 'train_samples_per_second': 0.932, 'train_steps_per_second': 0.015, 'total_flos': 3.15733266432e+16, 'train_loss': 0.3177709925969442, 'epoch': 3.0})

In [12]:
# Show the current working directory; relative paths used in the notebook resolve against it.
os.getcwd()

'/Users/christian/Documents/openalex-snapshot'

In [13]:
# Persist the tokenizer first, then the model, into one folder so that both
# can later be reloaded from the same path.
save_directory = "./save_pretrained"
for artifact in (tokenizer, model):
    artifact.save_pretrained(save_directory)

tokenizer config file saved in ./save_pretrained/tokenizer_config.json
Special tokens file saved in ./save_pretrained/special_tokens_map.json
Configuration saved in ./save_pretrained/config.json
Model weights saved in ./save_pretrained/pytorch_model.bin


In [44]:
# Reload the classifier and its tokenizer from the folder written by `save_pretrained`.
pt_model = RobertaForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=save_directory,
)
pt_tokenizer = RobertaTokenizerFast.from_pretrained(
    pretrained_model_name_or_path=save_directory,
)

loading configuration file ./save_pretrained/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.21.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file ./save_pretrained/pytorch_model.bin
All model checkpoint weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSe

In [45]:
# Build the inference pipeline from the artifacts reloaded from disk
# (`pt_model` / `pt_tokenizer`) rather than the in-memory training objects, so
# the predictions exercise exactly what was saved and would be deployed.
# `top_k=2` asks for the scores of both classes. `truncation=True` is forwarded
# to the tokenizer so over-long texts are cut to the model's maximum length,
# matching the `truncation=True` used when tokenizing the training data.
pipe = TextClassificationPipeline(
    model=pt_model,
    tokenizer=pt_tokenizer,
    top_k=2,
    truncation=True,
)

In [63]:
# Print the first ten held-out texts, each followed by the pipeline's class scores.
# Rows are sliced first (`test_data[:10]`) so only ten rows are read, instead of
# materialising the whole `text` column and then discarding all but ten entries.
for abstract in test_data[:10]['text']:
    print(abstract)
    print(pipe(abstract))
    print('\n')
    


Flow microcalorimetric titrations of calmodulin with melittin at 25 degrees C revealed that the formation of the high-affinity one-to-one complex in the presence of Ca2+ (Comte, M., Maulet, Y., and Cox, J. A. (1983) Biochem, J. 209, 269-272) is entirely entropy driven (delta H0 = 30.3 kJ X mol-1; delta S0 = 275 J X K-1 X mol-1). Neither the proton nor the Mg2+ concentrations have any significant effect on the strength of the complex. In the absence of Ca2+, a nonspecific calmodulin-(melittin)n complex is formed; the latter is predominantly entropy driven, accompanied by a significant uptake of protons and fully antagonized by Mg2+. Enthalpy titrations of metal-free calmodulin with Ca2+ in the presence of an equimolar amount of melittin were carried out at pH 7.0 in two buffers of different protonation enthalpy. The enthalpy and proton release profiles indicate that: protons, absorbed by the nonspecific calmodulin-melittin complex, are released upon binding of the first Ca2+; Ca2+ bindi