In [1]:
!pip install -U adapter-transformers
!pip install datasets

Collecting adapter-transformers
  Downloading adapter_transformers-3.0.0-py3-none-any.whl (3.9 MB)
[K     |████████████████████████████████| 3.9 MB 13.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 58.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 54.7 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 33.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 8.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, adapter-transformers
  Attempting uninst

In [2]:
!git clone https://github.com/google-research-datasets/circa.git

Cloning into 'circa'...
remote: Enumerating objects: 36, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 36 (delta 10), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (36/36), done.


In [3]:
##### All Imports #########
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn import preprocessing
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback, AutoModelWithHeads
from transformers import TrainingArguments, Trainer

In [4]:
############ Read data #########################
df = pd.read_csv("/content/circa/circa-data.tsv", sep='\t', keep_default_na=False)
contexts = []
for index, row in df.iterrows():
  if row['context'] not in contexts:
    contexts.append(row['context'])
contexts
############ Prepare train and test data #############
d = {}
d_train = {}
d_test = {}
frames_train = [] 
frames_test = []
for i in range(0, len(contexts)):
    d[i] = df[df.context == contexts[i]]
for i in range(0, len(contexts)):
    d_train[i], d_test[i] = train_test_split(d[i], test_size=0.2)
for i in range(0, len(contexts)):
  frames_train.append(d_train[i])
  frames_test.append(d_test[i])
train_df = pd.concat(frames_train)
test_df = pd.concat(frames_test)
matched_labels = ["Yes", "No", "Yes, subject to some conditions", "In the middle, neither yes nor no", "Other", "NA"]

In [5]:
tokenizer = None
def preprocess_single_param(data_set, input_param, label_param):
  X = list(data_set[input_param])
  y = list(data_set[label_param])
  
  le = preprocessing.LabelEncoder()
  le.fit(matched_labels)

  X_train, X_val, y_train, y_val = train_test_split(X, list(le.transform(y)), test_size=0.1)
  X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
  X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

  return X_train_tokenized, y_train, X_val_tokenized, y_val

In [6]:
tokenizer = None
def preprocess_double_param(data_set, input_param1, input_param2, label_param):
  separator = [';']*data_set[input_param1].size
  X = list(data_set[input_param1]+separator+data_set[input_param2])
  y = list(data_set[label_param])
  
  le = preprocessing.LabelEncoder()
  le.fit(matched_labels)

  X_train, X_val, y_train, y_val = train_test_split(X, list(le.transform(y)), test_size=0.1)
  X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
  X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

  return X_train_tokenized, y_train, X_val_tokenized, y_val

In [7]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

def create_dataset(X_train_tokenized, y_train, X_val_tokenized, y_val):
  train_dataset = Dataset(X_train_tokenized, y_train)
  val_dataset = Dataset(X_val_tokenized, y_val)

  return train_dataset, val_dataset  

In [9]:
######## Training the model ############
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    # recall = recall_score(y_true=labels, y_pred=pred)
    # precision = precision_score(y_true=labels, y_pred=pred)
    # f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy}
    #  "precision": precision, "recall": recall, "f1": f1}

def training_arg(learn_rate, eval_strat, eval_steps, train_batch_size, test_batch_size, epochs, seed):
  args = TrainingArguments(
    output_dir="output",
    learning_rate=learn_rate,
    evaluation_strategy=eval_strat,
    eval_steps=eval_steps,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=test_batch_size,
    num_train_epochs=epochs,
    seed=seed,
    load_best_model_at_end=True,)
  return args

def create_trainer(model, training_arg, train_dataset, val_dataset):
  trainer = Trainer(
      model=model,
      args=training_arg,
      train_dataset=train_dataset,
      eval_dataset=val_dataset,
      compute_metrics=compute_metrics,
      callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
  )

  trainer.train()

In [10]:
######## Prepare test data #############
def prepare_test_single_input_param(test_data, input_param, label_param):
  X_test = list(test_data[input_param])
  y_test = list(test_data[label_param])
  X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)
  

  return X_test_tokenized, y_test

In [11]:
######## Prepare test data #############
def prepare_test_double_input_param(test_data, input_param1, input_param2, label_param):
  separator = [';']*test_data[input_param1].size
  X_test = list(test_data[input_param1]+separator+test_data[input_param2])
  y_test = list(test_data[label_param])
  X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)
  

  return X_test_tokenized, y_test

In [12]:
############# Predict ######################
# Load test data
def create_test_predict(model, X_test_tokenized, y_test):
  # Create torch dataset
  test_dataset = Dataset(X_test_tokenized)
  # Define test trainer
  test_trainer = Trainer(model)
  # Make prediction
  raw_pred, _, _ = test_trainer.predict(test_dataset)
  # Preprocess raw predictions
  y_pred = np.argmax(raw_pred, axis=1)
  le = preprocessing.LabelEncoder()
  le.fit(matched_labels)
  y_true = list(le.transform(y_test))
  # calculate accuracy
  print(accuracy_score(y_true, y_pred))

In [None]:
!rm -rf "output"

In [16]:
######### BERT-YN (Answer only) ##################
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)
X_train_tokenized, y_train, X_val_tokenized, y_val = preprocess_single_param(train_df, "answer-Y", "goldstandard2")
train_dataset, val_dataset = create_dataset(X_train_tokenized, y_train, X_val_tokenized, y_val)
args = training_arg(5e-5, "steps", 500, 8, 8, 3, 0)
create_trainer(model, args, train_dataset, val_dataset)

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7

Step,Training Loss,Validation Loss,Accuracy
500,0.8785,0.745735,0.754194
1000,0.7733,0.736555,0.76477
1500,0.7546,0.704862,0.765864
2000,0.7153,0.73689,0.761488
2500,0.7425,0.695455,0.769511
3000,0.6957,0.663915,0.774617
3500,0.5963,0.748345,0.772794
4000,0.5948,0.724392,0.781911
4500,0.5967,0.692693,0.787381


***** Running Evaluation *****
  Num examples = 2742
  Batch size = 8
Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2742
  Batch size = 8
Saving model checkpoint to output/checkpoint-1000
Configuration saved in output/checkpoint-1000/config.json
Model weights saved in output/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2742
  Batch size = 8
Saving model checkpoint to output/checkpoint-1500
Configuration saved in output/checkpoint-1500/config.json
Model weights saved in output/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2742
  Batch size = 8
Saving model checkpoint to output/checkpoint-2000
Configuration saved in output/checkpoint-2000/config.json
Model weights saved in output/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Nu

In [17]:
X_test_tokenized, y_test = prepare_test_single_input_param(test_df, "answer-Y", "goldstandard2")
model = BertForSequenceClassification.from_pretrained("output/checkpoint-4500", num_labels=6)
create_test_predict(model, X_test_tokenized, y_test)

loading configuration file output/checkpoint-4500/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "adapters": {
    "adapters": {},
    "config_map": {},
    "fusion_config_map": {},
    "fusions": {}
  },
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "ab

0.7819746244713431


In [None]:
!rm -rf "output"

In [None]:
######### BERT-YN (Question only) ##################
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)
X_train_tokenized, y_train, X_val_tokenized, y_val = preprocess_single_param(train_df, "question-X", "goldstandard2")
train_dataset, val_dataset = create_dataset(X_train_tokenized, y_train, X_val_tokenized, y_val)
args = training_arg(5e-5, "steps", 500, 32, 32, 3, 0)
create_trainer(model, args, train_dataset, val_dataset)
X_test_tokenized, y_test = prepare_test_single_input_param(test_df, "question-X", "goldstandard2")
model = BertForSequenceClassification.from_pretrained("output/checkpoint-2000", num_labels=6)
create_test_predict(model, X_test_tokenized, y_test)

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7

Step,Training Loss,Validation Loss,Accuracy
500,1.078,1.007798,0.547411
1000,1.0042,0.995862,0.555069
1500,0.9809,1.003206,0.562728
2000,0.9337,1.011435,0.555799


***** Running Evaluation *****
  Num examples = 2742
  Batch size = 32
Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2742
  Batch size = 32
Saving model checkpoint to output/checkpoint-1000
Configuration saved in output/checkpoint-1000/config.json
Model weights saved in output/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2742
  Batch size = 32
Saving model checkpoint to output/checkpoint-1500
Configuration saved in output/checkpoint-1500/config.json
Model weights saved in output/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2742
  Batch size = 32
Saving model checkpoint to output/checkpoint-2000
Configuration saved in output/checkpoint-2000/config.json
Model weights saved in output/checkpoint-2000/pytorch_model.bin


Training completed. Do not fo

0.5591366486801809


In [None]:
!rm -rf "output"

In [None]:
######### BERT-YN (Question and Answer) ##################
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)
X_train_tokenized, y_train, X_val_tokenized, y_val = preprocess_double_param(train_df, "question-X","answer-Y", "goldstandard2")
train_dataset, val_dataset = create_dataset(X_train_tokenized, y_train, X_val_tokenized, y_val)
args = training_arg(5e-5, "steps", 500, 32, 32, 3, 0)
create_trainer(model, args, train_dataset, val_dataset)
X_test_tokenized, y_test = prepare_test_double_input_param(test_df, "question-X","answer-Y","goldstandard2")
model = BertForSequenceClassification.from_pretrained("output/checkpoint-2000", num_labels=6)
create_test_predict(model, X_test_tokenized, y_test)

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7

Step,Training Loss,Validation Loss,Accuracy
500,0.7006,0.524464,0.829686
1000,0.4612,0.479658,0.852298
1500,0.3627,0.458061,0.864333
2000,0.2378,0.51672,0.86798


***** Running Evaluation *****
  Num examples = 2742
  Batch size = 32
Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2742
  Batch size = 32
Saving model checkpoint to output/checkpoint-1000
Configuration saved in output/checkpoint-1000/config.json
Model weights saved in output/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2742
  Batch size = 32
Saving model checkpoint to output/checkpoint-1500
Configuration saved in output/checkpoint-1500/config.json
Model weights saved in output/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2742
  Batch size = 32
Saving model checkpoint to output/checkpoint-2000
Configuration saved in output/checkpoint-2000/config.json
Model weights saved in output/checkpoint-2000/pytorch_model.bin


Training completed. Do not fo

0.8581012104418843


In [19]:
!rm -rf "output"

In [13]:
######### BERT-MNLI-YN (Question and Answer) ##################

num_labels = 6
model_name = "roberta-base"
model = (AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device))
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
adapter_name = model.load_adapter("AdapterHub/roberta-base-pf-mnli", source="hf")
model.active_adapters = adapter_name
X_train_tokenized, y_train, X_val_tokenized, y_val = preprocess_double_param(train_df, "question-X","answer-Y", "goldstandard2")
train_dataset, val_dataset = create_dataset(X_train_tokenized, y_train, X_val_tokenized, y_val)
args = training_arg(5e-5, "steps", 500, 16, 16, 3, 0)
create_trainer(model, args, train_dataset, val_dataset)
X_test_tokenized, y_test = prepare_test_double_input_param(test_df, "question-X","answer-Y","goldstandard2")
model_test = (AutoModelForSequenceClassification.from_pretrained("output/checkpoint-4500", num_labels=num_labels).to(device))
model_test.active_adapters = adapter_name
create_test_predict(model, X_test_tokenized, y_test)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/575 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/410 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.59M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

Model class 'RobertaModelWithHeads' of found prediction head does not match current model class.
***** Running training *****
  Num examples = 24669
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4626


Step,Training Loss,Validation Loss,Accuracy
500,0.6955,0.566405,0.828957
1000,0.5741,0.564975,0.847192
1500,0.4984,0.494051,0.856309
2000,0.4127,0.478836,0.86725
2500,0.4061,0.492602,0.876003
3000,0.3545,0.44674,0.878556
3500,0.2863,0.52759,0.875638
4000,0.2664,0.480017,0.880744
4500,0.2528,0.466209,0.89205


***** Running Evaluation *****
  Num examples = 2742
  Batch size = 16
Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2742
  Batch size = 16
Saving model checkpoint to output/checkpoint-1000
Configuration saved in output/checkpoint-1000/config.json
Model weights saved in output/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2742
  Batch size = 16
Saving model checkpoint to output/checkpoint-1500
Configuration saved in output/checkpoint-1500/config.json
Model weights saved in output/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2742
  Batch size = 16
Saving model checkpoint to output/checkpoint-2000
Configuration saved in output/checkpoint-2000/config.json
Model weights saved in output/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****


0.8874143211316903


In [None]:
!rm -rf "output"

In [None]:
######### BERT-BOOLQ-YN (Question and Answer) ##################

num_labels = 6
model_name = "roberta-base"
model = (AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device))
tokenizer = AutoTokenizer.from_pretrained(model_name)
adapter_name = model.load_adapter("AdapterHub/roberta-base-pf-boolq", source="hf")
model.active_adapters = adapter_name
X_train_tokenized, y_train, X_val_tokenized, y_val = preprocess_double_param(train_df, "question-X","answer-Y", "goldstandard2")
train_dataset, val_dataset = create_dataset(X_train_tokenized, y_train, X_val_tokenized, y_val)
args = training_arg(5e-5, "steps", 500, 8, 8, 3, 0)
create_trainer(model, args, train_dataset, val_dataset)
X_test_tokenized, y_test = prepare_test_double_input_param(test_df, "question-X","answer-Y","goldstandard2")
model = (AutoModelForSequenceClassification.from_pretrained("output/checkpoint-4500", num_labels=num_labels).to(device))
create_test_predict(model, X_test_tokenized, y_test)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.24k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/582 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.60M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

Model class 'RobertaModelWithHeads' of found prediction head does not match current model class.
***** Running training *****
  Num examples = 24669
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9252


Step,Training Loss,Validation Loss,Accuracy
500,0.844,0.729164,0.768417
1000,0.6742,0.715692,0.792487
1500,0.646,0.71942,0.796864
2000,0.6252,0.70355,0.816193
2500,0.6202,0.643053,0.823851
3000,0.5782,0.608941,0.830416
3500,0.5383,0.607382,0.851933
4000,0.5048,0.665094,0.858497
4500,0.5175,0.653336,0.848286
5000,0.468,0.574031,0.86725


***** Running Evaluation *****
  Num examples = 2742
  Batch size = 8
Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2742
  Batch size = 8
Saving model checkpoint to output/checkpoint-1000
Configuration saved in output/checkpoint-1000/config.json
Model weights saved in output/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2742
  Batch size = 8
Saving model checkpoint to output/checkpoint-1500
Configuration saved in output/checkpoint-1500/config.json
Model weights saved in output/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2742
  Batch size = 8
Saving model checkpoint to output/checkpoint-2000
Configuration saved in output/checkpoint-2000/config.json
Model weights saved in output/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Nu

0.863497156190754
