In [1]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device}...')

Using cuda...


In [2]:
from datasets import load_dataset

fever_plus = load_dataset("tommasobonomo/sem_augmented_fever_nli")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pandas as pd
pd.DataFrame(fever_plus["train"][0])

Unnamed: 0,id,premise,hypothesis,label,wsd,srl
premise,150448,Roman Atwood . He is best known for his vlogs ...,Roman Atwood is a content creator.,ENTAILMENT,"[{'index': 0, 'text': 'Roman', 'pos': 'ADJ', '...","{'tokens': [{'index': 0, 'rawText': 'Roman'}, ..."
hypothesis,150448,Roman Atwood . He is best known for his vlogs ...,Roman Atwood is a content creator.,ENTAILMENT,"[{'index': 0, 'text': 'Roman', 'pos': 'PROPN',...","{'tokens': [{'index': 0, 'rawText': 'Roman'}, ..."


In [5]:
" ".join([wsd['pos'] for wsd in fever_plus['train'][0]["wsd"]["premise"]])             # create a sentence with the list of all the POS tags sepaated by space

'ADJ PROPN PUNCT PRON AUX ADV VERB ADP PRON NOUN PUNCT SCONJ PRON VERB NOUN ADP PRON NOUN ADP DET ADJ NOUN PUNCT PRON VERB NOUN PUNCT PUNCT PUNCT X PUNCT PUNCT AUX DET NOUN ADP NUM NUM NOUN CCONJ NUM NUM NOUN PUNCT PRON ADV AUX DET PROPN NOUN VERB PUNCT PUNCT PROPN PUNCT PUNCT SCONJ PRON VERB NOUN PUNCT'

In [18]:
# free up CUDA space and optimize space used
torch.cuda.empty_cache()
torch.cuda.set_per_process_memory_fraction(0.95, 0)  # Use 95% of GPU memory
#torch.cuda.set_max_split_size_mb(256)  # Set max split size to 256MB


In [19]:
# combine the elements from a fever entry to prepare a feasible input for 
def preprocess_input(entry, tokenizer):

    # Sample premise and hypothesis with POS tagging
    premise = entry["premise"]
    hypothesis = entry["hypothesis"]

    # Corresponding POS tags (this is just a placeholder for the actual POS tags)
    premise_pos = " ".join([wsd['pos'] for wsd in entry["wsd"]["premise"]])             # create a sentence with the list of all the POS tags sepaated by space
    hypothesis_pos = " ".join([wsd['pos'] for wsd in entry["wsd"]["hypothesis"]])       # 

    # Combine text with POS tags
    premise_with_pos = f"{premise} POS: {premise_pos}"
    hypothesis_with_pos = f"{hypothesis} POS: {hypothesis_pos}"

    # Prepare the combined input for the model
    combined_input = f"[CLS] {premise_with_pos} [SEP] {hypothesis_with_pos}"
    encoding = tokenizer(combined_input, truncation=True, padding='max_length', max_length=512)

    # Map labels as integers representing the classes
    label_map = {'ENTAILMENT': 0, 'CONTRADICTION': 1, 'NEUTRAL': 2}
    int_label = label_map[entry['label']]


    return {**encoding, 'label': int_label}


In [24]:
import torch
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric

# Load the DeBERTa tokenizer and model
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-small')
model = DebertaV2ForSequenceClassification.from_pretrained('microsoft/deberta-v3-small', num_labels=3)

# Apply preprocessing to datasets
train_dataset = fever_plus['train'].map(preprocess_input, fn_kwargs={'tokenizer': tokenizer})
val_dataset = fever_plus['validation'].map(preprocess_input, fn_kwargs={'tokenizer': tokenizer})
test_dataset = fever_plus['test'].map(preprocess_input, fn_kwargs={'tokenizer': tokenizer})

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    no_cuda=True  # Use CPU instead of GPU
)


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()


  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1000/1000 [00:03<00:00, 311.20 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 134.47 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 257.40 examples/s]
 17%|█▋        | 500/3000 [11:02<56:06,  1.35s/it]  

{'loss': 1.7255, 'grad_norm': 33.27394104003906, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.5}


 33%|███▎      | 1000/3000 [21:44<40:00,  1.20s/it] 

{'loss': 1.7844, 'grad_norm': 32.05265808105469, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}



 33%|███▎      | 1000/3000 [22:03<40:00,  1.20s/it]

{'eval_loss': 3.4716453552246094, 'eval_runtime': 0.992, 'eval_samples_per_second': 10.081, 'eval_steps_per_second': 10.081, 'epoch': 1.0}


 50%|█████     | 1500/3000 [31:52<29:10,  1.17s/it]  

{'loss': 1.8366, 'grad_norm': 0.47551363706588745, 'learning_rate': 1e-05, 'epoch': 1.5}


 67%|██████▋   | 2000/3000 [41:57<19:19,  1.16s/it]  

{'loss': 1.7805, 'grad_norm': 29.367258071899414, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}



 67%|██████▋   | 2000/3000 [42:14<19:19,  1.16s/it]

{'eval_loss': 3.702300548553467, 'eval_runtime': 0.8616, 'eval_samples_per_second': 11.606, 'eval_steps_per_second': 11.606, 'epoch': 2.0}


 83%|████████▎ | 2500/3000 [52:15<10:55,  1.31s/it]  

{'loss': 1.8569, 'grad_norm': 0.6401755213737488, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.5}


100%|██████████| 3000/3000 [1:03:26<00:00,  1.31s/it]

{'loss': 1.6937, 'grad_norm': 0.7766714692115784, 'learning_rate': 0.0, 'epoch': 3.0}


RuntimeError: [enforce fail at C:\cb\pytorch_1000000000000\work\caffe2\serialize\inline_container.cc:337] . unexpected pos 24448 vs 24340

In [25]:
# Perform a quick test on the test dataset
test_results = trainer.evaluate(test_dataset)

# Print the evaluation results
print("Test Results:", test_results)

# Example of getting predictions on a few test samples
sample_outputs = trainer.predict(test_dataset)
predictions = torch.argmax(torch.tensor(sample_outputs.predictions), dim=1)

# Print predictions and true labels for inspection
print("Predictions:", predictions)
print("True Labels:", test_dataset['label'])



100%|██████████| 3000/3000 [1:07:17<00:00,  1.31s/it]

{'eval_loss': 3.2271573543548584, 'eval_runtime': 10.5041, 'eval_samples_per_second': 9.52, 'eval_steps_per_second': 9.52, 'epoch': 3.0}
Test Results: {'eval_loss': 3.2271573543548584, 'eval_runtime': 10.5041, 'eval_samples_per_second': 9.52, 'eval_steps_per_second': 9.52, 'epoch': 3.0}




Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0])
True Labels: tensor([1, 2, 2, 0, 1, 0, 1, 2, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 2, 0, 2, 0, 1, 2,
        2, 2, 0, 1, 1, 1, 1, 2, 0, 1, 1, 0, 2, 0, 2, 2, 2, 1, 1, 0, 1, 0, 0, 0,
        0, 2, 1, 0, 2, 1, 0, 2, 0, 2, 1, 2, 0, 2, 2, 2, 0, 2, 0, 2, 1, 2, 2, 2,
        1, 1, 2, 1, 0, 0, 1, 2, 2, 2, 1, 0, 0, 2, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
        0, 1, 0, 1])


In [None]:
'''
import torch
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric

# Load the DeBERTa tokenizer and model
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-base')
model = DebertaV2ForSequenceClassification.from_pretrained('microsoft/deberta-v3-base', num_labels=3)

# Dummy data for demonstration
data = {
    'premise': ['Premise sentence 1', 'Premise sentence 2'],
    'hypothesis': ['Hypothesis sentence 1', 'Hypothesis sentence 2'],
    'label': [0, 1]
}

# Convert data to Dataset objects
train_dataset = Dataset.from_dict(data)
val_dataset = Dataset.from_dict(data)
test_dataset = Dataset.from_dict(data)
  
# Tokenize the dataset  
def preprocess_function(examples):  
    return tokenizer(examples['premise'], examples['hypothesis'], truncation=True, padding='max_length', max_length=128)

# Apply preprocessing to datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()
'''

"\nimport torch\nfrom transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, Trainer, TrainingArguments\nfrom datasets import Dataset, load_metric\n\n# Load the DeBERTa tokenizer and model\ntokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-base')\nmodel = DebertaV2ForSequenceClassification.from_pretrained('microsoft/deberta-v3-base', num_labels=3)\n\n# Dummy data for demonstration\ndata = {\n    'premise': ['Premise sentence 1', 'Premise sentence 2'],\n    'hypothesis': ['Hypothesis sentence 1', 'Hypothesis sentence 2'],\n    'label': [0, 1]\n}\n\n# Convert data to Dataset objects\ntrain_dataset = Dataset.from_dict(data)\nval_dataset = Dataset.from_dict(data)\ntest_dataset = Dataset.from_dict(data)\n  \n# Tokenize the dataset  \ndef preprocess_function(examples):  \n    return tokenizer(examples['premise'], examples['hypothesis'], truncation=True, padding='max_length', max_length=128)\n\n# Apply preprocessing to datasets\ntrain_dataset = train_da