In [1]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device}...')

Using cuda...


In [2]:
from datasets import load_dataset

fever_plus = load_dataset("tommasobonomo/sem_augmented_fever_nli")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import pandas as pd
pd.DataFrame(fever_plus["train"][0])

Unnamed: 0,id,premise,hypothesis,label,wsd,srl
premise,150448,Roman Atwood . He is best known for his vlogs ...,Roman Atwood is a content creator.,ENTAILMENT,"[{'index': 0, 'text': 'Roman', 'pos': 'ADJ', '...","{'tokens': [{'index': 0, 'rawText': 'Roman'}, ..."
hypothesis,150448,Roman Atwood . He is best known for his vlogs ...,Roman Atwood is a content creator.,ENTAILMENT,"[{'index': 0, 'text': 'Roman', 'pos': 'PROPN',...","{'tokens': [{'index': 0, 'rawText': 'Roman'}, ..."


In [20]:
# free up CUDA space
torch.cuda.empty_cache()

In [8]:
" ".join([wsd['pos'] for wsd in fever_plus['train'][0]["wsd"]["premise"]])             # create a sentence with the list of all the POS tags sepaated by space

'ADJ PROPN PUNCT PRON AUX ADV VERB ADP PRON NOUN PUNCT SCONJ PRON VERB NOUN ADP PRON NOUN ADP DET ADJ NOUN PUNCT PRON VERB NOUN PUNCT PUNCT PUNCT X PUNCT PUNCT AUX DET NOUN ADP NUM NUM NOUN CCONJ NUM NUM NOUN PUNCT PRON ADV AUX DET PROPN NOUN VERB PUNCT PUNCT PROPN PUNCT PUNCT SCONJ PRON VERB NOUN PUNCT'

In [19]:
# combine the elements from a fever entry to prepare a feasible input for 
def preprocess_input(entry, tokenizer):

    # Sample premise and hypothesis with POS tagging
    premise = entry["premise"]
    hypothesis = entry["hypothesis"]

    # Corresponding POS tags (this is just a placeholder for the actual POS tags)
    premise_pos = " ".join([wsd['pos'] for wsd in entry["wsd"]["premise"]])             # create a sentence with the list of all the POS tags sepaated by space
    hypothesis_pos = " ".join([wsd['pos'] for wsd in entry["wsd"]["hypothesis"]])       # 

    # Combine text with POS tags
    premise_with_pos = f"{premise} POS: {premise_pos}"
    hypothesis_with_pos = f"{hypothesis} POS: {hypothesis_pos}"

    # Prepare the combined input for the model
    combined_input = f"[CLS] {premise_with_pos} [SEP] {hypothesis_with_pos}"

    return tokenizer(combined_input, truncation=True, padding='max_length', max_length=512)


In [20]:
import torch
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric

# Load the DeBERTa tokenizer and model
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-base')
model = DebertaV2ForSequenceClassification.from_pretrained('microsoft/deberta-v3-base', num_labels=3)

# Apply preprocessing to datasets
train_dataset = fever_plus['train'].map(preprocess_input, fn_kwargs={'tokenizer': tokenizer})
val_dataset = fever_plus['validation'].map(preprocess_input, fn_kwargs={'tokenizer': tokenizer})
test_dataset = fever_plus['test'].map(preprocess_input, fn_kwargs={'tokenizer': tokenizer})

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 51086/51086 [02:50<00:00, 299.16 examples/s]
Map: 100%|██████████| 2288/2288 [00:07<00:00, 306.58 examples/s]
Map: 100%|██████████| 2287/2287 [00:07<00:00, 309.78 examples/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 376.00 MiB (GPU 0; 2.00 GiB total capacity; 1.40 GiB already allocated; 0 bytes free; 1.51 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
'''
import torch
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric

# Load the DeBERTa tokenizer and model
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-base')
model = DebertaV2ForSequenceClassification.from_pretrained('microsoft/deberta-v3-base', num_labels=3)

# Dummy data for demonstration
data = {
    'premise': ['Premise sentence 1', 'Premise sentence 2'],
    'hypothesis': ['Hypothesis sentence 1', 'Hypothesis sentence 2'],
    'label': [0, 1]
}

# Convert data to Dataset objects
train_dataset = Dataset.from_dict(data)
val_dataset = Dataset.from_dict(data)
test_dataset = Dataset.from_dict(data)
  
# Tokenize the dataset  
def preprocess_function(examples):  
    return tokenizer(examples['premise'], examples['hypothesis'], truncation=True, padding='max_length', max_length=128)

# Apply preprocessing to datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()
'''