In [1]:
!pip install transformers torch evaluate accelerate optuna numpy datasets scikit-learn

[0m

In [2]:
!pip install --upgrade huggingface_hub
from huggingface_hub import notebook_login
notebook_login()

[0m

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, LlamaForCausalLM
import torch
import evaluate
import accelerate
import transformers
import numpy as np
import optuna
import random
from datasets import load_dataset

metric = evaluate.load("accuracy")

In [4]:
# Set device to cuda/mps/cpu
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = 'cpu'
device

'cuda'

In [5]:
def get_dataset(path:str):
    '''Loads dataset from HuggingFace, returns a single shard
    '''
    dataset = load_dataset(path)

    return dataset

    

def tokenize(dataset, tokenizer:AutoTokenizer, example_index:str):
    '''tokenize, pads, and truncates dataset object'''
    dataset = dataset.map(lambda examples: tokenizer(examples[example_index],
                                                   return_tensors="pt",
                                                   padding=True, truncation=True),
                        batched=True).with_format("torch")
    return dataset

def split_dataset(dataset, train_size:float, test_size:float, eval_size:float):
    train_set, test_set, eval_set = torch.utils.data.random_split(dataset, [train_size, test_size, eval_size])
    return train_set, test_set, eval_set

def model_init():
    model = "meta-llama/Llama-2-7b-chat-hf"
    tokenizer = AutoTokenizer.from_pretrained(model)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

def new_column(example):
    example["labels"] = example["input_ids"]
    return example

In [6]:
model_name = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = "[PAD]"
tokenizer.padding_side = "left"


dataset = get_dataset('dliu1/legal-llama-raw-text')
print(dataset)

dataset = dataset['train'].train_test_split(test_size=0.2, shuffle=True)

#dataset = dataset.rename_column("text", "label")
print(dataset)

train_tokenized_dataset = tokenize(dataset['train'], tokenizer=tokenizer, example_index='text')
#train_tokenized_dataset = train_tokenized_dataset.add_column("labels", train_tokenized_dataset['input_ids'].copy())
train_tokenized_dataset = train_tokenized_dataset.map(new_column)

test_tokenized_dataset = tokenize(dataset['test'], tokenizer=tokenizer, example_index='text')
#test_tokenized_dataset = test_tokenized_dataset.add_column("labels", test_tokenized_dataset['input_ids'].copy())
test_tokenized_dataset = test_tokenized_dataset.map(new_column)

#new_column = train_tokenized_dataset['input_ids'].copy()
#train_tokenized_dataset = train_tokenized_dataset.add_column("labels", new_column)

#new_column = test_tokenized_dataset['input_ids'].copy()
#test_tokenized_dataset = test_tokenized_dataset.add_column("labels", new_column)

print(train_tokenized_dataset)
print(test_tokenized_dataset)

print(train_tokenized_dataset[2]) #prints tokenized tensor of one entry

print('tokenized text')

model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
model.resize_token_embeddings(len(tokenizer))

args = TrainingArguments(
        f"{model_name}-RE_Llama",
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        num_train_epochs=4,
        weight_decay=0.01
)

trainer = Trainer(
    model = model,
    args=args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 33568
    })
})
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 26854
    })
    test: Dataset({
        features: ['text'],
        num_rows: 6714
    })
})


Map:   0%|          | 0/26854 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/26854 [00:00<?, ? examples/s]

Map:   0%|          | 0/6714 [00:00<?, ? examples/s]

Map:   0%|          | 0/6714 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 26854
})
Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 6714
})
{'text': '  business of claiming, demanding,  charging,  receiving,  collecting,  or', 'input_ids': tensor([    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
   

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 350.00 MiB. GPU 0 has a total capacty of 47.54 GiB of which 196.00 MiB is free. Process 3737520 has 47.34 GiB memory in use. Of the allocated memory 45.64 GiB is allocated by PyTorch, and 549.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF