In [1]:
!pip install transformers torch evaluate accelerate optuna numpy datasets scikit-learn

[0m

In [2]:
import sys
sys.version

'3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]'

In [3]:
!pip install --upgrade huggingface_hub
from huggingface_hub import notebook_login
notebook_login()

[0m

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, LlamaForCausalLM
import torch
import evaluate
import accelerate
import transformers
import numpy as np
import optuna
import random
from datasets import load_dataset

metric = evaluate.load("accuracy")

In [5]:
# Set device to cuda/mps/cpu
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = 'cpu'
device

'cuda'

In [6]:
def get_dataset(path:str):
    '''Loads dataset from HuggingFace, returns a single shard
    '''
    dataset = load_dataset(path)

    return dataset

    

def tokenize(dataset, tokenizer:AutoTokenizer, example_index:str):
    '''tokenize, pads, and truncates dataset object'''
    dataset = dataset.map(lambda examples: tokenizer(examples[example_index],
                                                   return_tensors="pt",
                                                   padding=True, truncation=True),
                        batched=True).with_format("torch")
    return dataset

def split_dataset(dataset, train_size:float, test_size:float, eval_size:float):
    train_set, test_set, eval_set = torch.utils.data.random_split(dataset, [train_size, test_size, eval_size])
    return train_set, test_set, eval_set

def model_init():
    model = "meta-llama/Llama-2-7b-chat-hf"
    tokenizer = AutoTokenizer.from_pretrained(model)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [14]:
model_name = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = "[PAD]"
tokenizer.padding_side = "left"


dataset = get_dataset('dliu1/legal-llama-raw-text')
print(dataset)

dataset = dataset['train'].train_test_split(test_size=0.2, shuffle=True)
#dataset = dataset.rename_column("text", "label")
print(dataset)

train_tokenized_dataset = tokenize(dataset['train'], tokenizer=tokenizer, example_index='label')
test_tokenized_dataset = tokenize(dataset['test'], tokenizer=tokenizer, example_index='label')

print(train_tokenized_dataset)
print(test_tokenized_dataset)

print(train_tokenized_dataset[2]) #prints tokenized tensor of one entry



#print(len(train_tokenized_dataset))
#print(len(test_tokenized_dataset))
print('tokenized text')

model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
model.resize_token_embeddings(len(tokenizer))

args = TrainingArguments(
        f"{model_name}-RE_Llama",
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        num_train_epochs=4,
        weight_decay=0.01
)

trainer = Trainer(
    model = model,
    args=args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 33568
    })
})
DatasetDict({
    train: Dataset({
        features: ['label'],
        num_rows: 26854
    })
    test: Dataset({
        features: ['label'],
        num_rows: 6714
    })
})


Map:   0%|          | 0/26854 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/6714 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 26854
})
Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 6714
})
{'label': '    11.  Nothing  contained  in this section, section six-h of the banking', 'input_ids': tensor([   0,    0,    0,  ...,  278, 9124,  292]), 'attention_mask': tensor([0, 0, 0,  ..., 1, 1, 1])}
tokenized text


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`label` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [9]:
dataset = get_dataset('dliu1/legal-llama-raw-text')
print(dataset)

dataset = dataset['train'].train_test_split(test_size=0.2, shuffle=True)
dataset = dataset.rename_column("text", "label")
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 33568
    })
})


DatasetDict({
    train: Dataset({
        features: ['label'],
        num_rows: 26854
    })
    test: Dataset({
        features: ['label'],
        num_rows: 6714
    })
})

In [7]:
model_name = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)
#tokenizer.add_special_tokens({'pad_token': 'eos_token'})
tokenizer.pad_token = "[PAD]"
tokenizer.padding_side = "left"
model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
model.resize_token_embeddings(len(tokenizer))


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(32000, 4096)

In [8]:
dataset = get_dataset('dliu1/legal-llama-raw-text')
print(dataset)

dataset = dataset['train'].train_test_split(test_size=0.2, shuffle=True)

train_tokenized_dataset = tokenize(dataset['train'], tokenizer=tokenizer, example_index='text')
test_tokenized_dataset = tokenize(dataset['test'], tokenizer=tokenizer, example_index='text')

train_tokenized_dataset
test_tokenized_dataset

train_tokenized_dataset[2] #prints tokenized tensor of one entry


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 33568
    })
})


Map:   0%|          | 0/26854 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/6714 [00:00<?, ? examples/s]

{'text': '    (a)  where  real  property is subdivided into not more than four lots,',
 'input_ids': tensor([    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,  

In [9]:
data = {'input_ids': torch.tensor(train_tokenized_dataset[2]['input_ids']).unsqueeze(0).to(device), 'attention_mask': torch.tensor(train_tokenized_dataset[2]['attention_mask']).unsqueeze(0).to(device)} 

  data = {'input_ids': torch.tensor(train_tokenized_dataset[2]['input_ids']).unsqueeze(0).to(device), 'attention_mask': torch.tensor(train_tokenized_dataset[2]['attention_mask']).unsqueeze(0).to(device)}


In [10]:
model(**data)


SequenceClassifierOutputWithPast(loss=None, logits=tensor([[1.6748, 4.6992]], device='cuda:0', dtype=torch.float16,
       grad_fn=<IndexBackward0>), past_key_values=((tensor([[[[-1.4102e-04,  2.4486e-04, -2.0623e-04,  ..., -6.6102e-05,
            4.2140e-05, -4.9889e-05],
          [-3.7527e-04,  2.4295e-04, -2.7418e-06,  ..., -6.6102e-05,
            4.2140e-05, -4.9889e-05],
          [-2.6464e-04,  6.9857e-05,  2.0242e-04,  ..., -6.6102e-05,
            4.2140e-05, -4.9949e-05],
          ...,
          [-2.5146e-01, -2.4084e-01,  1.1182e-01,  ...,  4.0088e-01,
           -1.1469e-01,  5.0928e-01],
          [ 4.0710e-02,  2.1875e-01, -3.7354e-01,  ...,  1.6064e-01,
            1.9202e-01,  2.5537e-01],
          [ 1.1670e+00, -3.1689e-01, -1.0107e+00,  ..., -3.4180e-01,
            8.3313e-02, -3.1177e-01]],

         [[-3.4785e-04, -2.2709e-04,  1.8418e-04,  ..., -1.6153e-05,
           -2.4652e-04,  2.7776e-05],
          [-1.3518e-04, -1.5116e-04,  2.7180e-05,  ..., -1.6153e-0