In [7]:
! pip install datasets

Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hCollecting responses<0.19
  Using cached responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m1m106.8 MB/s[0m eta [36m0:00:01[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K   

In [68]:
! pip install transformers datasets accelerate nvidia-ml-py3

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting nvidia-ml-py3
  Downloading nvidia-ml-py3-7.352.0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone


Building wheels for collected packages: nvidia-ml-py3
  Building wheel for nvidia-ml-py3 (setup.py) ... [?25ldone
[?25h  Created wheel for nvidia-ml-py3: filename=nvidia_ml_py3-7.352.0-py3-none-any.whl size=19172 sha256=2dada24c4effb8d7a0b6a50b511f62a15a9be4712230a98d7ff2881b1024b0d6
  Stored in directory: /home/dvianna/.cache/pip/wheels/f6/d8/b0/15cfd7805d39250ac29318105f09b1750683387630d68423e1
Successfully built nvidia-ml-py3
Installing collected packages: nvidia-ml-py3
Successfully installed nvidia-ml-py3-7.352.0


In [1]:
import pandas as pd
import torch
import json
from transformers import BloomTokenizerFast, BloomForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
from accelerate import Accelerator

In [2]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [3]:
print_gpu_utilization()

GPU memory occupied: 897 MB.


In [4]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
# Loading bloomz model and tokenizer 
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloomz-560m")
model = BloomForCausalLM.from_pretrained("bigscience/bloomz-560m").to("cuda")

In [6]:
print_gpu_utilization()

GPU memory occupied: 3684 MB.


In [7]:
dataset = load_dataset("json", data_files="prompts.json")

Found cached dataset json (/home/dvianna/.cache/huggingface/datasets/json/default-5329563cfdcc119a/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 395
    })
})

In [9]:
# prepare the data for training
def prepare_train_data(data):
    # prompt + completion
    #text_input = data['prompt'] + ' ' + data['completion']
    text_input = data['text']
    # tokenize the input (prompt + completion) text
    tokenized_input = tokenizer(text_input, return_tensors='pt', padding=True)
    # generative models: labels are the same as the input
    tokenized_input['labels'] = tokenized_input['input_ids']
    return tokenized_input

In [10]:
train_dataset = dataset['train'].map(prepare_train_data, batched=True, remove_columns=["text"])

Loading cached processed dataset at /home/dvianna/.cache/huggingface/datasets/json/default-5329563cfdcc119a/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-3a16b40a0956a8e0.arrow


In [12]:
training_arguments = TrainingArguments(
    'LegalQA-bloom-560m',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=4,
    weight_decay=0.01,
    fp16=True,
    optim="adafactor",
    gradient_accumulation_steps=4,
    gradient_checkpointing=True
)

In [13]:
trainer = Trainer(
    model = model,
    args = training_arguments,
    train_dataset = train_dataset
)

In [14]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss


TrainOutput(global_step=98, training_loss=6.630724225725446, metrics={'train_runtime': 142.2434, 'train_samples_per_second': 2.777, 'train_steps_per_second': 0.689, 'total_flos': 1005412378607616.0, 'train_loss': 6.630724225725446, 'epoch': 0.99})

In [15]:
trainer.save_model()