In [1]:
## ---------------------------------------------------------------------
## set up configs for huggingface hub and OS paths on HPC cluster -- make sure config.ini is correct
## ---------------------------------------------------------------------
import configparser
def auth_token():
    """Return the Hugging Face access token stored in ``config.ini``.

    ``config.ini`` is looked up relative to the current working directory
    and must provide a ``token`` key inside a ``[hugging_face]`` section.

    Returns:
        str: the value of ``token`` from the ``[hugging_face]`` section.

    Raises:
        FileNotFoundError: if ``config.ini`` could not be read.
        KeyError: if the ``hugging_face`` section or ``token`` key is missing.
    """
    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed; an empty list means nothing was loaded.
    # Fail here with a clear message instead of a confusing KeyError below.
    if not config.read("config.ini"):
        raise FileNotFoundError(
            "config.ini could not be read from the current working directory"
        )
    return config["hugging_face"]["token"]

def scratch_path():
    """Return the user's scratch directory path, ``/scratch/<username>``.

    The username is read from the ``username`` key of the ``[user]`` section
    of ``config.ini``, looked up relative to the current working directory.
    The path is only built here; whether it exists is not checked.

    Returns:
        str: ``"/scratch/"`` followed by the configured username.

    Raises:
        FileNotFoundError: if ``config.ini`` could not be read.
        KeyError: if the ``user`` section or ``username`` key is missing.
    """
    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed; an empty list means nothing was loaded.
    # Fail here with a clear message instead of a confusing KeyError below.
    if not config.read("config.ini"):
        raise FileNotFoundError(
            "config.ini could not be read from the current working directory"
        )
    return "/scratch/" + config["user"]["username"]

import os
# Redirect the Hugging Face caches into the user's scratch directory, but only
# when that directory exists; otherwise the environment is left untouched.
# NOTE: this runs before `transformers` is imported further down in this cell.
scratch_dir = scratch_path()  # resolve once instead of re-parsing config.ini on every use
if os.path.isdir(scratch_dir):
    hf_cache_dir = scratch_dir + '/.cache/huggingface'
    os.environ['TRANSFORMERS_CACHE'] = hf_cache_dir
    os.environ['HF_DATASETS_CACHE'] = hf_cache_dir + '/datasets'
# Echo the effective settings; os.getenv returns None for a variable that is unset.
print(os.getenv('TRANSFORMERS_CACHE'))
print(os.getenv('HF_DATASETS_CACHE'))

## ---------------------------------------------------------------------
## Load libraries
## ---------------------------------------------------------------------

import numpy as np
import pandas as pd

import torch
import transformers
from transformers import AutoTokenizer, AutoModel, LlamaForCausalLM, LlamaTokenizer

import torch.nn.functional as F


## ---------------------------------------------------------------------
## Ensure GPU is available -- device should == 'cuda'
## ---------------------------------------------------------------------

# Use the GPU when CUDA reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device = ", device)

/scratch/dmpowell/.cache/huggingface
/scratch/dmpowell/.cache/huggingface/datasets
device =  cuda


Check out this blog post for more tutorials on running Llama-2:

https://huggingface.co/blog/llama2

In [2]:
## ---------------------------------------------------------------------
## load llama-2 and set up a pipeline
## ---------------------------------------------------------------------

MODEL_NAME = "meta-llama/Llama-2-7b-hf"

# Read the Hugging Face token from config.ini once and use it for every download.
hf_token = auth_token()

# The tokenizer is fetched from the same repository as the model weights, so it
# gets the same credentials as the pipeline call below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=hf_token)

# Text-generation pipeline in half precision; device_map="auto" delegates
# device placement of the weights to the library.
pipeline = transformers.pipeline(
    "text-generation",
    model=MODEL_NAME,
    # Reuse the tokenizer loaded above so any token ids read from `tokenizer`
    # elsewhere match the ones the pipeline itself uses.
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
    # NOTE(review): newer transformers releases name this argument `token`;
    # confirm against the installed version before renaming.
    use_auth_token=hf_token,
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [33]:
## ---------------------------------------------------------------------
## interact with Llama-2
## ---------------------------------------------------------------------

# Prompt that the model is asked to continue.
prompt = 'Fresh Prince of Bel Air Theme: Lyrics\n\nThis is a story all about how\n'

# Generation settings: sample from the 5 most likely tokens at each step,
# return a single sequence, and pass the tokenizer's EOS id and a
# max_length of 200 through to generation.
generation_kwargs = {
    "do_sample": True,
    "top_k": 5,
    "num_return_sequences": 1,
    "eos_token_id": tokenizer.eos_token_id,
    "max_length": 200,
}

sequences = pipeline(prompt, **generation_kwargs)

# Print every returned sequence.
for seq in sequences:
    print(f"Result: {seq['generated_text']}")


Result: Fresh Prince of Bel Air Theme: Lyrics

This is a story all about how
My life got flip-turned upside-down
And I'll take you back to when it
Happened, oh, back to when it happened.
Back in the days of old, in the 1980s,

I was livin' in the ghetto,

Hangin' out the neighborhood,

Tryin' to get in the clique,

And I knew I had to get up

Get up, get up, get up

I had to get up

Get up, get up, get up

To get my homework done

I'm the Prince and you're the Princess

And I'm here to protect you

From all of the things that could possibly hurt you

And I'm here to guide you

