Added Gradio Web Interface for LLaMA #126

Open · wants to merge 1 commit into base: main
16 changes: 16 additions & 0 deletions virtualenv.sh
@@ -0,0 +1,16 @@
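# Recreate the llama_env virtual environment from scratch and install the
# LLaMA package plus Gradio into it.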
rm -rf llama_env
python3 -m venv llama_env
source llama_env/bin/activate

python -m pip install --upgrade pip

python -m pip install wheel
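# build a distributable wheel for the local llama package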
python setup.py bdist_wheel

pip install -r requirements.txt
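# editable install of the llama package itself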
pip install -e .

python -m pip install gradio

# next, run webapp.sh to launch the Gradio web app

97 changes: 97 additions & 0 deletions webapp.py
@@ -0,0 +1,97 @@
import os
import sys
import torch
import time
import json

import gradio as gr

from typing import Tuple
from pathlib import Path
from fairscale.nn.model_parallel.initialize import initialize_model_parallel
from llama import ModelArgs, Transformer, Tokenizer, LLaMA


# Default checkpoint / tokenizer locations and sampling settings; adjust
# ckpt_dir and tokenizer_path to point at the downloaded LLaMA weights.
ckpt_dir = "models/7B"
tokenizer_path = "models/tokenizer.model"
temperature = 0.8
top_p = 0.95
max_seq_len = 512
max_batch_size = 32


def setup_model_parallel() -> Tuple[int, int]:
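    # LOCAL_RANK and WORLD_SIZE are set by torchrun for every worker process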
    local_rank = int(os.environ.get("LOCAL_RANK", -1))
    world_size = int(os.environ.get("WORLD_SIZE", -1))

    torch.distributed.init_process_group("nccl")
    initialize_model_parallel(world_size)
    torch.cuda.set_device(local_rank)

    # seed must be the same in all processes
    torch.manual_seed(1)
    return local_rank, world_size


def load(
    ckpt_dir: str,
    tokenizer_path: str,
    local_rank: int,
    world_size: int,
    max_seq_len: int,
    max_batch_size: int,
) -> LLaMA:
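    # Each torchrun worker loads the checkpoint shard matching its rank, so the
    # number of *.pth files in ckpt_dir must equal the world size.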
    start_time = time.time()
    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
    assert world_size == len(
        checkpoints
    ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}"
    ckpt_path = checkpoints[local_rank]
    print("Loading")
    checkpoint = torch.load(ckpt_path, map_location="cpu")
    with open(Path(ckpt_dir) / "params.json", "r") as f:
        params = json.loads(f.read())

    model_args: ModelArgs = ModelArgs(
        max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
    )
    tokenizer = Tokenizer(model_path=tokenizer_path)
    model_args.vocab_size = tokenizer.n_words
    torch.set_default_tensor_type(torch.cuda.HalfTensor)
    model = Transformer(model_args)
    torch.set_default_tensor_type(torch.FloatTensor)
    model.load_state_dict(checkpoint, strict=False)

    generator = LLaMA(model, tokenizer)
    print(f"Loaded in {time.time() - start_time:.2f} seconds")
    return generator


local_rank, world_size = setup_model_parallel()
if local_rank > 0:
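    # silence stdout on every rank except rank 0 so output is not duplicated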
    sys.stdout = open(os.devnull, "w")

# build the generator once at start-up; this runs in every torchrun worker
generator = load(
    ckpt_dir, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size
)


def process(prompt: str):
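    # Gradio callback: wrap the single prompt in a batch of one and return the
    # generated continuation as plain text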
    print("Received:\n", prompt)
    prompts = [prompt]
    results = generator.generate(
        prompts, max_gen_len=256, temperature=temperature, top_p=top_p
    )
    print("Generated:\n", results[0])
    return str(results[0])


demo = gr.Interface(
    fn=process,
    inputs=gr.Textbox(lines=10, placeholder="Your prompt here..."),
    outputs="text",
)

# share=True creates a temporary public Gradio link; remove it to serve locally only
demo.launch(share=True)
9 changes: 9 additions & 0 deletions webapp.sh
@@ -0,0 +1,9 @@
#
# first build the virtualenv using the virtualenv.sh script
#
# gradio webapp.py
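# MP must equal the number of checkpoint shards (*.pth files) in models/7B,
# e.g. MP=1 for the 7B model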
torchrun --nproc_per_node $MP webapp.py
#
# or use CUDA_VISIBLE_DEVICES if you want to target a specific GPU
# CUDA_VISIBLE_DEVICES=1 torchrun --nproc_per_node $MP webapp.py
#