In [None]:
# Install the notebook's dependencies. CMAKE_ARGS="-DLLAMA_CUBLAS=on" is set in the
# environment of the pip run (presumably so llama-cpp-python is compiled with
# cuBLAS/GPU support -- confirm against the llama-cpp-python build docs).
# NOTE(review): only llama-cpp-python and llama-index are version-pinned here;
# nest_asyncio is imported later in this notebook but is not installed by this line.
! CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.24 llama-index==0.9.19 pyngrok flask flask_cors gevent

In [None]:
# Download location of the model weights (a .gguf file hosted on Hugging Face).
# Passed to LlamaCPP below as `model_url`, which fetches it on first use.
model_url = "https://huggingface.co/openthaigpt/openthaigpt-1.0.0-beta-13b-chat-gguf/resolve/main/ggml-model-q4_0.gguf"

In [None]:
from llama_index import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    ServiceContext,
)
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

In [None]:
# Build the llama.cpp-backed LLM object that the serving code below streams from.
llm = LlamaCPP(
    # --- Model source -------------------------------------------------------
    # No pre-downloaded file is supplied, so the weights are fetched from the URL.
    model_path=None,
    model_url=model_url,

    # --- Generation settings ------------------------------------------------
    temperature=0.1,
    max_new_tokens=256,
    # Kept below llama2's 4096-token window to leave some headroom.
    context_window=3900,

    # --- Pass-through keyword arguments -------------------------------------
    # Extra arguments for __call__() (none at the moment).
    generate_kwargs={},
    # Extra arguments for __init__(); n_gpu_layers must be >= 1 to use the GPU.
    model_kwargs={"n_gpu_layers": 43},

    # --- Prompt formatting --------------------------------------------------
    # Helpers that convert inputs into the Llama2 prompt format.
    completion_to_prompt=completion_to_prompt,
    messages_to_prompt=messages_to_prompt,

    verbose=True,
)

Downloading url https://huggingface.co/openthaigpt/openthaigpt-1.0.0-beta-13b-chat-gguf/resolve/main/ggml-model-q4_0.gguf to path /tmp/llama_index/models/ggml-model-q4_0.gguf
total size (MB): 7430.46


7087it [00:25, 278.48it/s]                          
AVX = 1 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 


In [None]:
def generate_stream_sse(instruction,
                    input=None,
                    temperature=0.3,
                    top_p=0.75,
                    top_k=40,
                    beams=1,
                    max_tokens=256,
                    frequency_penalty=0.3,
                    is_streaming=False):
  """Stream a completion from the module-level ``llm`` as Server-Sent Events.

  Args:
    instruction: Text placed under the ``### Instruction:`` heading of the prompt.
    input: Optional extra text; when not None it is added under ``### Input:``.
    temperature, top_p, top_k: Sampling options forwarded to ``llm.stream_complete``.
      When ``beams > 1`` they are replaced by 0, 1 and -1 respectively.
    beams: Forwarded as ``n``; ``use_beam_search`` is set when it is greater than 1.
    max_tokens: Forwarded as ``max_tokens``.
    frequency_penalty: Forwarded as ``frequency_penalty``.
    is_streaming: Forwarded as ``stream``.

  Returns:
    A generator of SSE-framed strings, one event per streamed chunk. Nothing is
    sent to the model until the generator is first iterated.

  NOTE(review): every option is passed to ``llm.stream_complete`` as a keyword
  argument; whether that method honours each of them (e.g. ``use_beam_search``,
  ``n``) is not visible here -- confirm against the LLM class in use.
  """

  def build_prompt():
    # Joined line by line so the exact prompt text is explicit. The two leading
    # spaces on every line after the first reproduce the original template
    # byte for byte.
    # NOTE(review): that indentation looks accidental -- confirm before removing.
    lines = ["### Instruction:", f"  {instruction}", ""]
    if input is not None:
      lines += ["  ### Input:", f"  {input}", ""]
    lines += ["  ### Response:", "  "]
    return "\n".join(lines)

  def to_sse_event(text):
    # In the SSE wire format a blank line terminates an event, and each line of
    # a multi-line payload needs its own "data:" field. Emitting the chunk
    # verbatim would drop or truncate any chunk that contains a newline, so the
    # chunk is split and every line gets its own field; clients re-join the
    # fields with "\n". Chunks without a newline are framed exactly as before.
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")
    fields = [f"data: {line}" for line in normalized.split("\n")]
    return "\n".join(fields) + "\n\n"

  def generate_chat():
    prompt = build_prompt()

    use_beams = beams > 1
    payload = {
        "stream": is_streaming,
        "temperature": 0 if use_beams else temperature,
        "top_p": 1 if use_beams else top_p,
        "top_k": -1 if use_beams else top_k,
        "use_beam_search": use_beams,
        "n": beams,
        "stop": '<|endoftext|>',
        "frequency_penalty": frequency_penalty,
        "max_tokens": max_tokens,
        }

    for response in llm.stream_complete(prompt, **payload):
      yield to_sse_event(response.raw['choices'][0]['text'])

  return generate_chat()

In [None]:
import getpass
from pyngrok import ngrok, conf
import nest_asyncio

# Ask for the ngrok auth token without echoing it, then store it on pyngrok's
# default configuration object.
print("Enter your token: ", end="")
auth_token = getpass.getpass()
conf.get_default().auth_token = auth_token

Enter your token: ··········


In [None]:
from flask import Flask, Response, request, abort, jsonify
import time

from flask_cors import CORS
from gevent.pywsgi import WSGIServer
import warnings
import json
import copy

# Ignore warnings whose message matches the given text.
# NOTE(review): "specific warning message" reads like a placeholder -- confirm
# which warning this filter is meant to silence.
warnings.filterwarnings(action="ignore", message="specific warning message")

# Create the Flask application and hand it to flask_cors.
app = Flask(__name__)
CORS(app)

# Sets the JSON_AS_ASCII config key to False (presumably so non-ASCII text such
# as Thai is not escaped in JSON output -- verify it has an effect on the
# installed Flask version).
app.config.update(JSON_AS_ASCII=False)

@app.route("/completions", methods=["POST"])
def completions():
  """Handle POST /completions by streaming a model completion as SSE.

  Expects a JSON object body with a required ``instruction`` key and an
  optional ``input`` key.

  Responses:
    400 if the JSON body is not an object.
    404 if ``instruction`` is missing.
    Otherwise a ``text/event-stream`` response fed by ``generate_stream_sse``.
  """
  datas = request.json

  # A body such as `null`, a list or a bare string would otherwise reach the
  # key lookups below and raise TypeError (an unhandled 500).
  if not isinstance(datas, dict):
    abort(400)

  # NOTE(review): 400 would be the conventional status for a missing field;
  # 404 is kept so existing clients see the same response as before.
  if 'instruction' not in datas:
    abort(404)

  instruction = datas['instruction']
  # Named to avoid shadowing the builtin `input`; None when the key is absent.
  extra_input = datas.get('input')

  resp = Response(
        generate_stream_sse(instruction, extra_input),
        mimetype='text/event-stream'
    )
  # Headers asking intermediaries not to buffer or cache the stream.
  resp.headers['X-Accel-Buffering'] = 'no'
  resp.headers['Cache-Control'] = 'no-cache'
  return resp

if __name__ == "__main__":
  # Single source of truth for the port shared by the tunnel and the server.
  PORT = 5000

  # Open a public ngrok tunnel that forwards to the local port.
  ngrok_tunnel = ngrok.connect(PORT)
  try:
    nest_asyncio.apply()
    print(f"Public URL: {ngrok_tunnel.public_url}")
    http_server = WSGIServer(('0.0.0.0', PORT), app)
    # Blocks until the server is stopped (e.g. the cell is interrupted).
    http_server.serve_forever()
  finally:
    # Close the tunnel whether the server exited, was interrupted, or failed to
    # start; otherwise it stays open and re-running this cell opens another one.
    ngrok.disconnect(ngrok_tunnel.public_url)