# Introduction

This notebook is a proof-of-concept for serving the [Dolly 2.0 Large Language Model](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) on IPUs and [using it from LangChain](https://python.langchain.com/en/latest/modules/models/llms.html) as a drop-in replacement for any `langchain.llm` object.


In [None]:
!pip install flask manifest-ml ngrok
!pip install "optimum-graphcore>=0.6.1, <0.7.0"
# For Dolly 2.0
!pip install "examples-utils[common] @ git+https://github.com/graphcore/examples-utils.git@latest_stable"
!pip install "git+https://github.com/graphcore/popxl-addons.git@sdk-release-3.2"


## Create an HTTP tunnel

We create an HTTP tunnel to make the served model accessible from a LangChain environment running on a different machine. This allows fast and easy experiments in a local LangChain notebook while delegating computationally intensive inference tasks to the IPUs. 

The tunnel is created using [ngrok](https://ngrok.com/). You will need to sign up to ngrok, get a free authentication token, and store the token in the environment variable `NGROK_AUTHTOKEN`. 

In [None]:
import asyncio
import getpass
import os

# Required to start the ngrok tunnel in a notebook environment: nest_asyncio
# allows `run_until_complete` to be called from the notebook's event loop.
import nest_asyncio
nest_asyncio.apply()

# ngrok token: keep a token that is already exported as NGROK_AUTHTOKEN and
# only prompt when none is set. The prompt avoids storing the secret in the
# notebook source and never overwrites a valid token with placeholder text.
if not os.environ.get("NGROK_AUTHTOKEN"):
    os.environ["NGROK_AUTHTOKEN"] = getpass.getpass("ngrok authentication token: ")

# Imported after the token is in the environment, preserving the original
# ordering (presumably safe either way, but not verified here).
import ngrok

# Needed to drive the coroutine to completion from a notebook cell.
loop = asyncio.get_event_loop()

tunnel = loop.run_until_complete(ngrok.werkzeug_develop())
print(f"""
Use the following call in your LangChain:

llm = ManifestWrapper(
    client=Manifest(
        client_name="huggingface",
        client_connection="{tunnel.url()}",
    ),
    llm_kwargs={{"client_timeout": 500, "max_tokens": 2048}}
)
""")
    
    


In [None]:
# On the ngrok free tier only one active tunnel is allowed at a time. This is a problem if the notebook times out (or keeps running in the background, in a closed page), as ngrok will then fail to create a new tunnel.
# I haven't been able to find out how to kill the old tunnels.
# ngrok.disconnect("https://2ba3-38-83-162-251.ngrok-free.app/")

## Pre-compile ML models

We compile and load the LLM into the IPU. This can take up to 10 minutes. Once compiled and loaded, the model can be used for inference with minimal latency.

In [None]:
NUMBER_OF_IPUS = 4

# Dolly inference configurations, keyed by the number of IPUs they target.
DOLLY_CONFIG_NAMES = {4: "dolly_pod4", 16: "dolly_pod16"}
if NUMBER_OF_IPUS not in DOLLY_CONFIG_NAMES:
    # Fail early with a clear message instead of silently picking the
    # 16-IPU configuration for an unsupported count.
    raise ValueError(
        f"NUMBER_OF_IPUS must be one of {sorted(DOLLY_CONFIG_NAMES)}, got {NUMBER_OF_IPUS}"
    )

# HuggingFace Optimum models (disabled; flip the flag to also build FLAN-T5).
SERVE_FLAN_T5 = False
if SERVE_FLAN_T5:
    # TODO Serve multiple models
    print("Creating FLAN T5 pipeline")
    from optimum.graphcore import pipeline

    size = {4: "large", 16: "xl"}
    flan_t5_pipeline = pipeline(
        "text2text-generation",
        model=f"google/flan-t5-{size[NUMBER_OF_IPUS]}",
        ipu_config=f"Graphcore/t5-{size[NUMBER_OF_IPUS]}-ipu",
        max_input_length=896,
    )
    print("Pre-compiling T5")
    r = flan_t5_pipeline("precompile this")
    print(r)
    print("T5 pre-compilation done.")


# PopXL models
print("Creating Dolly pipeline")

# Dolly 2.0: its helper modules live in a sibling directory, so that
# directory is put on the import path (once, even if the cell is re-run).
import sys
DOLLY_APP_DIR = '../dolly2-instruction-following'
if DOLLY_APP_DIR not in sys.path:
    sys.path.append(DOLLY_APP_DIR)
from utils.setup import dolly_config_setup
import api

sequence_length = 2048  # max 2048
micro_batch_size = 1

config_name = DOLLY_CONFIG_NAMES[NUMBER_OF_IPUS]
# Only the first value returned by the setup helper is used here.
config, *_ = dolly_config_setup(f"{DOLLY_APP_DIR}/config/inference.yml", "release", config_name)

dolly_pipeline = api.DollyPipeline(
    config, sequence_length=sequence_length, micro_batch_size=micro_batch_size
)

# A first call triggers compilation and loading, so later requests are fast.
print("Pre-compiling Dolly")
# `r` stays a module-level name on purpose; other cells may read it.
r = dolly_pipeline("precompile this")
print(r)
print("Dolly pre-compilation done.")

# Models exposed by this notebook, keyed by the name they are served under.
ML_MODELS_PROVIDED = {
    'ipu-dolly2' : dolly_pipeline
}


In [None]:
"""Flask app."""
import argparse
import io
import json
import logging
import os
from typing import Dict

import pkg_resources
from flask import Flask, Response, request

from manifest.api.response import ModelResponse

# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)
# Module-level logger.
logger = logging.getLogger(__name__)

# Port for the development server: FLASK_PORT from the environment, else 5000.
PORT = int(os.getenv("FLASK_PORT", 5000))

# Registry of model constructors; currently empty
# (a "huggingface": TextGenerationModel entry is left disabled).
MODEL_CONSTRUCTORS = dict()
    
def _sampling_kwargs(generation_args: Dict) -> Dict:
    """Map request fields onto keyword arguments for the Dolly pipeline.

    Only fields actually present in the request are forwarded, so a client
    that omits `temperature` or `top_k` falls back to the pipeline's own
    defaults instead of triggering a KeyError.
    """
    kwargs = {}
    if "temperature" in generation_args:
        kwargs["temperature"] = generation_args["temperature"]
    if "top_k" in generation_args:
        # The pipeline names this argument `k`.
        kwargs["k"] = generation_args["top_k"]
    return kwargs


@app.route("/completions", methods=["POST"])
def completions() -> Response:
    """Get completions for generation.

    Expects a JSON object with a `prompt` (str or list of str) and optional
    generation arguments (`temperature`, `top_k`). Returns a 200 response
    holding a serialized `ModelResponse`, or a 400 response with a JSON
    `message` when the request is malformed or generation fails.
    """
    try:
        payload = request.json
        if not isinstance(payload, dict) or "prompt" not in payload:
            raise ValueError("Request body must be a JSON object with a 'prompt' field")

        prompt = payload["prompt"]
        # Everything except the prompt is treated as generation arguments;
        # the request body itself is left unmodified.
        generation_args = {key: value for key, value in payload.items() if key != "prompt"}

        print("generation args:\n" + str(generation_args))

        if not isinstance(prompt, (str, list)):
            raise ValueError("Prompt must be a str or list of str")

        print(f"** Calling pipeline({prompt})")
        result_gens = dolly_pipeline(prompt, **_sampling_kwargs(generation_args))
        # Guard against a bare string result, which would otherwise be split
        # into one "completion" per character below.
        # NOTE(review): the pipeline is assumed to return a list of strings - confirm.
        if isinstance(result_gens, str):
            result_gens = [result_gens]
        print("Results")
        print(f"r = {result_gens}")

        # One entry per generated text; log-probabilities and tokens are not
        # provided by this server, so those fields are left empty.
        results = [
            {"text": text, "logprob": [], "tokens": [], "token_logprobs": []}
            for text in result_gens
        ]
        res_type = "text_completion"
        print("*** SENDING RESPONSE")
        # transform the result into the openai format
        return Response(
            json.dumps(ModelResponse(results, response_type=res_type).__dict__()),
            status=200,
        )
    except Exception as e:
        # Report every failure to the client as JSON rather than an HTML 500 page.
        logger.exception(e)
        return Response(
            json.dumps({"message": str(e)}),
            status=400,
        )

@app.route("/params", methods=["POST"])
def params() -> Dict:
    """Report the name and path of the served model.

    NOTE(review): both values are hard-coded to a FLAN-T5 identifier; confirm
    whether they should describe the model that is actually being served.
    """
    print('/params')
    model_id = "google/flan-t5-large"
    return {"model_name": model_id, "model_path": model_id}


@app.route("/")
def index() -> str:
    """Serve a placeholder landing page at the root URL."""
    page_text = "Index page"
    return page_text

# Announce start-up, then hand control to Flask's built-in server.
# This call blocks the cell until the server is stopped.
print("* Starting app*")
app.run(port=PORT, host="0.0.0.0")




In [None]:
# ngrok.kill()
# tunnel.close()