<a href="https://colab.research.google.com/github/dsineirobarreiro/ModelER/blob/main/api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Environment configuration

The following commands configure the environment to run on a GPU. To use the CPU instead, uncomment the third cell.

In [None]:
!CMAKE_ARGS="-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=61" FORCE_CMAKE=1 pip install llama-cpp-python==0.2.62 --force-reinstall --upgrade --no-cache-dir --verbose --no-build-isolation

In [None]:
!pip install fastapi pydantic-settings uvicorn langchain-experimental jsonref python-multipart ngrok

In [None]:
#!pip install llama-cpp-python

# 2. Download the model
You need to be logged in to Hugging Face to download the model.

In [None]:
!huggingface-cli download TheBloke/Llama-2-7B-Chat-GGUF llama-2-7b-chat.Q5_K_M.gguf --local-dir ./json/ --local-dir-use-symlinks False

# Model configuration

In [None]:
from langchain_community.llms.llamacpp import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field
from typing import List, Optional
from langchain.output_parsers import PydanticOutputParser
from langchain.chains.llm import LLMChain
from langchain.memory import ConversationBufferMemory
from langchain_experimental.chat_models import Llama2Chat
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
)
from langchain_core.messages import SystemMessage
from langchain.memory import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from jsonref import replace_refs
import json
from llama_cpp import LlamaGrammar
from typing import Literal

# Absolute path of the GGUF weights file handed to LlamaCpp below.
# Presumably the file fetched by the huggingface-cli cell (--local-dir ./json/)
# when the working directory is /content — confirm on non-Colab hosts.
model_path = '/content/json/llama-2-7b-chat.Q5_K_M.gguf'

# Callbacks support token-wise streaming
# (the only handler registered here writes to standard output).
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# One attribute of an extracted entity; used as the element type of
# Entity.attributes.
# NOTE: documented with `#` comments on purpose — a class docstring would be
# picked up by pydantic and added to the generated JSON schema.
class Attribute(BaseModel):
    # Attribute name.
    name: str = Field(description='attribute name')
    # Attribute type, kept as free-form text (no fixed set of values).
    type: str = Field(description='type of the atrtibute')

# Define your desired data structure.
# An entity found in the case scenario, together with its attributes.
class Entity(BaseModel):
    # Entity name.
    name: str = Field(description='entity name from the case scenario')
    # Attributes of the entity. Typed Optional, but no default value is given.
    # NOTE(review): whether this key may be omitted (rather than set to null)
    # differs between pydantic major versions — confirm against the installed one.
    attributes: Optional[List[Attribute]] = Field(description="colection of attributes from the entity")

# A named relation between two entities, with a cardinality on each side.
class Relation(BaseModel):
    # Relation name.
    name: str = Field(description='name of the relation')
    # Source side of the relation, stored as plain text; nothing in this model
    # checks that it matches the name of a declared Entity.
    source: str = Field(description='source entity of the relation')
    # Cardinality on the source side; restricted to these four labels.
    cardinality_of_source: Literal['Zero or One', 'Exactly One', 'Zero or Many', 'One or Many']
    # Target side of the relation, stored as plain text (same caveat as source).
    target: str = Field(description='target entity of the relation')
    # Cardinality on the target side; same four labels as the source side.
    cardinality_of_target: Literal['Zero or One', 'Exactly One', 'Zero or Many', 'One or Many']


# Root of the output structure: every extracted entity plus every relation.
# This is the model whose JSON schema is converted into the llama.cpp grammar.
class Main(BaseModel):
    # All entities extracted from the scenario.
    entities: List[Entity] = Field(description='Entity from the case scenario')
    # All relations between those entities.
    relations: List[Relation] = Field(description='colection of relations from the entities')

def json_schema_with_inlining(model):
    """Return the JSON schema of ``model`` as a string with all ``$ref``s inlined.

    Nested models are normally emitted once in a shared definitions table and
    referenced through ``$ref``. This helper resolves every reference in place
    and then drops the table, so the result is a single self-contained schema.

    Args:
        model: A pydantic model class.

    Returns:
        The inlined schema serialised with ``json.dumps``.
    """
    # Prefer the pydantic v2 API; fall back to the v1 name when it is absent.
    build_schema = getattr(model, "model_json_schema", None) or model.schema
    replaced = replace_refs(build_schema(), proxies=False)
    # Once references are inlined the definitions table is redundant.
    # pydantic v2 names it "$defs", pydantic v1 names it "definitions".
    for defs_key in ("$defs", "definitions"):
        replaced.pop(defs_key, None)
    return json.dumps(replaced)

# Grammar built from the inlined JSON schema of Main; passed to the LLM below
# so that generation is constrained by it.
grammar = LlamaGrammar.from_json_schema(json_schema_with_inlining(Main))

# System message placed at the start of every prompt sent to the model.
DEFAULT_SYSTEM_PROMPT = """\
You are an expert in data modeling for creating databases. Imagine a client is talking to you in order to create a database for their business.
They will present you a scenario where your goal will be to extract entities, their attributes and their relations so the first step for creating
a database can be achieve. As it is client who asks for this modeling, you must be very accurate and effective in the extraction so work step by step.
Source and target of the relations must be names of entities and every entity must be a participant of some relationship.
"""

# llama.cpp-backed LLM.
# NOTE(review): n_gpu_layers=-1 and max_tokens=-1 presumably mean "all layers
# on the GPU" and "no explicit token limit" — confirm against the pinned
# llama-cpp-python / LangChain versions.
llm = LlamaCpp(
    model_path=model_path,
    n_ctx=4096,
    n_gpu_layers=-1,
    max_tokens=-1,
    callback_manager=callback_manager,
    verbose=True,
    streaming=True,
    grammar=grammar,
    temperature=0.2,
)

# Chat-model wrapper around the raw LLM.
model = Llama2Chat(llm=llm)

# Prompt layout: system message, then the stored chat history, then the new
# user input.
template_messages = [
    SystemMessage(content=DEFAULT_SYSTEM_PROMPT),
    MessagesPlaceholder(variable_name="chat_history"),
    HumanMessagePromptTemplate.from_template("{input}"),
]
prompt_template = ChatPromptTemplate.from_messages(template_messages)

# The memory key matches the MessagesPlaceholder name above.
# A single memory object is created here and attached to the one chain, so
# everything run through `chain` shares the same chat history.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
chain = LLMChain(llm=model, prompt=prompt_template, memory=memory)

# Registry of ready-to-use chains, keyed first by model name and then by
# action, mirroring the '/{model}/{action}/' route of the API.
# Previously the 'llama2' entry was left empty and the chain was stored under a
# top-level 'generate' key, so a model -> action lookup could never succeed.
chains = {
    'llama2': {
        'generate': chain,
    },
}

# API configuration

In [None]:
from fastapi import FastAPI, Form
from fastapi.responses import StreamingResponse
from fastapi.middleware.cors import CORSMiddleware


# Application object that the route below is registered on and that uvicorn
# serves.
app = FastAPI()

# CORS: accept any origin, method and header, with credentials enabled.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive — confirm this is intended beyond a demo setup.
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_credentials=True,
    allow_methods=['*'],
    allow_headers=['*'],
)

async def gen(model, action, prompt: str):
    """Stream the chain's answer to ``prompt`` piece by piece.

    Args:
        model: Model name from the request path. Currently unused: the single
            module-level ``chain`` serves every request.
        action: Action name from the request path. Currently unused.
        prompt: User message, passed to the chain as its ``input`` variable.

    Yields:
        The text of each ``on_chat_model_stream`` event, in arrival order.
    """
    async for event in chain.astream_events({"input": prompt}, version='v1'):
        if event['event'] != 'on_chat_model_stream':
            continue
        chunk = event['data']['chunk']
        # StreamingResponse can only encode str/bytes. Message-chunk objects
        # carry their text in `.content`; plain string chunks pass through
        # unchanged.
        yield getattr(chunk, 'content', chunk)

@app.post('/{model}/{action}/')
async def generate(action: str, prompt: str = Form(), model: str = 'llama2'):
    """Handle ``POST /{model}/{action}/`` and stream the generated answer.

    Args:
        action: Action name taken from the URL path.
        prompt: User message, sent as a form field.
        model: Model name taken from the URL path. Because the name appears in
            the route path, FastAPI fills it from the URL; the default only
            matters when the function is called directly from Python.

    Returns:
        A ``StreamingResponse`` that relays whatever ``gen`` yields.
    """
    # Forward the *requested* model name; before, the module-level `model`
    # object was passed here because the path parameter was never declared.
    # NOTE(review): the media type is text/event-stream, but the pieces are
    # forwarded as-is, without `data:` framing — confirm clients expect that.
    return StreamingResponse(gen(model, action, prompt), media_type='text/event-stream')

# Ngrok tunnel

In [None]:
import asyncio
import ngrok
import uvicorn
import nest_asyncio

import os
from google.colab import userdata


async def setup():
    """Authenticate with ngrok and forward local port 8000 to the configured domain.

    The auth token is read from the Colab user secret named ``NGROK``.
    """
    # NOTE(review): nothing is awaited in this coroutine. If ngrok.forward
    # returns an awaitable when called inside a running loop, it may need
    # `await` — confirm against the ngrok package documentation.
    token = userdata.get("NGROK")
    ngrok.set_auth_token(token)
    ngrok.forward(
        8000,
        authtoken_from_env=True,
        # Change the following parameter with your Ngrok domain
        domain="bright-akita-pleasantly.ngrok-free.app",
    )

# Patch asyncio before starting further event loops; presumably required
# because the notebook kernel already runs one — confirm outside Colab.
nest_asyncio.apply()
# Open the ngrok tunnel first, then start serving.
asyncio.run(setup())
# Serve the FastAPI app. No host/port is given, so uvicorn's defaults apply;
# NOTE(review): confirm the default port matches the 8000 forwarded by setup().
uvicorn.run(app=app)