In [None]:
# !pip install --upgrade --quiet  langchain langchain-openai faiss-cpu tiktoken datasets

In [None]:
import sys
import os
# Append the parent of the current working directory to the import search path.
# Presumably this is what makes the `utils` import below resolvable — TODO confirm
# where `utils` actually lives relative to this notebook.
current_dir = os.getcwd()

sys.path.append(os.path.dirname(current_dir))

# Imported only after the sys.path change above; keep this ordering.
from utils import setup_api_key

# `setup_api_key` is defined in `utils` (not visible here). It is given a config
# file path two directories above the working directory.
# NOTE(review): presumably this loads the OpenAI credentials needed by the
# OpenAIEmbeddings / ChatOpenAI calls in later cells — confirm against utils.
setup_api_key(file_path='../../config.json')

In [None]:
import os
from datasets import load_from_disk
from tqdm import tqdm

from operator import itemgetter

from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from typing import List
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
import json

def get_model_id(model_type, run_name, project_name, checkpoint_id):
    """Build the path-style identifier of a model checkpoint.

    The components are joined with the platform path separator in the order
    ``model_type / "model_output" / project_name / run_name / checkpoint_id``.
    Note that ``project_name`` precedes ``run_name`` in the result even though
    the parameters are declared the other way round.
    """
    path_parts = (model_type, "model_output", project_name, run_name, checkpoint_id)
    return os.path.join(*path_parts)

# Per-project settings, keyed by the short project name passed to
# load_project() / run(). Each entry carries the project's display name and the
# on-disk locations (relative to the working directory) of its train and test
# datasets.
project_config = {
    "survey-json": dict(
        project_name="survey-json-model-inst",
        train_dataset_path="../datasets/survey_json_datasets_instruction_train",
        test_dataset_path="../datasets/survey_json_datasets_instruction_test",
    ),
    "schema": dict(
        project_name="schema-model-inst",
        train_dataset_path="../datasets/schema_datasets/schema_data_train",
        test_dataset_path="../datasets/schema_datasets/schema_data_test",
    ),
    "paraloq": dict(
        project_name="paraloq-model-inst",
        train_dataset_path="../datasets/paraloq/paraloq_data_train",
        test_dataset_path="../datasets/paraloq/paraloq_data_test",
    ),
    "nous": dict(
        project_name="nous-model-inst",
        train_dataset_path="../datasets/nous/nous_data_train",
        test_dataset_path="../datasets/nous/nous_data_test",
    ),
}

def load_project(project="schema"):
    """Load the datasets registered for ``project`` in ``project_config``.

    Args:
        project: Key into ``project_config``.

    Returns:
        A ``(test_dataset, train_dataset)`` tuple — test first, then train —
        each obtained with ``load_from_disk``.

    Raises:
        KeyError: If ``project`` has no entry in ``project_config``.
    """
    # The test split is loaded before the train split, matching the return order.
    test_split, train_split = (
        load_from_disk(project_config[project][path_key])
        for path_key in ("test_dataset_path", "train_dataset_path")
    )
    return test_split, train_split

def _split_inst_text(text):
    """Split one ``<s>[INST] ... [/INST] ...`` sample into its two halves.

    Returns ``(instruction, response)``. The instruction has the ``<s>[INST]``
    prefix removed and is stripped; the response is returned untouched.

    Raises:
        ValueError: If ``text`` does not contain the ``[/INST]`` delimiter.
    """
    # partition() splits on the FIRST delimiter only, so a response that itself
    # contains "[/INST]" no longer breaks the two-value unpacking.
    instruction, delimiter, response = text.partition("[/INST]")
    if not delimiter:
        raise ValueError("sample text does not contain the '[/INST]' delimiter")
    return instruction.replace("<s>[INST]", "").strip(), response


def _format_docs(docs):
    """Render retrieved documents as plain text for the prompt context."""
    return "\n\n".join(doc.page_content.strip() for doc in docs)


def run(project="schema", document_size=10):
    """Run retrieval-augmented generation over a project's test set.

    The first ``document_size`` training samples are turned into few-shot
    example documents and indexed in a FAISS vector store. Each test sample is
    then sent through a retrieve -> prompt -> chat model chain, and the
    generated and reference responses are written to
    ``./{project}_instruction_generation.json``.

    Args:
        project: Key into ``project_config`` selecting the datasets and the
            task sentence prepended to each test prompt.
        document_size: Maximum number of training samples to index as
            retrievable examples.

    Returns:
        A dict with two parallel lists: ``"generated_responses"`` (model
        outputs) and ``"actual_responses"`` (stripped reference responses).
        This is the same content that is written to the JSON file.

    Raises:
        KeyError: If ``project`` is not present in ``project_config``.
        ValueError: If a sample lacks the ``[/INST]`` delimiter, or if no
            training examples are available to index.
    """
    test_dataset, train_dataset = load_project(project=project)

    # Collect at most `document_size` (instruction, response) training examples.
    examples = []
    for data in train_dataset:
        if len(examples) >= document_size:
            break
        instruction, response = _split_inst_text(data["text"])
        examples.append({"instruction": instruction, "response": response})

    if not examples:
        # Fail with a clear message instead of an obscure error from the
        # vector store when it is asked to index nothing.
        raise ValueError(f"no training examples available for project {project!r}")

    tasks = {
        "schema": "Convert the raw data to ld+json format.",
        "survey-json": "Convert the question list to survey json.",
        "paraloq": "Generate the structured response for the given query.",
        "nous": "Generate the structured response for the given query.",
    }
    # Projects without a dedicated task sentence fall back to an empty one.
    task = tasks.get(project, "")

    documents = [
        f"Example {i+1}:\nInstruction: {example['instruction']}\nResponse: {example['response']}\n\n"
        for i, example in enumerate(examples)
    ]

    test_data = []
    for data in tqdm(test_dataset):
        instruction, response = _split_inst_text(data["text"])
        test_data.append({
            "prompt": f"{task}\nInstruction: {instruction}\nResponse:",
            "response": response,
        })

    vectorstore = FAISS.from_texts(
        documents, embedding=OpenAIEmbeddings()
    )

    retriever = vectorstore.as_retriever()

    template = """Response based on the following context:
    {context}

    Question: {question}
    """

    rag_prompt = ChatPromptTemplate.from_template(template)

    chain_model = ChatOpenAI(temperature=0)

    chain = (
        # The retriever yields Document objects; piping them through
        # _format_docs puts their text into the prompt rather than the Python
        # repr of a list of Documents (with escaped newlines).
        {"context": retriever | _format_docs, "question": RunnablePassthrough()}
        | rag_prompt
        | chain_model
        | StrOutputParser()
    )

    generated_responses = []
    actual_responses = []

    for data in tqdm(test_data):
        generated_responses.append(chain.invoke(data['prompt']))
        actual_responses.append(data['response'].strip())

    export_data = {
        "generated_responses": generated_responses,
        "actual_responses": actual_responses
    }

    # Persist the results; the in-memory dict is returned directly because it
    # holds only lists of strings, so re-reading the file would yield the same
    # value.
    with open(f'./{project}_instruction_generation.json', 'w', encoding="utf-8") as f:
        json.dump(export_data, f)

    return export_data

In [None]:
# Projects to process in this session. "survey-json" has an entry in
# project_config but is not included in this list.
run_list = ["schema", "paraloq", "nous"]
# run() is called once per project; its return value is not kept here — the
# results are available in the ./{project}_instruction_generation.json file that
# run() writes.
for project in run_list:
    run(project=project)

In [None]:
# Scaffold for a JSON-structured LLM call made once per (pred, ground_truth)
# pair: prompt -> chat model -> JSON parser.
chain_model = ChatOpenAI(temperature=0)
# NOTE(review): the fields below describe a joke (setup / punchline), not a
# metric — this looks like placeholder schema that still needs to be replaced
# with the intended metric fields. Confirm with the author.
class Metric(BaseModel):
    setup: str = Field(description="question to set up a joke")
    punchline: str = Field(description="answer to resolve the joke")

# Set up a parser + inject instructions into the prompt template.
parser = JsonOutputParser(pydantic_object=Metric)

# `format_instructions` is pre-filled from the parser; only `query` is supplied
# at invoke time.
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | chain_model | parser
# NOTE(review): `preds` and `ground_truths` are not defined in any cell visible
# here; unless they are created elsewhere, this loop raises NameError on a fresh
# kernel. Presumably they should come from the "generated_responses" /
# "actual_responses" lists that run() exports — confirm.
for pred, ground_truth in zip(preds, ground_truths):
    # NOTE(review): the query is an empty string; neither `pred` nor
    # `ground_truth` is interpolated into it yet.
    metric_query = f""
    # NOTE(review): the parsed result of this call is discarded.
    chain.invoke({"query": metric_query})