In [None]:
# !pip install -q datasets transformers evaluate peft trl bitsandbytes accelerate
# !pip install --upgrade -q accelerate
# !pip install -q python-Levenshtein
# !pip install -q langchain langchain-openai
# !pip install --upgrade --quiet  langchain langchain-openai faiss-cpu tiktoken

In [None]:
import sys
import os
# Make the parent of the current working directory importable so that the
# `utils` import below can be resolved from there.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from utils import setup_api_key

# NOTE(review): presumably loads API credentials from the JSON config file; the
# path is relative to the working directory -- confirm against utils.setup_api_key.
setup_api_key(file_path='../../config.json')

In [None]:
import os
from datasets import load_from_disk
from tqdm import tqdm

            
from typing import List
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
import json

def get_model_id(model_type, run_name, project_name, checkpoint_id):
    """Build the path identifying a model checkpoint.

    The components are joined with ``os.path.join`` in the order
    ``<model_type>/model_output/<project_name>/<run_name>/<checkpoint_id>``.
    Note that ``project_name`` precedes ``run_name`` in the path even though
    the parameters are declared the other way round.

    Args:
        model_type: First path component.
        run_name: Fourth path component.
        project_name: Third path component.
        checkpoint_id: Last path component.

    Returns:
        The joined path as a string.
    """
    path_parts = (model_type, "model_output", project_name, run_name, checkpoint_id)
    return os.path.join(*path_parts)

# Per-project settings, keyed by the short project name. Each entry holds the
# project name and the (working-directory-relative) paths of its train and
# test datasets, which load_project() passes to load_from_disk().
project_config = {
    "survey-json": dict(
        project_name="survey-json-model-inst",
        train_dataset_path="../datasets/survey_json_datasets_instruction_train",
        test_dataset_path="../datasets/survey_json_datasets_instruction_test",
    ),
    "schema": dict(
        project_name="schema-model-inst",
        train_dataset_path="../datasets/schema_datasets/schema_data_train",
        test_dataset_path="../datasets/schema_datasets/schema_data_test",
    ),
    "paraloq": dict(
        project_name="paraloq-model-inst",
        train_dataset_path="../datasets/paraloq/paraloq_data_train",
        test_dataset_path="../datasets/paraloq/paraloq_data_test",
    ),
    "nous": dict(
        project_name="nous-model-inst",
        train_dataset_path="../datasets/nous/nous_data_train",
        test_dataset_path="../datasets/nous/nous_data_test",
    ),
}

def load_project(project="schema"):
    """Load the test and train datasets of a configured project from disk.

    Args:
        project: Key into ``project_config``; defaults to ``"schema"``.

    Returns:
        A ``(test_dataset, train_dataset)`` tuple -- note that the test split
        comes first.

    Raises:
        KeyError: If ``project`` is not a key of ``project_config``.
    """
    paths = project_config[project]
    # Tuple elements are evaluated left to right, so the test split is loaded
    # before the train split.
    return (
        load_from_disk(paths["test_dataset_path"]),
        load_from_disk(paths["train_dataset_path"]),
    )

In [None]:
from utils import json_score, extract_json, content_score
from sklearn.metrics import f1_score, recall_score

# A sample counts as a correct prediction only when its key score and its
# value score both reach these thresholds.
KEY_SCORE_THRESHOLD = 0.99
VALUE_SCORE_THRESHOLD = 0.8

run_list = ["schema", "paraloq", "nous"]
for project in run_list:
    print(f"Running for project: {project}")
    # get_result(project=project)

    # Load the generated and reference responses for this project.
    with open(f"./{project}_instruction_generation.json", "r") as f:
        data = json.load(f)
    generated_responses = data["generated_responses"]
    actual_responses = data["actual_responses"]

    n = len(generated_responses)
    if n == 0:
        # Nothing to average over; skip instead of dividing by zero below.
        print("No generated responses found; skipping.")
        print("\n")
        continue

    overall_score = 0
    overall_key_score = 0
    overall_value_score = 0
    format_error_count = 0
    # NOTE(review): json_array_count is tallied but never reported below.
    json_array_count = 0

    # All reference labels are True, so the recall computed below equals the
    # fraction of samples whose scores met both thresholds.
    y_true = [True] * n
    y_pred = []
    for i in range(n):
        generated_response = generated_responses[i]
        actual_response = actual_responses[i]

        generated_json, format_error = extract_json(
            generated_response, project=project
        )
        actual_json, _ = extract_json(actual_response, project=project)
        if format_error["format_error"]:
            format_error_count += 1

        scores = json_score(generated_json, actual_json)
        y_pred.append(
            bool(
                scores["key"] >= KEY_SCORE_THRESHOLD
                and scores["value"] >= VALUE_SCORE_THRESHOLD
            )
        )
        overall_score += scores["overall"]
        overall_key_score += scores["key"]
        overall_value_score += scores["value"]
        if scores["json_array"]:
            json_array_count += 1

        # NOTE(review): the original code printed the content score for the
        # first sample and then left the loop with `break`, which meant only
        # one sample was scored while the averages still divided by n and
        # y_pred ended up shorter than y_true (f1_score rejects that). The
        # loop now covers every sample; the content score is still only
        # computed for the first sample because the cost of content_score()
        # is not visible from here -- widen this condition if every sample
        # should be scored.
        if i == 0:
            content_scores = content_score(generated_response, actual_response)
            print(f"Content score: {content_scores}")

    avg_score = overall_score / n
    f1 = f1_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    print(f"Average score: {avg_score}")
    print(f"Average key score: {overall_key_score / n}")
    print(f"Average value score: {overall_value_score / n}")
    print(f"Count of format errors: {format_error_count}")
    print(f"F1 Score: {f1}")
    print(f"Recall: {recall}")
    print("\n")

In [None]:
# chain_model = ChatOpenAI(temperature=0)
# class Metric(BaseModel):
#     setup: str = Field(description="question to set up a joke")
#     punchline: str = Field(description="answer to resolve the joke")

# # Set up a parser + inject instructions into the prompt template.
# parser = JsonOutputParser(pydantic_object=Metric)

# prompt = PromptTemplate(
#     template="Answer the user query.\n{format_instructions}\n{query}\n",
#     input_variables=["query"],
#     partial_variables={"format_instructions": parser.get_format_instructions()},
# )

# chain = prompt | chain_model | parser

# metric_query = f"tell me a joke"
# chain.invoke({"query": metric_query})