In [2]:
import openai
import os
import IPython
import json
import pandas as pd
import comet_llm
from dotenv import load_dotenv
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate, LLMChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

from langchain.evaluation.qa import QAEvalChain

from langchain.prompts import (
    FewShotChatMessagePromptTemplate,
    ChatPromptTemplate,
)

from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.prompt import PromptTemplate

# Load variables from a local .env file (if one exists) into the process
# environment so that the os.getenv(...) lookups in the following cells can
# find the Comet and OpenAI credentials.
load_dotenv()

True

In [3]:
# Comet credentials and destination. Any variable missing from the
# environment is None here and is passed as-is to comet_llm.log_prompt later.
COMET_API_KEY = os.getenv("COMET_API_KEY")
COMET_WORKSPACE = os.getenv("COMET_WORKSPACE")
COMET_PROJECT_NAME = os.getenv("COMET_PROJECT_NAME")

# Fail fast with a readable message when the OpenAI key is missing.
# Assigning os.getenv(...) back into os.environ changes nothing when the
# variable is set and raises an opaque TypeError (str expected, not NoneType)
# when it is not, so check for it explicitly instead.
if os.getenv("OPENAI_API_KEY") is None:
    raise EnvironmentError(
        "OPENAI_API_KEY is not set; define it in your environment or .env file."
    )

# Chat model that produces the predictions. No key is passed explicitly, so
# the client is expected to read OPENAI_API_KEY from the environment.
chat = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

In [4]:
# Load the validation records and the few-shot example pool from JSON.
# The results are plain Python objects as produced by json.load (not
# DataFrames). The encoding is given explicitly because open() otherwise
# falls back to the platform locale, which can mis-decode or reject UTF-8
# text on some systems.
with open('data/article-tags.json', encoding='utf-8') as f:
    val_data = json.load(f)

with open('data/few_shot.json', encoding='utf-8') as f:
    few_shot_data = json.load(f)

In [5]:
# create a custom selector for the few shot data
from langchain.prompts.example_selector.base import BaseExampleSelector
from typing import Dict, List
import numpy as np

class CustomExampleSelector(BaseExampleSelector):
    """Example selector that returns a random subset of the stored examples."""

    def __init__(self, examples: List[Dict[str, str]]):
        """Keep a reference to the pool of examples to sample from.

        The list is stored as-is (not copied), so ``add_example`` also appends
        to the list object the caller passed in.
        """
        self.examples = examples

    def add_example(self, example: Dict[str, str]) -> None:
        """Append ``example`` to the pool of stored examples."""
        self.examples.append(example)

    def select_examples(self, size) -> List[dict]:
        """Return ``size`` distinct examples drawn at random from the pool.

        Args:
            size: Number of examples to draw (an integer).

        Returns:
            A plain list holding the selected example dicts themselves
            (no copies are made).

        Raises:
            ValueError: Raised by ``np.random.choice`` when ``size`` exceeds
                the number of stored examples, since sampling is done
                without replacement.
        """
        # Sample positions rather than the examples themselves: this keeps
        # NumPy from coercing the examples into an array and lets us return
        # the real list promised by the annotation.
        picked = np.random.choice(len(self.examples), size=size, replace=False)
        return [self.examples[i] for i in picked]

In [6]:
# Random selector over the few-shot example pool loaded from few_shot.json.
example_selector = CustomExampleSelector(examples=few_shot_data)

In [7]:
# Plain-text layout of a single example: the abstract followed by its tags.
template = """
Abstract: {abstract}
Tags: {tags}
"""

# One demonstration rendered as a human turn (abstract) and an AI turn (tags).
human_template = ChatPromptTemplate.from_messages(
    [("human", "{abstract}"), ("ai", "{tags}")]
)

# Three randomly drawn demonstrations; rerun this cell to sample a new set.
few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=human_template,
    examples=list(example_selector.select_examples(3)),
)

# Full prompt: system instruction, the sampled demonstrations, then the
# abstract to tag.
# NOTE(review): the "an an" typo below is kept so the wording stays identical
# to the zero-shot prompt; fix both together if desired.
final_few_shot_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Your task is to extract model names from machine learning paper abstracts. "
            "Your response is an an array of the model names in the format [\"model_name\"]. "
            "If you don't find model names in the abstract or you are not sure, return [\"NA\"]",
        ),
        few_shot_prompt,
        ("human", "{abstract}"),
    ]
)

In [8]:
# zero-shot prompt
# The instruction sentence uses the same wording as the few-shot system
# message ("Your task is to extract ..."), so the two prompting strategies
# differ in their examples and message layout, not in the task wording.
zero_shot_template = """
Your task is to extract model names from machine learning paper abstracts. Your response is an an array of the model names in the format [\"model_name\"]. If you don't find model names in the abstract or you are not sure, return [\"NA\"]

Abstract: {abstract}
Tags:
"""

# The whole zero-shot prompt is sent as a single human message.
message_prompt = HumanMessagePromptTemplate.from_template(zero_shot_template)

final_zero_shot_prompt = ChatPromptTemplate.from_messages([message_prompt])

### Prompt Evaluation with QAEvalChain

In [9]:
# One chain per prompting strategy; both run on the same chat model.
few_shot_chain = LLMChain(prompt=final_few_shot_prompt, llm=chat)
zero_shot_chain = LLMChain(prompt=final_zero_shot_prompt, llm=chat)

# Few-shot predictions over the validation records.
fwpredictions = few_shot_chain.apply(val_data)

# Zero-shot predictions over the same validation records.
zeroshot_predictions = zero_shot_chain.apply(val_data)

In [17]:
# Peek at the raw text of the first few-shot prediction.
fwpredictions[0]["text"]

"['LLM', 'Evol-Instruct', 'LLaMA', 'WizardLM', 'OpenAI ChatGPT']"

In [15]:
# QA Evaluation Chain
llm_choice = "text-davinci-003"

if llm_choice == "text-davinci-003":
    # Grade with the completion model named by llm_choice.
    ev_llm = OpenAI(model_name=llm_choice)
    eval_chain = QAEvalChain.from_llm(ev_llm)
    evaluator_name = llm_choice
else:
    # Any other choice grades with the chat model itself, so record that
    # model's name instead of the llm_choice string, which is not used to
    # build the evaluator in this branch.
    eval_chain = QAEvalChain.from_llm(chat)
    evaluator_name = chat.model_name

# zero-shot
zero_graded_outputs = eval_chain.evaluate(
    val_data,
    zeroshot_predictions,
    question_key="abstract",
    prediction_key="text",
    answer_key="tags",
)

# few-shot
fw_graded_outputs = eval_chain.evaluate(
    val_data,
    fwpredictions,
    question_key="abstract",
    prediction_key="text",
    answer_key="tags",
)

# Name of the model that produced the predictions. Both chains were built
# with the same `chat` instance, so reading it from either chain is enough.
model_name = few_shot_chain.llm.model_name


def log_predictions(prompt_template, predictions, graded_outputs, strategy):
    """Log one Comet prompt record per validation example.

    Args:
        prompt_template: Prompt object whose ``format(abstract=...)`` gives
            the prompt text that is logged.
        predictions: Per-example outputs; each item's ``'text'`` is logged
            as the model output.
        graded_outputs: Per-example evaluator outputs; each item's
            ``'results'`` is logged as the verdict.
        strategy: Tag naming the prompting strategy, e.g. ``"few-shot"``.

    Raises:
        ValueError: If the three sequences do not line up one-to-one with
            ``val_data``.
    """
    if not (len(val_data) == len(predictions) == len(graded_outputs)):
        raise ValueError(
            "val_data, predictions and graded_outputs must have the same length"
        )

    for example, prediction, graded in zip(val_data, predictions, graded_outputs):
        verdict = str(graded['results'])
        comet_llm.log_prompt(
            # "\nAI: " marks where the model's answer starts in the logged prompt.
            prompt=prompt_template.format(abstract=example['abstract']) + "\nAI: ",
            tags=[evaluator_name, verdict, strategy],
            metadata={
                "expected_response": str(example['tags']),
                "llm_evaluator_result": verdict,
                "llm_evaluator": evaluator_name,
                "response_llm": model_name,
            },
            output=prediction['text'],
            api_key=COMET_API_KEY,
            workspace=COMET_WORKSPACE,
            project=COMET_PROJECT_NAME,
        )


# few-shot prompts
log_predictions(final_few_shot_prompt, fwpredictions, fw_graded_outputs, "few-shot")

# do the same for zero-shot predictions
log_predictions(final_zero_shot_prompt, zeroshot_predictions, zero_graded_outputs, "zero-shot")