## ML Paper Tagger & LLM Evaluator

Here's what this notebook provides:
- Compares zero-shot, few-shot, CoT, and self-consistency
- Evaluates the predictions with an LLM-based evaluator built with LangChain

In [1]:
%%capture
# update or install the necessary libraries
!pip install --upgrade openai
!pip install --upgrade langchain
!pip install --upgrade python-dotenv
!pip install pandas

In [2]:
import ast
import json
import os
from typing import Dict, List

import IPython
import numpy as np
import openai
import pandas as pd
from dotenv import load_dotenv
from langchain.chains import LLMChain
from langchain.evaluation.qa import QAEvalChain
from langchain.llms import OpenAI
from langchain.prompts.example_selector.base import BaseExampleSelector
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.prompt import PromptTemplate
load_dotenv()

True

### Load Data

In [3]:
# Check that the API key is available. `load_dotenv()` in the imports cell has
# already copied any key from a local .env file into the environment, so no
# re-assignment is needed; copying `os.getenv(...)` back into `os.environ`
# raises an opaque TypeError when the key is missing.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your shell or add it to a .env file."
    )

# create a new LLM (`OpenAI` is imported in the imports cell at the top)
llm  = OpenAI()

def print_markdown(text):
    """Render ``text`` as Markdown in the notebook's output area."""
    rendered = IPython.display.Markdown(text)
    IPython.display.display(rendered)

# Load the validation records (each has an "abstract" and its "tags") and the
# pool of few-shot examples. `json.load` returns plain Python objects, not a
# DataFrame. The encoding is given explicitly because JSON files are UTF-8,
# while `open()` otherwise falls back to a platform-dependent default.
with open('../data/article-tags.json', encoding='utf-8') as f:
    val_data = json.load(f)

with open('../data/few_shot.json', encoding='utf-8') as f:
    few_shot_data = json.load(f)

  warn_deprecated(


### Few-Shot

In [4]:
# custom example selector

class CustomExampleSelector(BaseExampleSelector):
    """Example selector that draws few-shot examples at random from a pool.

    NOTE(review): ``select_examples`` takes a sample size here, and the notebook
    calls it directly instead of handing the selector to the prompt template.
    Confirm this against the ``BaseExampleSelector`` interface before passing an
    instance as ``example_selector=``.
    """

    def __init__(self, examples: List[Dict[str, str]]):
        # The list is stored as-is (not copied); add_example appends to it.
        self.examples = examples

    def add_example(self, example: Dict[str, str]) -> None:
        """Append ``example`` to the pool that ``select_examples`` draws from."""
        self.examples.append(example)

    def select_examples(self, size) -> List[dict]:
        """Return ``size`` distinct examples picked uniformly at random.

        Args:
            size: Number of examples to draw (an int). ``None`` draws one example.

        Returns:
            A plain ``list`` of the selected example dicts.

        Raises:
            ValueError: If ``size`` exceeds the number of stored examples
                (sampling is without replacement).
        """
        # Sample positions instead of the examples themselves, so the result
        # does not depend on how NumPy would coerce the example objects into an
        # array, and the declared ``List[dict]`` return type actually holds.
        positions = np.random.choice(len(self.examples), size=size, replace=False)
        return [self.examples[int(pos)] for pos in np.atleast_1d(positions)]
    

# Pool of candidate demonstrations for the few-shot prompt.
example_selector = CustomExampleSelector(few_shot_data)

# How a single demonstration (abstract followed by its tags) is rendered.
template = "\nAbstract: {abstract}\nTags: {tags}\n"

prompt = PromptTemplate(
    template=template,
    input_variables=["abstract", "tags"],
)

# Three demonstrations are drawn once, when this cell runs; the same three are
# then reused for every abstract sent through the few-shot prompt.
few_shot_prompt = FewShotPromptTemplate(
    prefix=(
        "Your task is to extract model names from machine learning paper abstracts. "
        "Your response is an an array of the model names in the format [\"model_name\"]. "
        "If you don't find model names in the abstract or you are not sure, return [\"NA\"]"
    ),
    examples=list(example_selector.select_examples(3)),
    example_prompt=prompt,
    example_separator="\n\n",
    suffix="Abstract: {input}\nTags:",
    input_variables=["input"],
)

### Zero-Shot

In [5]:
# Zero-shot prompt: task instruction plus the abstract, with no demonstrations.
zero_shot_template = (
    "\n"
    "Your task is extract model names from machine learning paper abstracts. "
    "Your response is an an array of the model names in the format [\"model_name\"]. "
    "If you don't find model names in the abstract or you are not sure, return [\"NA\"]\n"
    "\n"
    "Abstract: {abstract}\n"
    "Tags:\n"
)

zero_shot_prompt = PromptTemplate(
    template=zero_shot_template,
    input_variables=["abstract"],
)

### Self-Consistency with Zero-shot CoT

In [6]:
# Zero-shot chain-of-thought prompt. It is run as a batch several times further
# down so that each run yields a separate response from the model.
self_consistency_template = (
    "\n"
    "Your task is extract model names from machine learning paper abstracts delimited by ```. "
    "Your response is an an array of the model names in the format [\"model_name\"]. "
    "If you don't find model names in the abstract or you are not sure, return [\"NA\"]\n"
    "\n"
    "Let's think step by step: <steps>\n"
    "Abstract: ```{abstract}```\n"
    "Tags (should just output the model names in an array):\n"
)

self_consistency_prompt = PromptTemplate(
    template=self_consistency_template,
    input_variables=["abstract"],
)

### Evaluation Chain

In [7]:
# One LLMChain per prompting strategy; all three share the same `llm`.
zero_chain, few_shot_chain, self_consistency_chain = (
    LLMChain(llm=llm, prompt=strategy_prompt)
    for strategy_prompt in (zero_shot_prompt, few_shot_prompt, self_consistency_prompt)
)

In [8]:
# Wrap every validation abstract under the "input" key, which is the variable
# name the few-shot prompt's suffix expects.
abstracts = [{"input": record["abstract"]} for record in val_data]

In [9]:
# zero-shot predictions: the zero-shot prompt reads the "abstract" key, so the
# validation records can be passed to the chain as they are
zero_shot_predictions = zero_chain.apply(val_data)

# few-shot predictions: the few-shot prompt reads the "input" key instead
few_shot_predictions = few_shot_chain.apply(abstracts)

In [10]:
# Inspect the raw few-shot completions (a list of {'text': ...} dicts).
few_shot_predictions

[{'text': " ['Evol-Instruct', 'LLM', 'WizardLM', 'ChatGPT']"},
 {'text': " ['FLAN-T5', 'LoRA', 'AMR', 'Smatch']"},
 {'text': " ['AI', 'scientific knowledge', 'hypothesis machines']"},
 {'text': " ['PAXQA', 'QG', 'QA']"},
 {'text': " ['ChatGPT']"},
 {'text': " ['ViT', 'OpenCLIP']"},
 {'text': " ['SAM', 'IA', 'AIGC', 'Stable Diffusion']"},
 {'text': " ['Anything-3D', 'BLIP', 'Segment-Anything', 'text-to-image diffusion model', 'neural radiance field']"},
 {'text': " ['Chameleon', 'LLMs', 'GPT-4', 'ScienceQA', 'TabMWP', 'ChatGPT']"},
 {'text': " ['foundation models']"}]

In [11]:
# Settings for the self-consistency run
num_of_samples = 5  # how many times the whole validation set is sampled
len_of_abstracts = len(abstracts)
# Reply shape requested from the model that aggregates the samples
output_format = """[{"text": '[<model_names>]'}, {"text": '[<model_names>]'}, ...]"""

# Produce tags for every abstract `num_of_samples` times; each element of the
# resulting list is one full pass over the validation set.
self_consistency_samples = [
    self_consistency_chain.apply(val_data)
    for _ in range(num_of_samples)
]

def get_model_names(samples):
    """Ask a chat model to pick the most consistent tags across sampled runs.

    Args:
        samples: The sampled predictions to aggregate. The value is inserted
            into the user message with ``str.format``, so its ``str()`` form is
            what the model sees.

    Returns:
        The text content of the model's first choice, as a string. The system
        prompt asks for the layout described by ``output_format``, but the
        reply is returned unparsed.

    Notes:
        Reads the notebook-level ``len_of_abstracts``, ``num_of_samples`` and
        ``output_format`` to build the system prompt.
        NOTE(review): ``max_tokens=256`` may truncate the reply when there are
        many abstracts; confirm it is large enough for the data set.
    """
    # Use the same count everywhere in the prompt instead of a hard-coded "10".
    system_prompt = """You are given an array of outputs (delimited by ####) generated by a model. \n\nThe outputs contain a list of model names the model generated for {len_of_abstracts} different abstracts. There are {num_of_samples} different samples in the data.\n\nYour task is to pick the most common and consistent output from the {num_of_samples} different samples, corresponding to each of the {len_of_abstracts} different abstracts. \n\nOutput format should be {output_format}""".format(
        len_of_abstracts=len_of_abstracts,
        num_of_samples=num_of_samples,
        output_format=output_format,
    )
    # Send the argument that was passed in, not the global sample list.
    user_prompt = """####\n{samples}\n####""".format(samples=samples)

    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
        max_tokens=256,
        frequency_penalty=0,
        presence_penalty=0
    )

    return response.choices[0].message.content

# Aggregate the sampled runs into one prediction per abstract. The result is
# the chat model's reply as a single string; it is parsed in the next cell.
self_consistency_predictions = get_model_names(self_consistency_samples)

print(self_consistency_predictions)

[{"text": '["LLM", "Evol-Instruct", "LLaMA", "WizardLM", "OpenAI ChatGPT"]'}, {"text": '["FLAN-T5", "LoRA"]'}, {"text": '["large language models", "generative AI", "hypothesis machines"]'}, {"text": '["PAXQA", "lexically-constrained machine translation", "question generation (QG) model", "extractive QA models", "zero-shot", "synthetic data generation models"]'}, {"text": '["ChatGPT", "NA", "NA"]'}, {"text": '["ViT", "OpenCLIP"]'}, {"text": '["Segment-Anything Model (SAM)", "Inpaint Anything (IA)", "AIGC models", "Stable Diffusion"]'}, {"text": '["BLIP", "Segment-Anything", "text-to-image diffusion model"]'}, {"text": '["GPT-4", "ChatGPT"]'}, {"text": '["foundation models"]'}]


In [12]:
# Convert self_consistency_predictions (a string) to a list of objects.
# The string is untrusted model output, so it is parsed with ast.literal_eval,
# which only accepts Python literals, rather than executed with eval().
# A reply that is not a valid literal raises ValueError or SyntaxError here.
final_self_consistency_predictions = ast.literal_eval(self_consistency_predictions)

### LLM Evaluator

In [13]:
# evaluation chain from LangChain (using an LLM to evaluate)
eval_chain = QAEvalChain.from_llm(llm)


def _grade(predictions):
    """Grade ``predictions`` against the reference tags in ``val_data``."""
    return eval_chain.evaluate(
        val_data,
        predictions,
        question_key="abstract",
        prediction_key="text",
        answer_key="tags",
    )


zero_shot_graded_outputs = _grade(zero_shot_predictions)

fw_graded_ouputs = _grade(few_shot_predictions)

self_consistency_graded_outputs = _grade(final_self_consistency_predictions)

In [14]:
# Inspect the grader's verdicts for the zero-shot predictions.
zero_shot_graded_outputs

[{'results': ' CORRECT'},
 {'results': ' INCORRECT'},
 {'results': ' INCORRECT'},
 {'results': ' CORRECT'},
 {'results': ' CORRECT'},
 {'results': ' CORRECT'},
 {'results': ' CORRECT'},
 {'results': ' CORRECT'},
 {'results': ' CORRECT'},
 {'results': ' CORRECT'}]

In [15]:
def _is_correct(graded_output):
    """Return True when a graded output's verdict is exactly ``CORRECT``.

    All surrounding whitespace is stripped before comparing. Stripping only
    spaces would miss a verdict that starts with a newline or tab, and other
    completions in this notebook do start with a newline.
    """
    return graded_output['results'].strip() == "CORRECT"


# Running totals of CORRECT verdicts per prompting strategy.
zero_shot_correct = 0
fw_correct = 0
self_consistency_correct = 0

for i, eg in enumerate(val_data):
    print(f"Example {i}:")
    #print("Question: " + eg['abstract'])
    print("Real Answer: " + str(eg['tags']))
    print("----------------------")
    print("Zero-shot Predicted Answer: " + zero_shot_predictions[i]['text'])
    print("Zero-shot Predicted Grade: " + zero_shot_graded_outputs[i]['results'])
    print("----------------------")
    print("Few-shot Predicted Answer: " + few_shot_predictions[i]['text'])
    print("Few-shot Predicted Grade: " + fw_graded_ouputs[i]['results'])
    print("----------------------")
    print("Self-consistency Predicted Answer: " + final_self_consistency_predictions[i]['text'])
    print("Self-consistency Predicted Grade: " + self_consistency_graded_outputs[i]['results'])
    print("----------------------")
    print("\n")
    # track the number of "Correct" grades for each model
    if _is_correct(zero_shot_graded_outputs[i]):
        zero_shot_correct += 1
    if _is_correct(fw_graded_ouputs[i]):
        fw_correct += 1
    if _is_correct(self_consistency_graded_outputs[i]):
        self_consistency_correct += 1

# Accuracy = share of validation examples graded CORRECT.
print("Zero-shot Accuracy: " + str(zero_shot_correct/len(val_data)))
print("Few-shot Accuracy: " + str(fw_correct/len(val_data)))
print("Self-consistency Accuracy: " + str(self_consistency_correct/len(val_data)))

Example 0:
Real Answer: ['LLaMA', 'ChatGPT', 'WizardLM']
----------------------
Zero-shot Predicted Answer: 
["WizardLM", "LLM", "Evol-Instruct", "LLaMA", "OpenAI ChatGPT"]
Zero-shot Predicted Grade:  CORRECT
----------------------
Few-shot Predicted Answer:  ['Evol-Instruct', 'LLM', 'WizardLM', 'ChatGPT']
Few-shot Predicted Grade:  INCORRECT
----------------------
Self-consistency Predicted Answer: ["LLM", "Evol-Instruct", "LLaMA", "WizardLM", "OpenAI ChatGPT"]
Self-consistency Predicted Grade:  INCORRECT
----------------------


Example 1:
Real Answer: ['FLAN-T5', 'FLAN']
----------------------
Zero-shot Predicted Answer: 
["FLAN", "FLAN-T5", "AMR", "UD", "SRL", "AMR2.0", "AMR3.0", "BioAMR", "LoRA"]
Zero-shot Predicted Grade:  INCORRECT
----------------------
Few-shot Predicted Answer:  ['FLAN-T5', 'LoRA', 'AMR', 'Smatch']
Few-shot Predicted Grade:  CORRECT
----------------------
Self-consistency Predicted Answer: ["FLAN-T5", "LoRA"]
Self-consistency Predicted Grade:  INCORRECT
-----

TODO: LangChain's built-in grading prompt template is not optimal for this use case, and we developed a better-suited one in the course. As an exercise, try integrating that prompt into the grader.
