<a href="https://colab.research.google.com/github/eduseiti/ia368v_dd_class_04/blob/main/CoQa_via_prompt_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Apply zero-shot and few-shot learning with pretrained Language Models on the [Conversational Question Answering Challenge (CoQA) dataset](https://stanfordnlp.github.io/coqa/)

In [None]:
import requests
import os
import numpy as np
import pandas as pd

import pickle
from google.colab import drive

import json

import time

import re

from datetime import datetime

In [None]:
# Path handed to os.chdir() to become the working directory; it is relative,
# so it is resolved against whatever the current directory is at that point.
WORKING_FOLDER="drive/MyDrive/unicamp/ia368v_dd/aula_04"
# URL of the CoQA development set; its basename doubles as the local file name.
COQA_DEV_SET="https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"
# JSON file (inside the working directory) holding the API access information.
API_ACCESS="API_access_info.json"

# URL of the CoQA evaluation script.
COQA_EVALUATION_SCRIPT="https://nlp.stanford.edu/data/coqa/evaluate-v1.0.py"

Connect to Google Drive, as usual

In [None]:
# Mount Google Drive under /content/drive (force_remount=True is passed to drive.mount).
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# WORKING_FOLDER is relative ("drive/..."), while the Drive is mounted at
# /content/drive. Anchoring it at /content makes this cell idempotent: with the
# relative path, re-running the cell after the first chdir fails because the
# folder is then looked up inside itself.
os.chdir(os.path.join("/content", WORKING_FOLDER))

Download the CoQA development set

In [None]:
if not os.path.exists(os.path.basename(COQA_DEV_SET)):
    !wget {COQA_DEV_SET}
else:
    print("CoQa development dataset already downloaded...")

CoQa development dataset already downloaded...


Explore the development set

In [None]:
# Parse the downloaded development set. The encoding is given explicitly so
# that non-ASCII characters in the stories are decoded the same way regardless
# of the platform's default text encoding.
with open(os.path.basename(COQA_DEV_SET), 'r', encoding='utf-8') as inputFile:
    dev_set = json.load(inputFile)

In [None]:
# Top-level keys of the loaded development set.
dev_set.keys()

dict_keys(['version', 'data'])

In [None]:
# Number of entries in the development set.
len(dev_set['data'])

500

In [None]:
# Fields available in a single entry (here, the first one).
dev_set['data'][0].keys()

In [None]:
# All questions of the first entry.
dev_set['data'][0]['questions']

In [None]:
# A single answer record (index 7) of the first entry.
dev_set['data'][0]['answers'][7]

In [None]:
# Story text of the first entry.
dev_set['data'][0]['story']

Now, create templates for zero-shot and few-shot learning

In [None]:
# Task instruction that query_llama() prepends to the prompt when add_prompt is True.
TASK_PROMPT = "Answer the question and transcribe the text portion supporting your answer\n\n"

# Two placeholders: text, question.
ZERO_SHOT_TEMPLATE = "Text: {}\n\nQuestion: {} Transcribe the sentence where you found that answer."
# Six placeholders: example text, example question, example answer,
# example transcription, then the text and question actually being asked.
FEW_SHOT_TEMPLATE = "Example text: {}\n\nExample question: {}\nExample answer: {}\nExample transcription: {}\n\n\n\nText: {}\n\nQuestion: {}"
# Two placeholders: text, question. query_llama() uses it to restart the prompt
# (without the example) when the second question of an entry is asked.
FEW_SHOT_SEQUENCE_TEMPLATE = "Text: {}\n\nQuestion: {}"
# Three placeholders: previous answer, previous transcription, next question.
# Appended to the running prompt for every follow-up question.
FEW_SHOT_SEQUENCE_ADDITIONAL_QUESTION_TEMPLATE = "\nAnswer: {}\nTranscription: {}\n\nQuestion: {}"

In [None]:
# Identifiers for the query_type argument of query_llama().
FEW_SHOT_QUERY_TYPE="few_shot"
ZERO_SHOT_QUERY_TYPE="zero_shot"

In [None]:
# JSON payload template for the LLaMA completion request; "prompt" is a
# placeholder that query_llama() fills in before each POST.
LLAMA_API_DATA_PACKAGE={"prompt": None,
                        "temperature": 0.0,
                        "top_p": 1,
                        "max_length": 100}

# Query parameter template for the OpenAI API; "prompt" is a placeholder.
# NOTE(review): not referenced by the code visible in this part of the notebook.
OPENAI_API_QUERY_PARAMS={"model": "code-davinci-002",
                         "prompt": None,
                         "temperature": 0,
                         "max_tokens": 100,
                         "top_p": 1,
                         "frequency_penalty": 0,
                         "presence_penalty": 0}

In [None]:
# Pattern applied with re.match() to the generated text. It expects a leading
# line break, an "Answer:" line (group 1) and, on a following line, a
# "Transcription:" / "Transcriptions:" label followed by the supporting text
# (group 2). Character classes are written as [aA] / [tT]: inside a class "|"
# is a literal character, so the former [a|A] also accepted "|nswer:".
LLAMA_RESPONSE_REGEX=r"[\n\r][aA]nswer:(.+)[\n\r].*[tT]ranscriptions?:(.+)[\n\r]?"

## Define functions to access the Language Models APIs

In [None]:
def query_llama(test_entry, add_prompt=True, query_type=FEW_SHOT_QUERY_TYPE, example_entry=None):
    """Ask the LLaMA API every question of one dataset entry and collect the parsed answers.

    For few-shot queries the first prompt carries one worked example taken from
    ``example_entry``; from the second question on, the prompt is restarted
    without the example and grows with each previous answer/transcription,
    because the questions are conversational. For zero-shot queries every
    question is asked on its own using ZERO_SHOT_TEMPLATE.

    Each prompt is POSTed to ``{LLAMA_API_ENDPOINT}/complete``; the returned
    ``request_uuid`` is then polled at ``/get_result/{request_uuid}`` every
    10 seconds until the result is flagged ready. The generated text is parsed
    with LLAMA_RESPONSE_REGEX.

    Args:
        test_entry: dict with 'id', 'story' and 'questions'; every question is
            a dict with 'input_text' and 'turn_id'.
        add_prompt: when True, TASK_PROMPT is prepended to every prompt.
        query_type: FEW_SHOT_QUERY_TYPE or ZERO_SHOT_QUERY_TYPE.
        example_entry: dict with 'story', 'questions' and 'answers' (answers
            carrying 'input_text' and 'span_text'); required for few-shot.

    Returns:
        A list of dicts with keys 'id', 'turn_id', 'answer' and
        'transcription', one per question whose response could be parsed.
        Questions whose request failed or whose response did not match the
        regex are left out of the list.

    Raises:
        ValueError: on an unknown ``query_type`` or a few-shot query without
            ``example_entry``.
    """

    if query_type not in (FEW_SHOT_QUERY_TYPE, ZERO_SHOT_QUERY_TYPE):
        raise ValueError("Unknown query type: {}".format(query_type))

    if query_type == FEW_SHOT_QUERY_TYPE and example_entry is None:
        raise ValueError("example_entry is required for few-shot queries")

    test_entry_start_time = time.time()

    llama_responses = []

    # One (answer, transcription) pair per question already asked, in question
    # order. Kept apart from llama_responses so that a failed request or an
    # unparsed response cannot shift the pairing between a follow-up question
    # and the answer to the question right before it.
    conversation_history = []

    prompt_text = TASK_PROMPT if add_prompt else ""
    request_prompt = None

    for i, question in enumerate(test_entry['questions']):
        if query_type == FEW_SHOT_QUERY_TYPE:

            if i == 0:

                #
                # First time the prompt contains an example
                #

                request_prompt = prompt_text + FEW_SHOT_TEMPLATE.format(example_entry['story'],
                                                                        example_entry['questions'][0]['input_text'],
                                                                        example_entry['answers'][0]['input_text'],
                                                                        example_entry['answers'][0]['span_text'],
                                                                        test_entry['story'],
                                                                        question['input_text'])
            else:

                #
                # For all the subsequent questions, the prompt will accumulate the answers, as the questions are
                # conversational, i.e. they build on one another.
                #

                if i == 1:
                    # Restart the prompt without the example, beginning with the first question.
                    request_prompt = prompt_text + FEW_SHOT_SEQUENCE_TEMPLATE.format(test_entry['story'],
                                                                                     test_entry['questions'][i - 1]['input_text'])

                previous_answer, previous_transcription = conversation_history[i - 1]

                request_prompt += FEW_SHOT_SEQUENCE_ADDITIONAL_QUESTION_TEMPLATE.format(previous_answer,
                                                                                        previous_transcription,
                                                                                        question['input_text'])
        else:
            # Zero-shot: every question is asked independently, with no example and no history.
            # NOTE(review): the response is still parsed with LLAMA_RESPONSE_REGEX, which expects
            # "Answer:" / "Transcription:" labels; confirm the model produces them for this prompt.
            request_prompt = prompt_text + ZERO_SHOT_TEMPLATE.format(test_entry['story'],
                                                                     question['input_text'])

        print("--------------------------------------------")
        print("QUESTION #{}".format(i))
        print("--------------------------------------------\n")
        print(request_prompt)

        # Build a fresh payload instead of writing the prompt into the shared module-level template.
        request_data = dict(LLAMA_API_DATA_PACKAGE, prompt=request_prompt)

        # Used in the follow-up prompt when this turn produces no parsed answer.
        turn_answer = ""
        turn_transcription = ""

        request_start_time = time.time()

        r = requests.post(f"{access_info['LLAMA_API_ENDPOINT']}/complete", json=request_data)

        if r.ok:
            response=r.json()

            request_uuid=response["request_uuid"]

            ready = False
            while not ready:
                r = requests.get(f"{access_info['LLAMA_API_ENDPOINT']}/get_result/{request_uuid}")
                response = r.json()
                ready = response['ready']
                if ready:
                    print(response['generated_text'])

                    elapsed_time = time.time() - request_start_time

                    print("\n>> Request elapsed time: {}".format(elapsed_time))

                    m = re.match(LLAMA_RESPONSE_REGEX, response['generated_text'])

                    if m is not None:
                        turn_answer = m.group(1).strip()
                        turn_transcription = m.group(2).strip()

                        llama_responses.append({'id': test_entry['id'],
                                                'turn_id': question['turn_id'],
                                                'answer': turn_answer,
                                                'transcription': turn_transcription})
                    else:
                        print("No match!!!")

                        # Dump the raw bytes to make unexpected control characters visible.
                        for byte in bytes(response['generated_text'], 'utf-8'):
                            print(byte, end=" ")

                    if elapsed_time < 20:
                        print("Wait 10 seconds to avoid getting a 429 error...")

                        time.sleep(10)

                    break

                # Wait 10 seconds before checking again

                time.sleep(10)

            print("\n\n")
        else:
            print("\n\nREQUEST FAILED!!!\n\n")

        conversation_history.append((turn_answer, turn_transcription))

    # Elapsed time is "now - start"; the operands were previously swapped, which printed a negative value.
    print("Elapse total of {:.3f} s to execute all the {} queries".format(time.time() - test_entry_start_time, len(test_entry['questions'])))

    return llama_responses

Load the LLaMA API access information (including the endpoint URL) from the local JSON file

In [None]:
# Read the API access information; query_llama() takes the endpoint URL
# from access_info['LLAMA_API_ENDPOINT'].
with open(API_ACCESS) as access_file:
    access_info = json.loads(access_file.read())

### Select 5 entries to test

Leave the first story as the few-shot example.

In [None]:
# Fixed seed so that "Restart & Run All" always selects the same entries and
# the reported evaluation can be reproduced.
RANDOM_SEED = 42

# Entry 0 is kept as the few-shot example, so sample 5 distinct indices from 1..N-1.
entries_to_test = np.random.default_rng(RANDOM_SEED).choice(np.arange(1, len(dev_set['data'])), 5, replace=False)

In [None]:
# Query the first selected entry with the default (few-shot) query type, using entry 0 as the example.
llama_responses = query_llama(dev_set['data'][entries_to_test[0]], example_entry=dev_set['data'][0])

--------------------------------------------
QUESTION #0
--------------------------------------------

Answer the question and transcribe the text portion supporting your answer

Example text: Once upon a time, in a barn near a farm house, there lived a little white kitten named Cotton. Cotton lived high up in a nice warm place above the barn where all of the farmer's horses slept. But Cotton wasn't alone in her little home above the barn, oh no. She shared her hay bed with her mommy and 5 other sisters. All of her sisters were cute and fluffy, like Cotton. But she was the only white one in the bunch. The rest of her sisters were all orange with beautiful white tiger stripes like Cotton's mommy. Being different made Cotton quite sad. She often wished she looked like the rest of her family. So one day, when Cotton found a can of the old farmer's orange paint, she used it to paint herself like them. When her mommy and sisters found her they started laughing. 

"What are you doing, Cotton

In [None]:
# Timestamp (YYYYMMDD_HHMMSS) used to name this run's output file.
test_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

In [None]:
# Persist the parsed responses as indented JSON in a file named after the run timestamp.
with open("test_{}.json".format(test_timestamp), "w") as results_file:
    results_file.write(json.dumps(llama_responses, indent=4))

Download the evaluation script

In [None]:
if not os.path.exists(COQA_EVALUATION_SCRIPT):
    !wget {COQA_EVALUATION_SCRIPT}

--2023-03-20 00:13:21--  https://nlp.stanford.edu/data/coqa/evaluate-v1.0.py
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/coqa/evaluate-v1.0.py [following]
--2023-03-20 00:13:22--  https://downloads.cs.stanford.edu/nlp/data/coqa/evaluate-v1.0.py
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10564 (10K) [application/octet-stream]
Saving to: ‘evaluate-v1.0.py.1’


2023-03-20 00:13:22 (8.77 MB/s) - ‘evaluate-v1.0.py.1’ saved [10564/10564]



### Now execute the evaluation script for the executed tests

First, create a reference dataset containing only the tested queries

In [None]:
# Reference data restricted to the single entry that was actually queried,
# wrapped in the same "version" / "data" layout as the development set.
reference_dataset = dict(version=1.0,
                         data=[dev_set['data'][entries_to_test[0]]])

In [None]:
# File name used for the reference dataset handed to the evaluation script.
REFERENCE_DATASET="reference_dataset.json"

In [None]:
# Write the reference dataset as indented JSON.
with open(REFERENCE_DATASET, "w") as reference_file:
    reference_file.write(json.dumps(reference_dataset, indent=4))

In [None]:
# Name of the predictions file; built from test_timestamp with the same pattern
# used when the responses were saved. The identifier's spelling ("FILEAME") is
# kept because the evaluation command refers to it by this name.
EXECUTED_TESTS_FILEAME="test_{}.json".format(test_timestamp)

In [None]:
# Run the downloaded evaluation script on the reference data and the predictions file.
!python evaluate-v1.0.py --data-file {REFERENCE_DATASET} --pred-file {EXECUTED_TESTS_FILEAME} --human

{
  "children_stories": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "literature": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "mid-high_school": {
    "em": 91.2,
    "f1": 95.7,
    "turns": 20
  },
  "news": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "wikipedia": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "reddit": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "science": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "in_domain": {
    "em": 91.2,
    "f1": 95.7,
    "turns": 20
  },
  "out_domain": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "overall": {
    "em": 91.2,
    "f1": 95.7,
    "turns": 20
  }
}
{
  "children_stories": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "literature": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "mid-high_school": {
    "em": 65.0,
    "f1": 70.8,
    "turns": 20
  },
  "news": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "wikipedia": {
    "em": 0.0,
 