<a href="https://colab.research.google.com/github/eduseiti/ia368v_dd_class_04/blob/main/CoQa_via_prompt_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Apply zero-shot and few-shot learning with pretrained Language Models on the [Conversational Question Answering Challenge (CoQA) dataset](https://stanfordnlp.github.io/coqa/)

In [19]:
!pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.2-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.1/70.1 KB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 KB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multidict<7.0,>=4.5
  Downloading multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [20]:
import requests
import os
import numpy as np
import pandas as pd

import pickle
from google.colab import drive

import json

import time

import re

from datetime import datetime

import openai

In [2]:
# Absolute path of the project folder inside the mounted Google Drive (the
# drive is mounted under /content/drive). Keeping it absolute means
# `os.chdir(WORKING_FOLDER)` still works when the cell is re-run after the
# working directory has already been changed.
WORKING_FOLDER="/content/drive/MyDrive/unicamp/ia368v_dd/aula_04"
# URL of the CoQA development set (JSON); its basename is used as the local file name.
COQA_DEV_SET="https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"
# JSON file, stored in the working folder, holding the API access information.
API_ACCESS="API_access_info.json"

# URL of the CoQA evaluation script; its basename is used as the local file name.
COQA_EVALUATION_SCRIPT="https://nlp.stanford.edu/data/coqa/evaluate-v1.0.py"

Connect to Google Drive, as usual

In [3]:
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
os.chdir(WORKING_FOLDER)

Download the CoQa development set

In [6]:
if not os.path.exists(os.path.basename(COQA_DEV_SET)):
    !wget {COQA_DEV_SET}
else:
    print("CoQa development dataset already downloaded...")

CoQa development dataset already downloaded...


Read and explore the development set

In [5]:
# Parse the locally stored development set (named after the URL basename).
with open(os.path.basename(COQA_DEV_SET), mode='r') as dev_set_file:
    dev_set = json.load(dev_set_file)

In [7]:
dev_set.keys()

dict_keys(['version', 'data'])

In [8]:
len(dev_set['data'])

500

Download the evaluation script

In [9]:
if not os.path.exists(os.path.basename(COQA_EVALUATION_SCRIPT)):
    !wget {COQA_EVALUATION_SCRIPT}
else:
    print("Evaluation script already downloaded...")

Evaluation script already downloaded...


Now, create templates for zero-shot and few-shot learning

In [10]:
# Task instructions placed in front of the prompt when the query functions
# are called with add_prompt=True; the variant is chosen by ask_transcription.
TASK_PROMPT = "Read the text, answer the questions and transcribe the text portion supporting your answer:\n\n"
TASK_PROMPT_NO_TRANSCRIPTION = "Read the text and answer the questions:\n\n"


# Zero-shot, asking for a transcription.
#   first question placeholders: story, question
#   next questions placeholders: previous answer, previous transcription, question
ZERO_SHOT_FIRST_QUESTION_TEMPLATE="Text: {}\n\nQuestion: {} Answer the question and transcribe the sentence where you found it."
ZERO_SHOT_NEXT_QUESTIONS_TEMPLATE="\nAnswer: {}\nTranscription: {}\n\nQuestion: {}"

# Zero-shot, answer only.
#   first question placeholders: story, question
#   next questions placeholders: previous answer, question
ZERO_SHOT_FIRST_QUESTION_TEMPLATE_NO_TRANSCRIPTION="Text: {}\n\nQuestion: {}"
ZERO_SHOT_NEXT_QUESTIONS_TEMPLATE_NO_TRANSCRIPTION="\nAnswer: {}\n\nQuestion: {}"


# Few-shot, asking for a transcription.
#   FEW_SHOT_TEMPLATE placeholders: example story, example question, example answer,
#       example transcription, test story, test question
#   FEW_SHOT_SEQUENCE_TEMPLATE placeholders: test story, first test question
#       (used to restart the prompt, without the example, on the second question)
#   FEW_SHOT_SEQUENCE_ADDITIONAL_QUESTION_TEMPLATE placeholders: previous answer,
#       previous transcription, question
FEW_SHOT_TEMPLATE="Example text: {}\n\nExample question: {}\nExample answer: {}\nExample transcription: {}\n\n\n\nText: {}\n\nQuestion: {}"
FEW_SHOT_SEQUENCE_TEMPLATE="Text: {}\n\nQuestion: {}"
FEW_SHOT_SEQUENCE_ADDITIONAL_QUESTION_TEMPLATE = "\nAnswer: {}\nTranscription: {}\n\nQuestion: {}"

# Few-shot, answer only (same placeholders minus the transcriptions).
FEW_SHOT_TEMPLATE_NO_TRANSCRIPTION="Example text: {}\n\nExample question: {}\nExample answer: {}\n\n\n\nText: {}\n\nQuestion: {}"
FEW_SHOT_SEQUENCE_ADDITIONAL_QUESTION_TEMPLATE_NO_TRANSCRIPTION = "\nAnswer: {}\n\nQuestion: {}"

In [11]:
# Values accepted for the `query_type` argument of the query functions; the
# chosen value is also embedded in the results file name.
FEW_SHOT_QUERY_TYPE="few_shot"
ZERO_SHOT_QUERY_TYPE="zero_shot"

Results filename format:   

```
    test_<llama|openai>_<few_shot|zero_shot>_<YYYYMMDD_HHMMSS>.json
```



In [12]:
TEST_RESULTS_FILENAME_FORMAT="test_{}_{}_{}.json"

In [13]:
# Payload template for the LLaMA completion endpoint; "prompt" is filled in
# for each request.
LLAMA_API_DATA_PACKAGE={"prompt": None,
                        "temperature": 0.0,
                        "top_p": 1,
                        "max_length": 100}

# Keyword arguments template for the OpenAI completion call; "model" and
# "prompt" are filled in for each request.
OPENAI_API_QUERY_PARAMS={"model": None,
                         "prompt": None,
                         "temperature": 0,
                         "max_tokens": 50,
                         "top_p": 1,
                         "frequency_penalty": 0,
                         "presence_penalty": 0}

In [14]:
# Patterns used with `re.match` to parse the LLaMA completion text.
# Raw strings keep the backslashes for the regex engine, and the character
# classes are `[aA]` / `[tT]` so that a literal "|" is no longer accepted in
# place of the first letter.

# Group 1: answer; group 2: transcription given on a later "Transcription:" line.
LLAMA_RESPONSE_REGEX=r".*[\n\r]*[aA]nswer:(.+)[\n\r].*[tT]ranscription[s]?:(.+)[\n\r]?"
# Fallback - group 1: text up to the last period of the answer line; group 2: the rest of that line.
LLAMA_RESPONSE_EMBEDDED_TRANSCRIPTION=r".*[\n\r]*[aA]nswer:(.+)\.(.+)[\n\r]?"
# Group 1: answer (rest of the "Answer:" line).
LLAMA_RESPONSE_NO_TRANSCRIPTION_REGEX=r".*[\n\r]*[aA]nswer:(.+)[\n\r]?"

In [43]:
# Patterns used with `re.match` to parse the OpenAI completion text.
# Raw strings keep the backslashes for the regex engine, and the character
# classes are `[aA]` / `[tT]` so that a literal "|" is no longer accepted in
# place of the first letter.

# Group 1: answer; group 2: transcription given on a later "Transcription:" line.
OPENAI_RESPONSE_REGEX=r".*[\n\r]*[aA]nswer:(.+)[\n\r].*[tT]ranscription[s]?:(.+)[\n\r]?"
# Fallback - group 1: text up to the last period of the answer line; group 2: the rest of that line.
OPENAI_RESPONSE_EMBEDDED_TRANSCRIPTION=r".*[\n\r]*[aA]nswer:(.+)\.(.+)[\n\r]?"
# Group 1: answer (rest of the "Answer:" line).
OPENAI_RESPONSE_NO_TRANSCRIPTION_REGEX=r".*[\n\r]*[aA]nswer:(.+)[\n\r]?"

## Define functions to access the Language Models APIs

In [24]:
def build_request_prompt(query_type, i, prompt_text, request_prompt, example_entry, test_entry, current_responses, ask_transcription):
    """Return the prompt to send for question number `i` of `test_entry`.

    For the first question (i == 0) a brand new prompt is built from
    `prompt_text` plus the story and the question (preceded, in few-shot mode,
    by the first question/answer of `example_entry`). For every later question
    the previous response is appended to `request_prompt` followed by the new
    question, because the questions are conversational and build on one
    another. In few-shot mode the prompt is restarted without the example when
    i == 1.

    Args:
        query_type: FEW_SHOT_QUERY_TYPE or ZERO_SHOT_QUERY_TYPE; any other
            value returns `request_prompt` unchanged.
        i: zero-based index of the question being asked.
        prompt_text: task instruction put in front of a freshly built prompt.
        request_prompt: prompt accumulated by the previous calls.
        example_entry: entry supplying the few-shot example (only read when
            query_type is few-shot and i == 0).
        test_entry: entry under test; its 'story' and 'questions' are read.
        current_responses: responses collected so far; entry i - 1 provides
            'answer' (and 'transcription' when ask_transcription is set).
        ask_transcription: selects the templates that include transcriptions.

    Returns:
        The prompt string for question `i`.
    """
    few_shot = query_type == FEW_SHOT_QUERY_TYPE

    if not few_shot and query_type != ZERO_SHOT_QUERY_TYPE:
        # Unknown query type: hand the prompt back untouched.
        return request_prompt

    question = test_entry['questions'][i]['input_text']

    if i == 0:
        # Opening turn: the prompt is created from scratch.
        if few_shot:
            # The example is only shown in this very first prompt.
            example_question = example_entry['questions'][0]['input_text']
            example_answer = example_entry['answers'][0]

            if ask_transcription:
                opening = FEW_SHOT_TEMPLATE.format(example_entry['story'],
                                                   example_question,
                                                   example_answer['input_text'],
                                                   example_answer['span_text'],
                                                   test_entry['story'],
                                                   question)
            else:
                opening = FEW_SHOT_TEMPLATE_NO_TRANSCRIPTION.format(example_entry['story'],
                                                                    example_question,
                                                                    example_answer['input_text'],
                                                                    test_entry['story'],
                                                                    question)
        elif ask_transcription:
            opening = ZERO_SHOT_FIRST_QUESTION_TEMPLATE.format(test_entry['story'], question)
        else:
            opening = ZERO_SHOT_FIRST_QUESTION_TEMPLATE_NO_TRANSCRIPTION.format(test_entry['story'], question)

        return prompt_text + opening

    # Follow-up turn: extend the running conversation with the previous
    # response and the new question.
    if few_shot:
        if i == 1:
            # Restart the prompt with only the test story and its first question.
            request_prompt = prompt_text + FEW_SHOT_SEQUENCE_TEMPLATE.format(test_entry['story'],
                                                                             test_entry['questions'][i - 1]['input_text'])

        with_transcription = FEW_SHOT_SEQUENCE_ADDITIONAL_QUESTION_TEMPLATE
        answer_only = FEW_SHOT_SEQUENCE_ADDITIONAL_QUESTION_TEMPLATE_NO_TRANSCRIPTION
    else:
        with_transcription = ZERO_SHOT_NEXT_QUESTIONS_TEMPLATE
        answer_only = ZERO_SHOT_NEXT_QUESTIONS_TEMPLATE_NO_TRANSCRIPTION

    previous = current_responses[i - 1]

    if ask_transcription:
        follow_up = with_transcription.format(previous['answer'], previous['transcription'], question)
    else:
        follow_up = answer_only.format(previous['answer'], question)

    return request_prompt + follow_up

In [50]:
def query_openai(test_entry, which_model='code-davinci-002', add_prompt=True, query_type=FEW_SHOT_QUERY_TYPE, example_entry=None, ask_transcription=False):
    """Ask an OpenAI completion model every question of one CoQA entry, turn by turn.

    The prompt for each turn is produced by `build_request_prompt`, which feeds
    the responses parsed so far back into the conversation.

    Args:
        test_entry: entry under test; its 'id', 'story' and 'questions' are read.
        which_model: model name sent in the completion request.
        add_prompt: when True, the task instruction is put in front of the prompt.
        query_type: FEW_SHOT_QUERY_TYPE or ZERO_SHOT_QUERY_TYPE.
        example_entry: entry used as the example for few-shot queries.
        ask_transcription: when True, the transcription is also parsed from the
            completion text.

    Returns:
        A list with one dict per question, holding 'id', 'turn_id', 'answer'
        and 'transcription'. Turns whose completion could not be parsed get
        empty strings so that the list stays aligned with the questions.
    """

    test_entry_start_time = time.time()

    openai_responses = []

    if add_prompt:
        prompt_text = TASK_PROMPT if ask_transcription else TASK_PROMPT_NO_TRANSCRIPTION
    else:
        prompt_text = ""

    request_prompt = ""

    openai.api_key = access_info["OPENAI_API_KEY"]

    for i in range(len(test_entry['questions'])):

        request_prompt = build_request_prompt(query_type, i, prompt_text, request_prompt, example_entry, test_entry, openai_responses, ask_transcription)

        print("--------------------------------------------")
        print("QUESTION #{}".format(i))
        print("--------------------------------------------\n")
        print(request_prompt)

        # Build the request from a copy so the shared module-level template is
        # never modified by a query.
        request_params = dict(OPENAI_API_QUERY_PARAMS, prompt=request_prompt, model=which_model)

        request_start_time = time.time()

        response = openai.Completion.create(**request_params)

        generated_text = response['choices'][0]['text']

        print(generated_text)

        elapsed_time = time.time() - request_start_time

        print("\n>> Request elapsed time: {:.3f}".format(elapsed_time))

        if ask_transcription:
            m = re.match(OPENAI_RESPONSE_REGEX, generated_text)
        else:
            m = re.match(OPENAI_RESPONSE_NO_TRANSCRIPTION_REGEX, generated_text)

        if m is None:
            print("Try another match...")

            m = re.match(OPENAI_RESPONSE_EMBEDDED_TRANSCRIPTION, generated_text)

        if m is not None:
            answer_text = m.group(1).strip()
            transcription_text = ""

            if ask_transcription and (len(m.groups()) > 1):
                transcription_text = m.group(2).strip()

            openai_responses.append({'id': test_entry['id'],
                                     'turn_id': test_entry['questions'][i]['turn_id'],
                                     'answer': answer_text,
                                     'transcription': transcription_text})
        else:
            print("No match!!!")

            # Dump the raw bytes of the completion to help diagnose why it did
            # not match. The text lives under choices[0]['text'] in the OpenAI
            # response (there is no 'generated_text' key).
            for byte in bytes(generated_text, 'utf-8'):
                print(byte, end=" ")

            #
            # Add empty response to avoid breaking the treatment.
            #

            openai_responses.append({'id': test_entry['id'],
                                     'turn_id': test_entry['questions'][i]['turn_id'],
                                     'answer': "",
                                     'transcription': ""})

    print("Elapse total of {:.3f} s to execute all the {} queries".format(time.time() - test_entry_start_time, len(test_entry['questions'])))

    return openai_responses

In [16]:
def query_llama(test_entry, add_prompt=True, query_type=FEW_SHOT_QUERY_TYPE, example_entry=None, ask_transcription=False):
    """Ask the LLaMA API every question of one CoQA entry, turn by turn.

    Each prompt is posted to the `/complete` endpoint, and the result is then
    polled from `/get_result/<request_uuid>` every 10 seconds until it is
    ready. The prompt for each turn is produced by `build_request_prompt`,
    which feeds the responses parsed so far back into the conversation.

    Args:
        test_entry: entry under test; its 'id', 'story' and 'questions' are read.
        add_prompt: when True, the task instruction is put in front of the prompt.
        query_type: FEW_SHOT_QUERY_TYPE or ZERO_SHOT_QUERY_TYPE.
        example_entry: entry used as the example for few-shot queries.
        ask_transcription: when True, the transcription is also parsed from the
            generated text.

    Returns:
        A list with one dict per question, holding 'id', 'turn_id', 'answer'
        and 'transcription'. Turns whose request failed or whose generated
        text could not be parsed get empty strings so that the list stays
        aligned with the questions.
    """

    test_entry_start_time = time.time()

    llama_responses = []

    if add_prompt:
        prompt_text = TASK_PROMPT if ask_transcription else TASK_PROMPT_NO_TRANSCRIPTION
    else:
        prompt_text = ""

    request_prompt = ""

    for i in range(len(test_entry['questions'])):

        request_prompt = build_request_prompt(query_type, i, prompt_text, request_prompt, example_entry, test_entry, llama_responses, ask_transcription)

        print("--------------------------------------------")
        print("QUESTION #{}".format(i))
        print("--------------------------------------------\n")
        print(request_prompt)

        # Build the payload from a copy so the shared module-level template is
        # never modified by a query.
        request_data = dict(LLAMA_API_DATA_PACKAGE, prompt=request_prompt)

        request_start_time = time.time()

        r = requests.post(f"{access_info['LLAMA_API_ENDPOINT']}/complete", json=request_data)

        if not r.ok:
            print("\n\nREQUEST FAILED!!!\n\n")

            # The next turn reads llama_responses[i - 1], so a failed request
            # must still leave an (empty) entry behind; otherwise the following
            # prompt would use the wrong turn or raise an IndexError.
            llama_responses.append({'id': test_entry['id'],
                                    'turn_id': test_entry['questions'][i]['turn_id'],
                                    'answer': "",
                                    'transcription': ""})
            continue

        request_uuid = r.json()["request_uuid"]

        while True:
            r = requests.get(f"{access_info['LLAMA_API_ENDPOINT']}/get_result/{request_uuid}")
            response = r.json()

            if response['ready']:
                break

            # Wait 10 seconds before checking again

            time.sleep(10)

        generated_text = response['generated_text']

        print(generated_text)

        elapsed_time = time.time() - request_start_time

        print("\n>> Request elapsed time: {:.3f}".format(elapsed_time))

        if ask_transcription:
            m = re.match(LLAMA_RESPONSE_REGEX, generated_text)
        else:
            m = re.match(LLAMA_RESPONSE_NO_TRANSCRIPTION_REGEX, generated_text)

        if m is None:
            print("Try another match...")

            m = re.match(LLAMA_RESPONSE_EMBEDDED_TRANSCRIPTION, generated_text)

        if m is not None:
            answer_text = m.group(1).strip()
            transcription_text = ""

            if ask_transcription and (len(m.groups()) > 1):
                transcription_text = m.group(2).strip()

            llama_responses.append({'id': test_entry['id'],
                                    'turn_id': test_entry['questions'][i]['turn_id'],
                                    'answer': answer_text,
                                    'transcription': transcription_text})
        else:
            print("No match!!!")

            # Dump the raw bytes of the generated text to help diagnose why it
            # did not match.
            for byte in bytes(generated_text, 'utf-8'):
                print(byte, end=" ")

            #
            # Add empty response to avoid breaking the treatment.
            #

            llama_responses.append({'id': test_entry['id'],
                                    'turn_id': test_entry['questions'][i]['turn_id'],
                                    'answer': "",
                                    'transcription': ""})

        if elapsed_time < 20:
            print("Wait 10 seconds to avoid getting a 429 error...")

            time.sleep(10)

        print("\n\n")

    print("Elapse total of {:.3f} s to execute all the {} queries".format(time.time() - test_entry_start_time, len(test_entry['questions'])))

    return llama_responses

In [54]:
def execute_test(test_set_filename, test_set_data, selected_entries, llm="llama", test_parameters=None):
    """Run one test configuration over the selected entries and save the answers.

    After every entry the results collected so far are rewritten to the
    results file, so that a partial run is not lost.

    Args:
        test_set_filename: name of the test set, recorded in the results file.
        test_set_data: loaded test set; entries are read from its 'data' list.
        selected_entries: indices into test_set_data['data'] to be tested.
        llm: backend to query, either "llama" or "openai".
        test_parameters: keyword arguments forwarded to the query function; it
            must contain 'query_type', which is used in the results file name.
            When omitted, a few-shot run with the task prompt and without
            transcription (and no example entry) is used.

    Returns:
        The name of the JSON results file.

    Raises:
        ValueError: if `llm` is not one of the supported backends.
    """

    # Resolve the backend first: an unknown name used to run nothing and still
    # return the name of a results file that was never written.
    if llm == "llama":
        query_model = query_llama
    elif llm == "openai":
        query_model = query_openai
    else:
        raise ValueError("Unknown llm '{}': expected 'llama' or 'openai'".format(llm))

    if test_parameters is None:
        test_parameters = {'example_entry': None,
                           'query_type': FEW_SHOT_QUERY_TYPE,
                           'add_prompt': True,
                           'ask_transcription': False}

    test_start_time = time.time()

    test_responses = []

    test_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_filename = TEST_RESULTS_FILENAME_FORMAT.format(llm, test_parameters['query_type'], test_timestamp)

    executed_test = {'timestamp': test_timestamp,
                     'set': test_set_filename,
                     # int() because the indices may be numpy integers, which
                     # json cannot serialize.
                     'set_entries': [int(a) for a in selected_entries],
                     'configuration': test_parameters,
                     'answers': None}

    test_entries = [test_set_data['data'][i] for i in selected_entries]

    for test_entry in test_entries:
        test_responses += query_model(test_entry, **test_parameters)

        # Save the results so far just to make sure they are not lost...

        executed_test['answers'] = test_responses

        with open(results_filename, "w") as outputFile:
            json.dump(executed_test, outputFile, indent=4)

    print("Total elapsed time: {}".format(time.time() - test_start_time))

    return results_filename

Load the API access information (LLaMA test API endpoint and OpenAI API key)

In [26]:
# Read the API access information that the query functions look up at call time.
with open(API_ACCESS) as access_file:
    access_info = json.load(access_file)

### Select 5 entries to test

Randomly choose 5 entries, leaving the first story as the few-shot example.

In [19]:
# Draw 5 distinct entry indices; index 0 is excluded from the pool.
entries_to_test = np.random.choice(np.arange(1, len(dev_set['data'])), size=5, replace=False)

Optionally, use a predefined sample set to reproduce a previously used test set

In [27]:
entries_to_test = [219, 352, 272, 132, 82]

Create a reference dataset containing only the tested queries

In [20]:
reference_dataset_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

In [26]:
reference_dataset = {"version": 1.0,
                     "data": [dev_set['data'][i] for i in entries_to_test]}

In [27]:
REFERENCE_DATASET="reference_dataset_{}.json".format(reference_dataset_timestamp)

In [28]:
with open(REFERENCE_DATASET, "w") as outputFile:
    json.dump(reference_dataset, outputFile, indent=4)

In [52]:
test_results_files = []

## Execute the test sequence for OpenAI model

### Execute only the tests using few-shot setup with prompt

In [56]:
# OpenAI run: few-shot, with the task prompt, answers only. The first
# development-set entry is the few-shot example.
test_parameters={'which_model': 'text-davinci-003',
                 'example_entry': dev_set['data'][0],
                 'query_type': FEW_SHOT_QUERY_TYPE,
                 'add_prompt': True,
                 'ask_transcription': False}

# Keep the name of the results file for the evaluation step.
test_results_files.append(execute_test(os.path.basename(COQA_DEV_SET), dev_set, entries_to_test, llm='openai', test_parameters=test_parameters))

--------------------------------------------
QUESTION #0
--------------------------------------------

Read the text and answer the questions:

Example text: Once upon a time, in a barn near a farm house, there lived a little white kitten named Cotton. Cotton lived high up in a nice warm place above the barn where all of the farmer's horses slept. But Cotton wasn't alone in her little home above the barn, oh no. She shared her hay bed with her mommy and 5 other sisters. All of her sisters were cute and fluffy, like Cotton. But she was the only white one in the bunch. The rest of her sisters were all orange with beautiful white tiger stripes like Cotton's mommy. Being different made Cotton quite sad. She often wished she looked like the rest of her family. So one day, when Cotton found a can of the old farmer's orange paint, she used it to paint herself like them. When her mommy and sisters found her they started laughing. 

"What are you doing, Cotton?!" 

"I only wanted to be more lik

## Now execute the test sequence for LLaMA

### Execute the tests using few-shot setup with prompt

In [35]:
# LLaMA run (execute_test defaults to llm="llama"): few-shot, with the task
# prompt, answers only. The first development-set entry is the few-shot example.
test_parameters={'example_entry': dev_set['data'][0],
                 'query_type': FEW_SHOT_QUERY_TYPE,
                 'add_prompt': True,
                 'ask_transcription': False}

# Keep the name of the results file for the evaluation step.
test_results_files.append(execute_test(os.path.basename(COQA_DEV_SET), dev_set, entries_to_test, test_parameters=test_parameters))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Read the text and answer the questions:

Text: It was a cold winter day. A woman drove up to the Rainbow Bridge tollbooth . "I'm paying for myself, and for the six cars behind me," she said with a smile, handing over seven tickets. One after another, the next six drivers arriving at the tollbooth were informed, "Some lady up ahead already paid your fare." 

It turned out that the woman, Natalie Smith, had read something on a friend's refrigerator: "Practice random kindness and senseless acts of beauty." The phrase impressed her so much that she copied it down. 

Judy Foreman spotted the same phrase on a warehouse wall far away from home. When it stayed on her mind for days, she gave up and drove all the way back to copy it down. "I thought it was beautiful," she said, explaining why she'd taken to writing it at the bottom of all her letters, "like a message from above." Her husband, Frank, liked the phrase so much that h

### Now execute using zero-shot setup, no prompt

In [36]:
# LLaMA run: zero-shot, without the task prompt, answers only.
# Zero-shot prompts never read the example entry; it is still passed along and
# therefore recorded in the results file's configuration.
test_parameters={'example_entry': dev_set['data'][0],
                 'query_type': ZERO_SHOT_QUERY_TYPE,
                 'add_prompt': False,
                 'ask_transcription': False}

# Keep the name of the results file for the evaluation step.
test_results_files.append(execute_test(os.path.basename(COQA_DEV_SET), dev_set, entries_to_test, test_parameters=test_parameters))

--------------------------------------------
QUESTION #0
--------------------------------------------

Text: It was a cold winter day. A woman drove up to the Rainbow Bridge tollbooth . "I'm paying for myself, and for the six cars behind me," she said with a smile, handing over seven tickets. One after another, the next six drivers arriving at the tollbooth were informed, "Some lady up ahead already paid your fare." 

It turned out that the woman, Natalie Smith, had read something on a friend's refrigerator: "Practice random kindness and senseless acts of beauty." The phrase impressed her so much that she copied it down. 

Judy Foreman spotted the same phrase on a warehouse wall far away from home. When it stayed on her mind for days, she gave up and drove all the way back to copy it down. "I thought it was beautiful," she said, explaining why she'd taken to writing it at the bottom of all her letters, "like a message from above." Her husband, Frank, liked the phrase so much that he pu

### Now, execute the tests using few-shot setup without prompt

In [None]:
# LLaMA run: few-shot, without the task prompt, answers only. The first
# development-set entry is the few-shot example.
test_parameters={'example_entry': dev_set['data'][0],
                 'query_type': FEW_SHOT_QUERY_TYPE,
                 'add_prompt': False,
                 'ask_transcription': False}

# Keep the name of the results file for the evaluation step.
test_results_files.append(execute_test(os.path.basename(COQA_DEV_SET), dev_set, entries_to_test, test_parameters=test_parameters))

### And finally execute using zero-shot setup, with prompt

In [None]:
# LLaMA run: zero-shot, with the task prompt, answers only.
# Zero-shot prompts never read the example entry; it is still passed along and
# therefore recorded in the results file's configuration.
test_parameters={'example_entry': dev_set['data'][0],
                 'query_type': ZERO_SHOT_QUERY_TYPE,
                 'add_prompt': True,
                 'ask_transcription': False}

# Keep the name of the results file for the evaluation step.
test_results_files.append(execute_test(os.path.basename(COQA_DEV_SET), dev_set, entries_to_test, test_parameters=test_parameters))

### Now execute the evaluation script for the executed tests

Read the created data ― the runtime disconnected before running this final part...

In [57]:
REFERENCE_DATASET="reference_dataset_20230321_113331.json"

In [58]:
test_results_files

['test_openai_few_shot_20230321_192443.json']

In [16]:
test_results_files = ["test_llama_zero_shot_20230321_113512.json", 
                      "test_llama_few_shot_20230321_123831.json", 
                      "test_llama_zero_shot_20230321_133556.json", 
                      "test_llama_few_shot_20230321_142658.json"]

#### Comments on the evaluation script

The evaluation script computes Exact Match and F1 between the predicted answer and the gold standard.

One caveat is that the model is penalized when it produces verbose answers, even if they contain the correct answer.

In [59]:
# Run the evaluation script once per results file.
for test_result in test_results_files:

    print("\n\n\n---------------------------------------------------")
    print("Evaluation results for {}...".format(test_result))
    print("---------------------------------------------------\n")

    with open(test_result) as inputFile:
        test_result_data = json.load(inputFile)

    # Only the 'answers' list is handed to the script, through a temporary
    # predictions file that is overwritten on every iteration.
    with open("tmp_results.json", "w") as outputFile:
        json.dump(test_result_data['answers'], outputFile, indent=4)


    # Score the predictions against the reference dataset built from the tested entries.
    !python evaluate-v1.0.py --data-file {REFERENCE_DATASET} --pred-file tmp_results.json --human




---------------------------------------------------
Evaluation results for test_openai_few_shot_20230321_192443.json...
---------------------------------------------------

{
  "children_stories": {
    "em": 89.3,
    "f1": 95.3,
    "turns": 14
  },
  "literature": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "mid-high_school": {
    "em": 74.4,
    "f1": 87.0,
    "turns": 40
  },
  "news": {
    "em": 78.8,
    "f1": 85.8,
    "turns": 26
  },
  "wikipedia": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "reddit": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "science": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "in_domain": {
    "em": 78.4,
    "f1": 88.0,
    "turns": 80
  },
  "out_domain": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "overall": {
    "em": 78.4,
    "f1": 88.0,
    "turns": 80
  }
}
{
  "children_stories": {
    "em": 7.1,
    "f1": 45.0,
    "turns": 14
  },
  "literature": {
    "em": 0.0,
    "f1": 0.0,
    "tu