In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to them take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys
import json
from time import sleep
import nltk
import numpy as np
import argparse
from langchain.llms import OpenAI
import baseline_utils
from dotenv import load_dotenv
from types import SimpleNamespace
import asyncio

In [4]:
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, validator
from typing import List

In [5]:
# Prompt template for the '0cot_gsm' task, created via baseline_utils.
# NOTE(review): `pt` is not referenced by any later cell shown here;
# gsm_baseline() creates its own template from args.task.
pt = baseline_utils.create_prompt_template('0cot_gsm')


In [6]:
# Structured answer schema handed to PydanticOutputParser below.
# Comments are used instead of a class docstring on purpose: this class is the
# parser's pydantic_object, and the Field descriptions are the text meant to
# describe each field in the generated format instructions.
class GSM_Answer(BaseModel):
    # Free-text reasoning that leads to the answer.
    work: str = Field(description="Explanation of answer")
    # Final answer, kept as a string.
    final_answer: str = Field(description="Final numeric answer")
    

In [7]:
# Output parser bound to the GSM_Answer schema; the answer-generation functions
# call parser.get_format_instructions() to fill the prompt's
# `format_instructions` slot.
parser = PydanticOutputParser(pydantic_object=GSM_Answer)

In [8]:
# Load variables from a local .env file into the process environment;
# gsm_baseline() later reads OPEN_AI_API_KEY through os.getenv.
load_dotenv()

True

In [9]:
# Experiment configuration, exposed as attributes (args.model, args.save_dir, ...).
args = SimpleNamespace(
    data_dir='../../data/gsm_data',
    save_dir='models',
    debug=False,
    exp_label='default',
    task='0cot_gsm',
    model='gpt-3.5-turbo',
    max_tokens=2048,
    temperature=0.0,
)
# Checkpoint/log directory is derived from the save directory and experiment label.
args.ckpt_path = os.path.join(args.save_dir, args.exp_label)

In [10]:
# Logger writing to <ckpt_path>/log.txt (Logger is defined in baseline_utils).
logger = baseline_utils.Logger(os.path.join(args.ckpt_path, 'log.txt'))
# Module-level counter of answered problems; incremented by the
# generate-answer functions and reset to 0 at the start of gsm_run().
completed_rounds = 0

In [11]:
async def async_generate_answer(llm, prompt_template, problem):
    '''Asynchronously query the LLM for one problem and store its raw answer.

    The prompt is built from ``problem['context']`` and the module-level
    ``parser``'s format instructions. Any exception raised by
    ``llm.agenerate`` is logged and the call is retried after 30 seconds,
    indefinitely.

    Args:
        llm: object exposing an awaitable ``agenerate(list_of_prompts)``.
        prompt_template: object whose ``format(context=..., format_instructions=...)``
            returns the prompt string.
        problem: dict with a ``'context'`` key; mutated in place.

    Side effects:
        Sets ``problem['output']`` to the text of the first generation,
        increments the global ``completed_rounds`` counter and prints progress.
    '''
    global completed_rounds
    inp = prompt_template.format(
        context=problem['context'],
        format_instructions=parser.get_format_instructions(),
    )
    while True:
        try:
            output = await llm.agenerate([inp])
            print(output)
            break
        except Exception as e:
            # Stringify so the logger receives text rather than an exception object.
            logger.write(str(e))
            logger.write('API server overloaded. Waiting for 30 seconds...')
            # Must be asyncio.sleep: a blocking time.sleep() here would freeze the
            # event loop and stall every other request running under asyncio.gather.
            await asyncio.sleep(30)
    problem['output'] = output.generations[0][0].text
    completed_rounds += 1
    print(f"Completed {completed_rounds} rounds")


In [12]:
async def async_generate_answers(llm, prompt_template, problems):
    '''Answer every problem in ``problems`` concurrently.

    One ``async_generate_answer`` coroutine is created per problem and all of
    them are awaited together; results are written into the problem dicts by
    those coroutines, so nothing is returned.
    '''
    pending = []
    for prob in problems:
        pending.append(async_generate_answer(llm, prompt_template, prob))
    await asyncio.gather(*pending)


In [None]:
def generate_answer(llm, prompt_template, problem):
    '''Synchronously query the LLM for one problem and store its raw answer.

    Builds the prompt from ``problem['context']`` and the module-level
    ``parser``'s format instructions, then calls ``llm.generate`` until it
    succeeds; every failure is logged and followed by a 30 second pause.
    On success ``problem['output']`` receives the text of the first
    generation and the global ``completed_rounds`` counter is incremented.
    '''
    global completed_rounds
    context = problem['context']
    instructions = parser.get_format_instructions()
    inp = prompt_template.format(context=context, format_instructions=instructions)
    while True:
        try:
            output = llm.generate([inp])
            print(output)
        except Exception as e:
            logger.write(e)
            logger.write(f'API server overloaded. Waiting for 30 seconds...')
            sleep(30)
        else:
            # Only leave the retry loop once a call completed without raising.
            break
    first_generation = output.generations[0][0]
    problem['output'] = first_generation.text
    completed_rounds = completed_rounds + 1

In [None]:
def generate_answers(llm, prompt_template, problems):
    '''Answer every problem in ``problems`` one after another (synchronous).

    Each problem dict is passed, together with ``llm`` and ``prompt_template``,
    to ``generate_answer``, which stores the result in the dict in place.
    '''
    for prob in problems:
        # generate_answer takes (llm, prompt_template, problem); passing only
        # the problem raised TypeError on the first iteration.
        generate_answer(llm, prompt_template, prob)

In [13]:
async def gsm_run(prompt_template, llm, data, batch_size=10):
    '''Run the LLM over a dataset in concurrent batches.

    Args:
        prompt_template: template forwarded to ``async_generate_answers``.
        llm: language model forwarded to ``async_generate_answers``.
        data: iterable of records with ``'input'`` and ``'target'`` keys.
        batch_size: number of problems sent concurrently per batch; must be
            positive. Defaults to 10.

    Returns:
        A list of problem dicts (``'context'``, ``'target'``, plus whatever
        ``async_generate_answers`` adds), in the same order as ``data``.

    Side effects:
        Resets the global ``completed_rounds`` counter to 0 and prints a
        progress line after each batch.
    '''
    global completed_rounds
    completed_rounds = 0
    problems = [{'context': d['input'], 'target': d['target']} for d in data]
    total = len(problems)
    for start in range(0, total, batch_size):
        # Slicing already clamps at the end of the list.
        await async_generate_answers(llm, prompt_template, problems[start:start + batch_size])
        # Report the real number processed; the last batch may be partial.
        done = min(start + batch_size, total)
        print (f"Completed {done} problems")
    return problems

In [14]:
def calc_accuracy(problems):
    '''Return the fraction of problems whose ``'correct'`` entry is truthy.

    A problem without a ``'correct'`` key counts as incorrect, and an empty
    list yields 0.0 instead of raising ZeroDivisionError.
    '''
    if not problems:
        return 0.0
    n_correct = sum(1 for p in problems if p.get('correct', False))
    return n_correct / len(problems)

In [15]:
async def gsm_baseline():
    '''Run the baseline over each configured dataset variant and save results.

    For every (index, variant) pair the function:
      1. skips the variant if an output file (or its ``hand_``-prefixed
         counterpart) already exists in ``args.save_dir``;
      2. loads the dataset and queries the LLM via ``gsm_run``;
      3. writes the raw outputs to JSON, then parses and grades each answer
         and rewrites the same file with ``'final_answer'`` and ``'correct'``;
      4. logs the accuracy.

    Reads configuration from the module-level ``args`` and the API key from
    the ``OPEN_AI_API_KEY`` environment variable.
    '''
    prompt_template = baseline_utils.create_prompt_template(args.task)
    llm = OpenAI(
        model_name=args.model,
        max_tokens=args.max_tokens,
        # NOTE(review): '\\n\\n' is a literal backslash-n pair, not two newline
        # characters. Left unchanged because altering the stop sequences would
        # change the generations -- confirm which was intended.
        stop=['\\n\\n', 'A:', 'Q:'],
        temperature=args.temperature,
        openai_api_key=os.getenv('OPEN_AI_API_KEY'),
    )
    # Make sure results can be written once the API calls have finished.
    os.makedirs(args.save_dir, exist_ok=True)
    for i in range(0, 1):
        for variant in ['irc']:
            name = f'gsmic_mixed_{i}_{variant}'
            output_path = os.path.join(args.save_dir, f'{name}_output_{args.model}.json')
            hand_path = os.path.join(args.save_dir, f'hand_{name}_output_{args.model}.json')
            # Check for existing results before loading the dataset.
            if os.path.exists(output_path) or os.path.exists(hand_path):
                continue
            data = baseline_utils.load_gsm_data(os.path.join(args.data_dir, f'{name}.jsonl'))
            problems = await gsm_run(prompt_template, llm, data)
            # First write: raw model outputs are on disk before parse_answer runs.
            with open(output_path, 'w') as f:
                f.write(json.dumps(problems) + '\n')
            for p in problems:
                print(p)
                p['final_answer'] = baseline_utils.parse_answer(p['output'])
                p['correct'] = p['final_answer'] == p['target']
            # Second write: same file, now including parsed answers and grades.
            with open(output_path, 'w') as f:
                f.write(json.dumps(problems) + '\n')
            logger.write(f'Accuracy for gsmic_mixed_{i}_{variant} = {calc_accuracy(problems)}')

In [16]:
# Run the baseline. Top-level `await` is only valid here because the cell is
# executed by IPython/Jupyter; in a plain script this would need asyncio.run().
await gsm_baseline()

Retrying langchain.llms.openai.acompletion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-UL2nM4hjGA7CzFPYucOtFrPC on tokens per min. Limit: 90000 / min. Current: 88056 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.llms.openai.acompletion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-UL2nM4hjGA7CzFPYucOtFrPC on tokens per min. Limit: 90000 / min. Current: 88055 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.llms.openai.acompletion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-UL2nM4hjGA7CzFPYucOtFrPC on tokens per min. Limit: 90000 / min. Curre

generations=[[Generation(text='{"work": "Michelle paid $2 for the ride fee and $2.5 per mile for 4 miles, so her total cost was $12.", "final_answer": "$12"}', generation_info=None)]] llm_output={'token_usage': <OpenAIObject at 0x173710530> JSON: {
  "completion_tokens": 39,
  "prompt_tokens": 269,
  "total_tokens": 308
}, 'model_name': 'gpt-3.5-turbo'}
Completed 1 rounds
generations=[[Generation(text='The TV show was aired for 1.5 hours.\nThe commercials lasted a total of 10 minutes x 3 = 30 minutes.\nIn hours, the commercials lasted 30 minutes / 60 = 0.5 hours.\nTherefore, the TV show itself, not counting commercials, lasted 1.5 hours - 0.5 hours = 1 hour.\nThe final answer is 1 hour.', generation_info=None)]] llm_output={'token_usage': <OpenAIObject at 0x1737130b0> JSON: {
  "completion_tokens": 84,
  "prompt_tokens": 264,
  "total_tokens": 348
}, 'model_name': 'gpt-3.5-turbo'}
Completed 2 rounds
generations=[[Generation(text='{"work": "Helga tried on 7 pairs of shoes at the first s

Retrying langchain.llms.openai.acompletion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-UL2nM4hjGA7CzFPYucOtFrPC on tokens per min. Limit: 90000 / min. Current: 88556 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.llms.openai.acompletion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-UL2nM4hjGA7CzFPYucOtFrPC on tokens per min. Limit: 90000 / min. Current: 88550 / min. Contact us through our help center at help.openai.com if you continue to have issues..


generations=[[Generation(text='First, we need to determine how many scoops of ice cream Oli has. We know that he has 4 scoops.\n\nNext, we need to determine how many scoops of ice cream Victoria has. We know that she has twice as many scoops as Oli. Therefore, Victoria has 8 scoops of ice cream.\n\nNow, we can calculate the difference between the number of scoops that Victoria has and the number of scoops that Oli has. Victoria has 4 more scoops of ice cream than Oli.\n\nTherefore, the final answer is 4. \n\n{\n  "work": "Oli has 4 scoops of ice cream and Victoria has twice as many scoops as Oli, which is 8 scoops. Therefore, the difference between the number of scoops that Victoria has and the number of scoops that Oli has is 4.",\n  "final_answer": "4"\n}', generation_info=None)]] llm_output={'token_usage': <OpenAIObject at 0x17341aed0> JSON: {
  "completion_tokens": 191,
  "prompt_tokens": 257,
  "total_tokens": 448
}, 'model_name': 'gpt-3.5-turbo'}
Completed 13 rounds
generations=[

Retrying langchain.llms.openai.acompletion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-UL2nM4hjGA7CzFPYucOtFrPC on tokens per min. Limit: 90000 / min. Current: 88525 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.llms.openai.acompletion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-UL2nM4hjGA7CzFPYucOtFrPC on tokens per min. Limit: 90000 / min. Current: 88525 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.llms.openai.acompletion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-UL2nM4hjGA7CzFPYucOtFrPC on tokens per min. Limit: 90000 / min. Curre

generations=[[Generation(text='First, we need to find out how many oranges Sandra has. We know that Sandra has 3 times as many oranges as Betty, who has 12 oranges. So Sandra has 3 * 12 = 36 oranges.\n\nNext, we need to find out how many oranges Emily has. We know that Emily has 7 times as many oranges as Sandra, who has 36 oranges. So Emily has 7 * 36 = 252 oranges.\n\nFinally, we need to add the 5 pineapples that Emily\'s mother gave her to the total number of oranges Emily has. So Emily has 252 + 5 = 257 pieces of fruit in total.\n\n{"work": "Sandra has 36 oranges (3 times Betty\'s 12 oranges). Emily has 7 times as many oranges as Sandra, so she has 7 * 36 = 252 oranges. Emily\'s mother gave her 5 pineapples, so she has 252 + 5 = 257 pieces of fruit in total.", "final_answer": "257"}', generation_info=None)]] llm_output={'token_usage': <OpenAIObject at 0x17365cbf0> JSON: {
  "completion_tokens": 214,
  "prompt_tokens": 248,
  "total_tokens": 462
}, 'model_name': 'gpt-3.5-turbo'}
Com

Retrying langchain.llms.openai.acompletion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-UL2nM4hjGA7CzFPYucOtFrPC on tokens per min. Limit: 90000 / min. Current: 89078 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.llms.openai.acompletion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-UL2nM4hjGA7CzFPYucOtFrPC on tokens per min. Limit: 90000 / min. Current: 89073 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.llms.openai.acompletion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-UL2nM4hjGA7CzFPYucOtFrPC on tokens per min. Limit: 90000 / min. Curre

generations=[[Generation(text='{"work": "Anna used 4 baking trays, each with 20 cupcakes, for a total of 80 cupcakes. She sold 3/5 of them, which is 48 cupcakes. Her brother\'s oven can hold 10000 trays, so she could potentially make 800000 cupcakes. However, she only made 48, so she earned $96 from selling them.", "final_answer": "96"}', generation_info=None)]] llm_output={'token_usage': <OpenAIObject at 0x173712270> JSON: {
  "completion_tokens": 85,
  "prompt_tokens": 267,
  "total_tokens": 352
}, 'model_name': 'gpt-3.5-turbo'}
Completed 23 rounds
generations=[[Generation(text='First, Kelly harvested a total of 55 + 101 + 78 = 234 carrots.\nNext, we need to convert the number of carrots to pounds by dividing by 6: 234 / 6 = 39 pounds.\nTherefore, Kelly harvested 39 pounds of carrots.\n{\n  "work": "Kelly harvested a total of 55 + 101 + 78 = 234 carrots. Next, we need to convert the number of carrots to pounds by dividing by 6: 234 / 6 = 39 pounds.",\n  "final_answer": "39"\n}', gene

Retrying langchain.llms.openai.acompletion_with_retry.<locals>._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-UL2nM4hjGA7CzFPYucOtFrPC on tokens per min. Limit: 90000 / min. Current: 89601 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.llms.openai.acompletion_with_retry.<locals>._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-UL2nM4hjGA7CzFPYucOtFrPC on tokens per min. Limit: 90000 / min. Current: 89603 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.llms.openai.acompletion_with_retry.<locals>._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-UL2nM4hjGA7CzFPYucOtFrPC on tokens per min. Limit: 90000 / min. Curre

generations=[[Generation(text='{\n  "work": "Rose needs to add up the cost of the paintbrush, paints, and easel.",\n  "final_answer": "Rose needs $11.00 more."\n}', generation_info=None)]] llm_output={'token_usage': <OpenAIObject at 0x17371e1b0> JSON: {
  "completion_tokens": 39,
  "prompt_tokens": 274,
  "total_tokens": 313
}, 'model_name': 'gpt-3.5-turbo'}
Completed 25 rounds
generations=[[Generation(text='Abie had 20 bags of chips.\nAbie gave 4 bags to her friend.\nAbie bought 6 more bags of chips.\nAbie\'s mother has 10 packets of candies.\nTo find out how many bags of chips Abie has in the end, we need to subtract the bags she gave away and add the bags she bought.\n20 - 4 + 6 = 22\nAbie has 22 bags of chips in the end.\n\n{"work": "20 - 4 + 6 = 22", "final_answer": "22"}', generation_info=None)]] llm_output={'token_usage': <OpenAIObject at 0x17371e1b0> JSON: {
  "completion_tokens": 116,
  "prompt_tokens": 254,
  "total_tokens": 370
}, 'model_name': 'gpt-3.5-turbo'}
Completed 26 

Retrying langchain.llms.openai.acompletion_with_retry.<locals>._completion_with_retry in 10.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-UL2nM4hjGA7CzFPYucOtFrPC on tokens per min. Limit: 90000 / min. Current: 88670 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.llms.openai.acompletion_with_retry.<locals>._completion_with_retry in 10.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-UL2nM4hjGA7CzFPYucOtFrPC on tokens per min. Limit: 90000 / min. Current: 88667 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.llms.openai.acompletion_with_retry.<locals>._completion_with_retry in 10.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-UL2nM4hjGA7CzFPYucOtFrPC on tokens per min. Limit: 90000 / min. Cu

generations=[[Generation(text='{"work": "Jennifer has a total of 1024 sweets (212 + 310 + 502). She wants to share them equally between herself and her 3 friends, which makes a total of 4 people. To find out how many sweets each person will get, we need to divide the total number of sweets by the number of people: 1024 ÷ 4 = 256. Therefore, Jennifer and her friends will each get 256 sweets.", "final_answer": "256"}', generation_info=None)]] llm_output={'token_usage': <OpenAIObject at 0x173539f10> JSON: {
  "completion_tokens": 102,
  "prompt_tokens": 255,
  "total_tokens": 357
}, 'model_name': 'gpt-3.5-turbo'}
Completed 29 rounds
generations=[[Generation(text='First, we need to calculate the total number of bicycle wheels in the garage. Since there are 4 bicycles and each bicycle has 2 wheels, there are a total of 8 wheels in the garage.\n\nNext, we need to calculate the total number of spokes. Since each wheel has 10 spokes, the total number of spokes in the garage is 8 * 10 = 80 spok

In [None]:
def grade(filepath):
    '''Re-grade a saved results file in place and print its accuracy.

    The file at ``filepath`` must contain a JSON list of problem dicts with a
    ``'target'`` key. A problem whose ``'final_answer'`` equals its
    ``'target'`` is marked ``'correct': True``. Any other problem keeps an
    existing ``'correct'`` value and otherwise gets ``False``, so every entry
    ends up with the key that ``calc_accuracy`` reads.

    Args:
        filepath: path of the JSON results file; it is overwritten with the
            graded list.
    '''
    with open(filepath, 'r') as f:
        problems = json.load(f)
    for p in problems:
        if 'final_answer' in p and p['target'] == p['final_answer']:
            p['correct'] = True
        else:
            # Previously the key stayed absent here, so calc_accuracy raised
            # KeyError for problems that had never been graded.
            p.setdefault('correct', False)
    with open(filepath, 'w') as f:
        f.write(json.dumps(problems) + '\n')
    print(calc_accuracy(problems))


In [None]:
# Re-grade a previously saved results file.
# NOTE(review): this targets the 'original' variant, while gsm_baseline() in
# this notebook only writes the 'irc' variant -- presumably the file comes
# from an earlier run; confirm it exists before executing.
grade(os.path.join(args.save_dir, f'gsmic_mixed_0_original_output_{args.model}.json'))