In [34]:
# Directory containing the MultiNLI 1.0 data files read by this notebook.
MNLI_DIR = "./data/multinli_1.0"

import aiofiles
import asyncio
import json
import numpy as np
import os
import pandas as pd

from async_openai import OpenAI, ChatResponse
from dotenv import load_dotenv
from pathlib import Path

# Load environment variables via python-dotenv before the API key is looked up below.
load_dotenv()

# Configure the async OpenAI client: key taken from the environment, debug output
# off, and a timeout of 60 (presumably seconds; confirm against the client library).
OpenAI.configure(
    api_key=os.getenv("OPENAI_API_KEY"),
    debug_enabled=False,
    timeout=60,
)

In [35]:
# Read the MultiNLI dev-matched file; lines=True parses it as JSON Lines
# (one record per line).
df_matched = pd.read_json(
    f'{MNLI_DIR}/multinli_1.0_dev_matched.jsonl',
    lines=True,
)

--- Logging error in Loguru Handler #2 ---
Record was: None
Traceback (most recent call last):
  File "/home/erwin/.virtualenvs/thesis/lib/python3.10/site-packages/loguru/_handler.py", line 291, in _queued_writer
    message = queue.get()
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
TypeError: OpenAIError.__init__() missing 2 required positional arguments: 'response' and 'data'
--- End of logging error ---


In [36]:
# Short codes for the rewrite styles; each style is assigned one chunk of the data.
splits = ['pir', 'sha', 'emb']
N_SPLITS = len(splits)


def _split_rows(df, n_parts):
    """Cut ``df`` into ``n_parts`` consecutive row blocks of near-equal size.

    Uses the same boundaries as ``np.array_split``: the first
    ``len(df) % n_parts`` blocks receive one extra row. Slicing with ``iloc``
    avoids passing a DataFrame to ``np.array_split``, which relies on the
    deprecated ``DataFrame.swapaxes``.

    Args:
        df: DataFrame to split row-wise.
        n_parts: Number of blocks to produce.

    Returns:
        A list of ``n_parts`` DataFrames (some possibly empty) that together
        contain every row of ``df`` in the original order.
    """
    base, extra = divmod(len(df), n_parts)
    sizes = [base + 1] * extra + [base] * (n_parts - extra)
    blocks, start = [], 0
    for size in sizes:
        blocks.append(df.iloc[start:start + size])
        start += size
    return blocks


# One list of N_SPLITS row blocks per genre, so that every chunk built below
# contains a near-equal share of each genre.
df_splits_by_genre = [
    _split_rows(df_genre, N_SPLITS) for _, df_genre in df_matched.groupby('genre')
]

# Chunk i is the concatenation of the i-th block of every genre.
df_chunks = [
    pd.concat([genre_blocks[i] for genre_blocks in df_splits_by_genre])
    for i in range(N_SPLITS)
]

In [37]:
# for index, row in df_chunks[0].iterrows():
#     print(row['sentence1'], '###', row['sentence2'])

In [38]:
def generate_prompt(style):
    """Build the instruction message asking for two rewrites in ``style``.

    Args:
        style: Description of the target writing style; interpolated verbatim
            into the instruction text.

    Returns:
        The instruction text followed by a blank line and a two-item numbered
        output template.
    """
    instruction = (
        'The next message will contain a short text fragment. '
        f'Rewrite the fragment using {style}, without changing the meaning of the text. '
        'Return two differing variations of the original text fragment. '
        'Format the output as following:'
    )
    # Plain (non-f) string: the braces here are literal placeholder markers.
    template = '1. {First variation here}\n2. {Second variation here}'
    return instruction + '\n\n' + template

# Instruction message for each split code.
prompts = {
    code: generate_prompt(style)
    for code, style in (
        ('pir', "pirate speak"),
        ('sha', "Shakespearean English"),
        ('emb', "embellished, flowery English"),
    )
}

# Flat list of request records. The i-th split code is applied to the i-th chunk,
# and every row contributes two records: sentence1 first, then sentence2.
completion_data = []
for split, df_chunk in zip(splits, df_chunks):
    for _, row in df_chunk.iterrows():
        for sentence_no, column in ((1, 'sentence1'), (2, 'sentence2')):
            completion_data.append(
                {
                    "split": split,
                    "prompt": prompts[split],
                    "pair_id": row['pairID'],
                    "text": row[column],
                    "sentence_no": sentence_no,
                }
            )

print(completion_data[0]["prompt"])

The next message will contain a short text fragment. Rewrite the fragment using pirate speak, without changing the meaning of the text. Return two differing variations of the original text fragment. Format the output as following:

1. {First variation here}
2. {Second variation here}


In [58]:
# Collect "<pair_id>_sen<n>" keys of requests that already have a result file:
# the first two underscore-separated parts of each JSON file's stem.
p = Path('./results')
filenames = {'_'.join(pa.stem.split('_')[:2]) for pa in p.glob("*.json")}

# Keep only the requests whose key is not among the existing result files.
filtered_completion_data = [
    data_point
    for data_point in completion_data
    if f"{data_point['pair_id']}_sen{data_point['sentence_no']}" not in filenames
]

print(len(filtered_completion_data))

0


In [40]:
# @backoff.on_exception(backoff.expo, openai.error.RateLimitError)
# def completions_with_backoff(**kwargs):
#     return openai.ChatCompletion.create(**kwargs)

# messages = [
#     {"role": "user", "content": f"The next message will contain a list of {N_LINES} text fragments. For each text fragment in the list, rewrite the fragment in pirate speak, whithout changing the meaning of the text. Create two variations for each list entry, and for each entry, format the results like this:\n<This text fragment is variation 1> <This text fragment is variation 2>"},
#     {"role": "user", "content": '\n'.join([f"{row['sentence1']}" for index, row in df_chunks[0].iterrows()][:N_LINES])}
# ]

# messages = []
# for index, row in df_chunks[0].iterrows():
#     messages.append(
#         [
#             {"role": "user", "content": f"The next message will contain a short text fragment. Rewrite the fragment in pirate speak, without changing the meaning of the text. Return two differing variations of the original text fragment."},
#             {"role": "user", "content": row['sentence1']}
#         ]
#     )

# print(messages[0])

In [57]:
async def chat_completion_to_file(split, prompt, pair_id, text, sentence_no):
    """Request one chat completion and save the response as a JSON file.

    Sends ``prompt`` and ``text`` as two consecutive user messages to
    ``gpt-3.5-turbo`` and writes the response to
    ``results/{pair_id}_sen{sentence_no}_{split}.json``.

    Args:
        split: Short style code; only used in the output file name.
        prompt: Instruction message, sent first.
        pair_id: Identifier of the sentence pair; used in the output file name.
        text: Text fragment to rewrite, sent as the second message.
        sentence_no: Which sentence of the pair ``text`` is; used in the file name.

    Raises:
        Any exception raised by the API client, JSON serialisation or the file
        system; nothing is caught here.
    """
    # Create the output directory *before* the API call, so a fresh checkout does
    # not pay for a completion that then cannot be stored.
    os.makedirs('results', exist_ok=True)
    target = f'results/{pair_id}_sen{sentence_no}_{split}.json'
    partial = f'{target}.tmp'

    result = await OpenAI.chat.async_create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt},
            {"role": "user", "content": text}
        ]
    )

    result_dict = result.dict()
    # `created` must support .isoformat() (presumably a datetime; confirm against
    # the client library). Any other non-JSON value is stringified by default=str.
    result_dict['created'] = result_dict['created'].isoformat()
    payload = json.dumps(result_dict, indent=4, default=str)

    # Write under a temporary name and rename afterwards: an interrupted run then
    # never leaves a truncated "*.json" file that looks like a finished result.
    async with aiofiles.open(partial, mode='w') as af:
        await af.write(payload)
    os.replace(partial, target)

sem = asyncio.Semaphore(n := 40) # specify maximum concurrency

async def task_wrapper(args):
    try:
        await chat_completion_to_file(**args)
    finally:
        sem.release()

for args in filtered_completion_data: # may yield too many to list
    await sem.acquire() 
    asyncio.create_task(task_wrapper(args))

# wait for all tasks to complete
for i in range(n):
    await sem.acquire()

[1mINFO    [0m [32m2023-05-30 08:10:41.027[0m: [36mhttpx._client[0m:[36m_send_single_request[0m: [1mHTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
