In [None]:
import json
from textwrap import dedent
from dotenv import load_dotenv
load_dotenv()

from openai import OpenAI
from datasets import load_dataset

In [None]:
# Shared OpenAI API client used by the chat-completion cells below.
# NOTE(review): presumably authenticates from the environment populated by
# load_dotenv() in the imports cell — confirm.
client = OpenAI()

In [None]:
# Load the 'elementary_mathematics' configuration of the 'lukaemon/mmlu' dataset.
mmlu = load_dataset('lukaemon/mmlu', 'elementary_mathematics')
# Keep the 'train' split; its records seed the example prompts built below.
ds = mmlu['train']

In [None]:
# System message for the question-generation task. Each instruction is one
# list entry; the entries are newline-joined and `{subject}` is filled in last.
_system_lines = [
    'You are an helpful assistant who produces synthetic data in the subject of {subject}.',
    'Always reply with question and 4 choices for the answers.',
    'Generate a new question in the similar fashion to what has been shared.',
    'You always output in JSON format.',
]

system_prompt = {
    'role': 'system',
    'content': '\n'.join(_system_lines).format(subject='elementary mathematics'),
}

# Template that renders one multiple-choice question for the model. Fill in
# `question` and `option_a` .. `option_d` with str.format().
example_prompt = (
    'Question: {question}\n'
    '\n'
    'Choices:\n'
    '    A: {option_a}\n'
    '    B: {option_b}\n'
    '    C: {option_c}\n'
    '    D: {option_d}'
)

In [None]:
# Peek at the first record; the prompt cell below reads its 'input', 'A', 'B',
# 'C' and 'D' fields.
ds[0]

In [None]:
def _as_user_message(record):
    """Render one dataset record as a chat 'user' message.

    The record's 'input' field becomes the question and its 'A'..'D' fields
    become the four choices, laid out with `example_prompt`.
    """
    rendered = example_prompt.format(
        question=record['input'],
        option_a=record['A'],
        option_b=record['B'],
        option_c=record['C'],
        option_d=record['D'],
    )
    return {'role': 'user', 'content': rendered}


# Few-shot example input: the first dataset record.
user_prompt = _as_user_message(ds[0])

# Few-shot example output: a hand-written reply in the JSON layout
# ("Question" plus a "Choices" object keyed A-D).
assistant_prompt = {
    'role': 'assistant',
    'content': '''
    {
    "Question": "Olivia used the rule 'Add 11' to create the number pattern shown below. 10, 21, 32, 43, 54. Which statement about the number pattern is true?",
    "Choices": {
        "A": "The 10th number in the pattern will be an even number.",
        "B": "The number pattern will never have two even numbers next to each other.",
        "C": "The next two numbers in the pattern will be an even number then an odd number.",
        "D": "If the number pattern started with an odd number then the pattern would have only odd numbers in it."
    }
    }
    '''
}

# The actual query: the second dataset record.
user_prompt1 = _as_user_message(ds[1])

In [None]:
# Ask the model for one new multiple-choice question and parse the JSON reply
# into `op`.
#
# `op` is reset to None up front so that a failed run cannot leave a stale
# result from an earlier execution for later cells to pick up.
op = None

try:
    completions = client.chat.completions.create(
        model='gpt-3.5-turbo-1106',
        # Ask the API for a JSON object reply.
        response_format={'type': 'json_object'},
        messages=[
            system_prompt,
            # Few-shot example pair, currently disabled:
            # user_prompt,
            # assistant_prompt,
            user_prompt1
        ]
    )

    reply = completions.choices[0]

    # A reply that stopped because of the length limit would be truncated JSON.
    if reply.finish_reason == 'length':
        raise IOError('Reached maximum output length')

    try:
        op = json.loads(reply.message.content)
    except (json.JSONDecodeError, TypeError) as parse_err:
        # Only parsing failures are translated (TypeError covers a missing
        # message body); the original error is chained for debugging.
        raise ValueError('Value returned by the model is not valid JSON') from parse_err

    print(f'Tokens used: {completions.usage}')

except Exception as err:
    # Best effort: report the failure and keep the notebook running.
    print(f'Error raised in accessing the API: {err}')

In [None]:
# Pretty-print the parsed reply (`op`) from the previous cell as indented JSON.
print(json.dumps(op, indent=4))

In [None]:
# TODO: parse `op` into a pydantic model (so that its structure can be validated)
# and append it to a list of responses.

### Hindi articles generation

In [None]:
# Prompts for the Hindi story-generation experiment.
# NOTE: these reuse the names `system_prompt` / `user_prompt` from the question
# generation section above, replacing those values from this cell onward.
_hindi_system_lines = (
    'You are an helpful assistant who produces synthetic data in colloquial Devnagri Hindi.',
    'The data should be relevant to India.',
)

system_prompt = dict(role='system', content='\n'.join(_hindi_system_lines))

user_prompt = dict(
    role='user',
    content='Generate a fictional story set in a school classroom (200-600 words)',
)

In [None]:
# Generate one Hindi story and keep the reply text in `story`.
#
# The request does not set a JSON response_format and the prompt asks for a
# prose story, so the reply is plain text and is NOT passed through
# json.loads (doing so would fail on every successful reply).
#
# `story` is reset to None first so a failed run cannot leave a stale value.
story = None

try:
    completions = client.chat.completions.create(
        model='gpt-4-1106-preview',
        messages=[
            system_prompt,
            user_prompt,
        ]
    )

    reply = completions.choices[0]

    # A reply that stopped because of the length limit is an incomplete story.
    if reply.finish_reason == 'length':
        raise IOError('Reached maximum output length')

    story = reply.message.content

    print(f'Tokens used: {completions.usage}')

except Exception as err:
    # Best effort: report the failure and keep the notebook running.
    print(f'Error raised in accessing the API: {err}')

In [None]:
# Show the usage figures attached to the most recent `completions` response.
print(completions.usage)

In [None]:
# Show the text of the first choice in the most recent `completions` response.
print(completions.choices[0].message.content)

Observed timing: ~1800 tokens took about 90 seconds (roughly 20 tokens per second).