In [None]:
import json
from textwrap import dedent
from dotenv import load_dotenv
load_dotenv()

from openai import OpenAI
from datasets import load_dataset

In [None]:
# Shared OpenAI API client used by every request cell in this notebook.
# NOTE(review): no credentials are passed explicitly — presumably they come from
# the environment populated by load_dotenv() in the imports cell; confirm.
client = OpenAI()

In [None]:
# Load the 'elementary_mathematics' configuration of the 'lukaemon/mmlu' dataset.
mmlu = load_dataset('lukaemon/mmlu', 'elementary_mathematics')
# Keep only the 'train' split; its records serve as the example questions below.
ds = mmlu['train']

In [None]:
# System message for question generation. The instruction lines are joined
# with newlines and the {subject} placeholder is filled in immediately.
system_prompt = dict(
    role='system',
    content='\n'.join([
        'You are an helpful assistant who produces synthetic data in the subject of {subject} in devnagri Hindi.',
        'Generate a new question different from what is shared but in the similar domain/.',
        'Always reply with a new question, 4 choices for the answers and the correct answer.',
        'You always output in JSON format.',
    ]).format(subject='elementary mathematics'),
)

# Template used to render one dataset record as a worked example.
# Placeholders: question, option_a .. option_d, target.
example_prompt = '\n'.join([
    'Question: {question}',
    '',
    'Choices:',
    '    A: {option_a}',
    '    B: {option_b}',
    '    C: {option_c}',
    '    D: {option_d}',
    '',
    'Target: {target}',
])

In [None]:
# Peek at a single record to see which fields the dataset provides.
# NOTE(review): this previews index 1, whereas the prompt-building cell that
# follows reads ds[2] — confirm which example is meant to be inspected.
ds[1]

In [None]:
# User message: render dataset record 2 through the example template.
# The four option_* fields are filled from the record's 'A'..'D' columns.
user_prompt = dict(
    role='user',
    content=example_prompt.format(
        question=ds[2]['input'],
        target=ds[2]['target'],
        **{f'option_{letter.lower()}': ds[2][letter] for letter in 'ABCD'},
    ),
)

In [None]:
# Show the exact message list that will be sent to the model, for a visual check.
print([system_prompt, user_prompt])

In [None]:
# Request one new synthetic question from the model and parse the JSON reply
# into `op`.
#
# `op` is reset up front so that a failed request cannot leave a result from
# an earlier run of this cell visible to the cells that read `op` afterwards.
op = None

try:
    completions = client.chat.completions.create(
        model='gpt-4-1106-preview',
        # Ask the API for a JSON object so the reply can be parsed below.
        response_format={'type': 'json_object'},
        messages=[
            system_prompt,
            user_prompt,
        ]
    )

    # A 'length' finish reason means the reply was cut off, so any JSON in it
    # would be incomplete; treat that as a failure instead of parsing it.
    if completions.choices[0].finish_reason == 'length':
        raise IOError('Reached maximum output length')

    try:
        op = json.loads(completions.choices[0].message.content)
    except (json.JSONDecodeError, TypeError) as parse_err:
        # JSONDecodeError: malformed JSON text; TypeError: content was None.
        # Only these are caught so that KeyboardInterrupt/SystemExit still
        # propagate, and the original cause is kept via exception chaining.
        raise ValueError('Value returned by the model is not valid JSON') from parse_err

    print(f'Tokens used: {completions.usage}')

except Exception as err:
    # Best-effort cell: report the failure and keep the notebook running.
    print(f'Error raised in accessing the API: {err}')

In [None]:
# Back-of-the-envelope estimate of the price and wall-clock time needed to
# generate the whole dataset, one request per question.

# Number of questions to generate.
total_ques = 1000

# Per-request assumptions: latency (presumably seconds) and token counts.
time_to_response = 40
input_tokens, output_tokens = 200, 600

# Throughput implied by the assumptions above, in tokens per unit of latency.
tok_ps = (input_tokens + output_tokens) / time_to_response

# Price of a single token; the rates are quoted per 1000 tokens.
input_price, output_price = 0.01 / 1000, 0.03 / 1000

# Cost of one request scaled up to the full run.
total_price = total_ques * (input_tokens * input_price + output_tokens * output_price)
# Requests are assumed to run sequentially, so the latencies simply add up.
total_time = time_to_response * total_ques

# Prints the total price (2 decimals) and total_time / 3600 rounded to an integer.
print(round(total_price, 2), round(total_time / 3600))

In [None]:
# Display the parsed JSON reply produced by the question-generation request.
op

In [None]:
# TODO: parse `op` into a pydantic model (so that the response can be validated)
# and append it to a list of responses

### Hindi articles generation

In [None]:
# System message for story generation (replaces the question-generation one).
system_prompt = dict(
    role='system',
    content=(
        'You are an helpful assistant who produces synthetic data in colloquial Devnagri Hindi.\n'
        'The data should be relevant to India.'
    ),
)

# User message: the actual story request.
user_prompt = dict(
    role='user',
    content='Generate a fictional story set in a school classroom (200-600 words)',
)

In [None]:
# Request one free-form Hindi story from the model.
#
# This call does not ask for JSON output and the prompt requests a prose
# story, so the reply is left as plain text. Parsing it with json.loads would
# reject every story and report a misleading error, so no parsing is done
# here; the text is read from `completions` in the cells that follow.
try:
    completions = client.chat.completions.create(
        model='gpt-4-1106-preview',
        messages=[
            system_prompt,
            user_prompt,
        ]
    )

    # A 'length' finish reason means the story was cut off before it ended.
    if completions.choices[0].finish_reason == 'length':
        raise IOError('Reached maximum output length')

    print(f'Tokens used: {completions.usage}')

except Exception as err:
    # Best-effort cell: report the failure and keep the notebook running.
    print(f'Error raised in accessing the API: {err}')

In [None]:
# Token usage reported for the most recent completion request.
print(completions.usage)

In [None]:
# Text of the first (and only requested) choice from the most recent completion.
print(completions.choices[0].message.content)

Observed timing: ~1800 tokens ≈ 90 seconds (about 20 tokens per second).