In [None]:
import json
from textwrap import dedent
from dotenv import load_dotenv
load_dotenv()

from openai import OpenAI
from datasets import load_dataset

In [None]:
# Shared OpenAI API client used by the exploratory cells in this notebook.
# NOTE(review): no key is passed explicitly; presumably it is read from the
# environment after load_dotenv() ran in the import cell — confirm.
client = OpenAI()

In [None]:
# Fetch the 'elementary_mathematics' configuration of 'lukaemon/mmlu' and keep
# its train split as the pool of example questions.
mmlu = load_dataset(path='lukaemon/mmlu', name='elementary_mathematics')
ds = mmlu['train']

In [None]:
# System message: fixes the assistant's role, the target subject and the
# requirement to answer in JSON. The lines are joined with newlines and the
# {subject} placeholder is filled in afterwards.
system_prompt = dict(
    role='system',
    content='\n'.join((
        'You are an helpful assistant who produces synthetic data in devnagri Hindi.',
        'Generate a new question for 8th grade {subject} exam.',
        'Always reply with a new question, 4 choices for the answers and the correct answer.',
        'You always output in JSON format.',
    )).format(subject='elementary mathematics'),
)

# Template for rendering one multiple-choice example; the placeholders are
# question, option_a..option_d and target.
example_prompt = '\n'.join((
    'Question: {question}',
    '',
    'Choices:',
    '    A: {option_a}',
    '    B: {option_b}',
    '    C: {option_c}',
    '    D: {option_d}',
    '',
    'Target: {target}',
))

In [None]:
# Display the record at index 1 of `ds` as the cell output.
ds[1]

In [None]:
# Render the record at index 2 of `ds` through the example template and wrap
# it as a user message. The record is fetched once and reused for every field.
_record = ds[2]
user_prompt = dict(
    role='user',
    content=example_prompt.format(
        question=_record['input'],
        target=_record['target'],
        # Maps columns 'A'..'D' onto the option_a..option_d placeholders.
        **{f'option_{letter.lower()}': _record[letter] for letter in 'ABCD'},
    ),
)

In [None]:
# Print the system message text to check how the prompt reads.
print(system_prompt['content'])

In [None]:
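# Sketch of the JSON object each generated question should follow.
# These are placeholder notes, not executable code.
# {
#     "question": <>,
#     "A": <>,
#
#     "TARGET": <>
# }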

- dataset save

In [None]:
# Print the user message text to check how the prompt reads.
print(user_prompt['content'])

In [None]:
# User message holding only the key layout the reply should follow; the `<>`
# markers are placeholders for the values.
user_prompt = dict(
    role='user',
    content="{'QUESTION': <>, 'A': <>, 'B': <>, 'C': <>, 'D': <>, 'TARGET': <>}",
)

In [None]:
# Print the two messages together, in the order they are sent to the model.
print([system_prompt, user_prompt])

In [None]:
# Ask the model for one synthetic question and parse its JSON reply into `op`.
op = None  # reset first so a failed run never leaves a stale or undefined `op` behind
try:
    completions = client.chat.completions.create(
        model='gpt-4-1106-preview',
        response_format={'type': 'json_object'},
        messages=[
            system_prompt,
            user_prompt
        ],
        temperature=1.5
    )

    # A 'length' finish reason is treated as an error: the reply hit the output
    # limit, so the JSON may be incomplete.
    if completions.choices[0].finish_reason == 'length':
        raise IOError('Reached maximum output length')

    try:
        op = json.loads(completions.choices[0].message.content)
    except (TypeError, ValueError) as err:
        # json.JSONDecodeError is a ValueError; TypeError covers a missing (None) body.
        # Catching only these lets KeyboardInterrupt/SystemExit propagate.
        raise ValueError('Value returned by the model is not valid JSON') from err

    print(f'Tokens used: {completions.usage}')

except Exception as err:
    # Best-effort notebook cell: report the failure (API error, truncation or
    # invalid JSON) and carry on with `op` left as None.
    print(f'Error raised while generating the datapoint: {err}')

In [None]:
# Print the parsed reply stored in `op`.
print(op)

In [None]:
# Back-of-the-envelope estimate of price and wall-clock time for a full run.

# Workload assumptions for one generated question.
# NOTE(review): time_to_response is presumably in seconds, since the total is
# divided by 3600 before printing — confirm.
total_ques = 1000
input_tokens, output_tokens = 200, 600
time_to_response = 40

# Combined token throughput of a single request (not used further in this cell).
tok_ps = (input_tokens + output_tokens) / time_to_response

# Per-token prices, derived from per-1000-token figures.
input_price = 0.01 / 1000
output_price = 0.03 / 1000

# Totals over every question.
total_price = (input_price * input_tokens + output_price * output_tokens) * total_ques
total_time = total_ques * time_to_response

print(round(total_price, 2), round(total_time / 3600))

In [None]:
# Echo `op` as the cell's output.
op

In [None]:
# parse into pydantic class (so that we can verify)
# and add to responses list

### Hindi articles generation

In [None]:
# System message for story generation: colloquial Hindi, India-relevant content.
system_prompt = dict(
    role='system',
    content='\n'.join((
        'You are an helpful assistant who produces synthetic data in colloquial Devnagri Hindi.',
        'The data should be relevant to India.',
    )),
)

# User message carrying the actual writing task.
user_prompt = dict(
    role='user',
    content='Generate a fictional story set in a school classroom (200-600 words)',
)

In [None]:
# Request one free-text story. No `response_format` is set on this call and
# the prompt asks for prose, so the reply is kept as text in `story` rather
# than being passed through json.loads (which rejected every prose reply).
story = None  # remains None when the request fails or the reply is truncated
try:
    completions = client.chat.completions.create(
        model='gpt-4-1106-preview',
        messages=[
            system_prompt,
            user_prompt,
        ]
    )

    # A 'length' finish reason is treated as an error: the reply hit the output limit.
    if completions.choices[0].finish_reason == 'length':
        raise IOError('Reached maximum output length')

    story = completions.choices[0].message.content

    print(f'Tokens used: {completions.usage}')

except Exception as err:
    # Best-effort notebook cell: report the failure and leave `story` as None.
    print(f'Error raised while generating the story: {err}')

In [None]:
# Print the token usage reported on the most recent `completions` object.
print(completions.usage)

In [None]:
# Print the text of the first choice of the most recent `completions` object.
print(completions.choices[0].message.content)

~1800 tokens = 90 seconds

In [None]:
import json
import openai
from typing import Tuple
from configs.synthetic_dataset import GenerationConfiguration, synthetic_dataset_models

In [None]:
class GPTGenerator():
    """Callable wrapper around the OpenAI chat-completions API that returns parsed JSON.

    Attributes:
        client: `openai.OpenAI` client created in `__init__`.
        model_id: Model name forwarded as `model=` on every request.
    """

    def __init__(self, model_id) -> None:
        """Create the API client and remember which model to query.

        Args:
            model_id: Model name sent with each completion request.
        """
        # load_dotenv() runs before the client is constructed.
        # NOTE(review): presumably this makes the API key from a .env file
        # visible to openai.OpenAI() — confirm.
        from dotenv import load_dotenv
        load_dotenv()

        self.client = openai.OpenAI()
        self.model_id = model_id

    def __call__(self, system_prompt: dict, user_prompt: dict = None, temperature: float = 1.4) -> Tuple:
        """Send the prompt(s) to the model and return the decoded JSON reply.

        Args:
            system_prompt: Chat message placed first in the conversation, e.g.
                ``{'role': 'system', 'content': ...}``.
            user_prompt: Optional second chat message; skipped when None or empty.
            temperature: Sampling temperature forwarded to the API.

        Returns:
            A ``(parsed_reply, usage)`` tuple: the JSON-decoded content of the
            first choice and the `usage` attribute of the API response.

        Raises:
            IOError: The first choice finished with reason ``'length'``.
            json.JSONDecodeError: The reply content is not valid JSON.
        """
        messages = [system_prompt]
        if user_prompt:
            messages.append(user_prompt)

        completions = self.client.chat.completions.create(
            model=self.model_id,
            response_format={'type': 'json_object'},
            messages=messages,
            temperature=temperature
        )
        first_choice = completions.choices[0]

        # Raised before parsing: the reply hit the output limit, so the JSON
        # cannot be trusted to be complete.
        if first_choice.finish_reason == 'length':
            raise IOError(
                'Reached maximum output length, output format is not reliable')

        op = json.loads(first_choice.message.content)

        print(f'Tokens used in generation using {self.model_id}: {completions.usage}')
        print(f'Completion output from Open AI APIs: {completions}')

        return op, completions.usage

In [None]:
# Generator instance bound to the 'gpt-3.5-turbo-1106' model.
generator = GPTGenerator(model_id='gpt-3.5-turbo-1106')

# Running token counters and the list that collects generated datapoints.
total_usage = dict(input=0, output=0)
generated_dataset = list()

In [None]:
# Generate one datapoint per configured synthetic dataset, accumulate token
# usage, and keep only the replies that pass that dataset's response model.
for synth_ds in synthetic_dataset_models.values():
    print(f'Generating synthetic dataset for {synth_ds.name}')

    try:
        sys_prompt = {
            'role': 'system',
            'content': synth_ds.system_prompt.format(
                language='Devnagri Hindi',
                subject='Physics',
                grade='8th Grade',
                topic='Basic Forces and Motion',
                required_format=synth_ds.required_format
            )
        }
        datapoint, usage = generator(
            system_prompt=sys_prompt,
            temperature=GenerationConfiguration.temperature
        )
        # Usage is counted before validation: the tokens were spent either way.
        total_usage['input'] += usage.prompt_tokens
        total_usage['output'] += usage.completion_tokens

        print(f'Return datapoint: {datapoint}')

        # Explicit check instead of `assert`, so validation still runs under
        # `python -O`, where assert statements are stripped.
        # NOTE(review): response_model is presumably a pydantic model whose
        # model_validate raises on invalid data — confirm.
        if not synth_ds.response_model.model_validate(datapoint):
            raise ValueError("Response by the model is not in the valid dataform")
        generated_dataset.append(datapoint)

    except openai.RateLimitError as err:
        # Further requests would fail as well, so stop the whole run.
        print(f'Reached rate limit: {err}')
        break
    except Exception as err:
        # Any other failure only skips this dataset.
        print(f'Raised error: {err}')

    print(f'Used cumulative tokens: {total_usage}')

In [None]:
# Echo the collected datapoints as the cell's output.
generated_dataset

In [None]:
from datasets import Dataset

In [None]:
# Build a `Dataset` from the list of generated datapoints.
# NOTE(review): this rebinds `ds`, which earlier in the notebook held the MMLU
# train split; re-running earlier cells afterwards will see the new value.
ds = Dataset.from_list(generated_dataset)

In [None]:
# Display the first record of `ds` as the cell output.
ds[0]