In [None]:
import json
import os
import random
from textwrap import dedent
from typing import Any, Dict, List, Tuple

import backoff
import openai
import pandas as pd
from datasets import load_from_disk, Dataset
from dotenv import load_dotenv
from tqdm import tqdm

In [None]:
class GPTGenerator():
    """Callable wrapper around the OpenAI chat-completions endpoint that returns parsed JSON.

    Every request is sent with ``response_format={'type': 'json_object'}`` and the
    reply is decoded with ``json.loads`` before being handed back to the caller.
    """

    def __init__(self, model_id) -> None:
        """Create the API client.

        Args:
            model_id: Model name forwarded as ``model`` on every request.
        """
        # load_dotenv() runs before the client is built, presumably so the API key
        # can be supplied through a local .env file.
        load_dotenv()

        self.client = openai.OpenAI()
        self.model_id = model_id

    @backoff.on_exception(backoff.expo, openai.RateLimitError, max_time=300)
    def __call__(self, messages: List[Dict[str, str]], temperature: float = 1.4) -> Tuple[Dict, Any]:
        """Send ``messages`` to the model and return the decoded JSON reply.

        Rate-limit errors are retried with exponential backoff for up to 300 seconds.

        Args:
            messages: Chat messages, each a dict with ``role`` and ``content`` keys.
            temperature: Sampling temperature forwarded to the API.

        Returns:
            A tuple ``(parsed_reply, usage)`` where ``parsed_reply`` is the result of
            ``json.loads`` on the first choice and ``usage`` is the ``usage`` object
            of the completion.

        Raises:
            IOError: The first choice finished with ``finish_reason == 'length'``,
                so the JSON is probably truncated.
            ValueError: The model returned no content, or content that is not valid
                JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        completions = self.client.chat.completions.create(
            model=self.model_id,
            response_format={'type': 'json_object'},
            messages=messages,
            temperature=temperature,
            # max_tokens=2048
        )

        choice = completions.choices[0]
        # The SDK types message.content as optional; normalise a missing reply to ''
        # so the checks below raise a meaningful error instead of failing on None.
        content = (choice.message.content or '').strip()

        if choice.finish_reason == 'length':
            raise IOError(f'Reached maximum output length, output format is not reliable. {content}')

        if not content:
            raise ValueError(f'Model returned no content (finish_reason={choice.finish_reason!r})')

        op = json.loads(content)

        return op, completions.usage


## General MCQ
- Load the dataset
- Shuffle the options

In [None]:
# Directory of the saved general-MCQ dataset; the path is relative, so it is
# resolved against the notebook's working directory.
mcq_data_path_part_1 = '../data/synthetic_data/20231228-1840/general_mcq/'

In [None]:
# Load the dataset stored at mcq_data_path_part_1 and print how many rows it has.
mcq_ds = load_from_disk(mcq_data_path_part_1)
print(mcq_ds.num_rows)

In [None]:
# Show the first record; later cells read its QUESTION, A, B, C, D and TARGET fields.
mcq_ds[0]

### Prompting
- System prompt: answer the MCQ question and output only the option letter
- User prompt: the question followed by its four options

In [None]:
# System message sent with every request. It mentions JSON explicitly because the
# requests are made with response_format={'type': 'json_object'}.
sys_prompt = {
    'role': 'system',
    'content': dedent('''
    You are a helpful assistant who answers MCQ question.
    The passage will be in Hindi/Hinglish. The answer needs to be in the same language of question.
    Only output the single letter from (A/B/C/D) and always output in JSON. The format of the user query will be:

    Question
    ""

    Choices
    A.
    B.
    C.
    D.

    The correct answer is:
    ''').strip()
}

# Template for the user message; filled with str.format(question=..., A=..., B=..., C=..., D=...).
# Its layout mirrors the one described in the system prompt.
usr_content = dedent('''
    Question
    {question}

    Choices
    A. {A}
    B. {B}
    C. {C}
    D. {D}

    The correct answer is:
''').strip()

In [None]:
# Generator instance that the evaluation loop calls once per question.
engine = GPTGenerator(model_id='gpt-4-1106-preview')

In [None]:
# Index of the last row handled by a previous (interrupted) run of this cell: rows
# with an index <= this value are skipped. Set to -1 to process the whole dataset.
RESUME_AFTER_INDEX = 159

# Keep results accumulated earlier in this kernel so a resumed run appends to them,
# but create the containers when they do not exist yet. Otherwise every row fails
# with NameError *after* the API call has already been made.
if 'mcq_validity' not in globals():
    mcq_validity = {
        'OUTPUT': [],
        'QUESTION': [],
        'PROMPT': []
    }

ip_tokens, op_tokens = 0, 0

for i, elem in enumerate(tqdm(mcq_ds, total=mcq_ds.num_rows)):
    if i <= RESUME_AFTER_INDEX:
        continue
    try:
        usr_prompt = {
            'role': 'user',
            'content': usr_content.format(
                question=elem['QUESTION'],
                A=elem['A'],
                B=elem['B'],
                C=elem['C'],
                D=elem['D']
            )
        }

        messages = [sys_prompt, usr_prompt]
        output, tks = engine(messages, temperature=1)

        # The prompt does not fix the JSON key, so take the first value whatever
        # the key is. Recording exactly one answer per question keeps the three
        # lists the same length even if the model returns several keys.
        answers = list(output.values())
        if not answers:
            raise ValueError(f'Empty JSON object returned for question: {elem["QUESTION"]}')

        # Append to all three lists only once the answer is known, so a failure
        # above cannot leave them out of step.
        mcq_validity['QUESTION'].append(elem['QUESTION'])
        mcq_validity['PROMPT'].append(usr_prompt)
        mcq_validity['OUTPUT'].append(answers[0])

        ip_tokens += tks.prompt_tokens
        op_tokens += tks.completion_tokens

        if i % 100 == 0:
            print(f'Input tokens: {ip_tokens}\tOutput tokens: {op_tokens}')

    except Exception as err:
        # Best effort: report the failure and move on to the next question.
        print(f'Error: {err}')

In [None]:
# Sanity check: the three lists must be equally long before they are turned
# into a DataFrame in a later cell.
len(mcq_validity['OUTPUT']), len(mcq_validity['QUESTION']), len(mcq_validity['PROMPT'])

In [None]:
# Prompt and completion token totals accumulated by the evaluation loop.
ip_tokens, op_tokens

In [None]:
# Tabular copy of the source dataset.
mcq_df = mcq_ds.to_pandas()

# Turn the collected answers into a frame (rebinds mcq_validity) and attach the
# source columns to each answered question by matching on the question text.
mcq_validity = pd.DataFrame(mcq_validity)
temp = pd.merge(mcq_validity, mcq_df, how='inner', on=['QUESTION'])

print(temp.shape)

In [None]:
# Shape of the subset where the model's answer equals the reference answer
# (the first element is the number of matching rows).
temp.query('TARGET == OUTPUT').shape

In [None]:
# Shape of the subset where the model's answer differs from the reference answer.
temp.query('TARGET != OUTPUT').shape

In [None]:
# Display the source dataset as a DataFrame.
mcq_df

In [None]:
from datasets import Dataset
import random

In [None]:
# Assuming 'dataset' is your HuggingFace dataset
def add_column_with_probability(dataset, new_column_name, value1, value2, probability=0.8, seed=None):
    """Add a column whose entries are randomly ``value1`` or ``value2``.

    Each row independently receives ``value1`` when a uniform draw from [0, 1) is
    below ``probability`` and ``value2`` otherwise.

    Args:
        dataset: Object supporting ``len()`` and ``add_column(name, values)``.
        new_column_name: Name of the column to add.
        value1: Value assigned with probability ``probability``.
        value2: Value assigned otherwise.
        probability: Chance of picking ``value1`` for a row.
        seed: Optional seed. When given, the draws come from a private generator
            seeded with it, so the result is reproducible. When ``None``, the
            module-level ``random`` functions are used, exactly as before this
            parameter existed.

    Returns:
        The result of ``dataset.add_column(new_column_name, new_values)``.
    """
    # A private generator keeps a seeded call from disturbing the global RNG state.
    rng = random.Random(seed) if seed is not None else random
    new_values = [value1 if rng.random() < probability else value2 for _ in range(len(dataset))]
    return dataset.add_column(new_column_name, new_values)

# Assign roughly 80% of the rows to 'train' and the rest to 'test'.
# The helper draws from the module-level random generator, so seed it first:
# without a fixed seed every re-run reshuffles which questions land in the
# held-out test split.
random.seed(42)
export_ds = add_column_with_probability(mcq_ds, 'SPLIT', 'train', 'test', 0.8)

In [None]:
# Training split: rows tagged 'train' whose reference answer is one of the four
# option letters (both conditions checked in a single pass).
train_ds = export_ds.filter(
    lambda example: example['SPLIT'] == 'train' and example['TARGET'] in ['A', 'B', 'C', 'D']
)

In [None]:
# Test split: every row tagged 'test'.
# NOTE(review): unlike train_ds, rows whose TARGET is not one of A/B/C/D are kept
# here — confirm whether the same validity filter should apply.
test_ds = export_ds.filter(lambda example: example['SPLIT'] == 'test')

In [None]:
# Publish the training split. The access token is read from the HF_TOKEN
# environment variable instead of being pasted into the notebook; when it is
# unset, None is passed and the library's own credential lookup applies.
train_ds.push_to_hub('cmeraki/eval_general_mcq', token=os.getenv('HF_TOKEN'))

In [None]:
# Publish the held-out test split. The access token is read from the HF_TOKEN
# environment variable instead of being pasted into the notebook.
# private=True asks for the repository to be created as private, matching its
# '_private' name. NOTE(review): this only matters when the repo does not exist
# yet — confirm the visibility of an already-created repo on the Hub.
test_ds.push_to_hub('cmeraki/eval_general_mcq_private', token=os.getenv('HF_TOKEN'), private=True)