In [None]:
import json
import os
import random
from textwrap import dedent
from typing import Any, Dict, List, Tuple

import backoff
import openai
import pandas as pd
from datasets import load_from_disk, Dataset
from dotenv import load_dotenv
from tqdm import tqdm

In [None]:
class GPTGenerator():
    """Callable wrapper around the OpenAI chat-completions endpoint in JSON mode.

    Every call requests `response_format={'type': 'json_object'}` and returns the
    parsed JSON payload together with the usage object reported by the API.
    """

    def __init__(self, model_id) -> None:
        """Load environment variables and create the OpenAI client.

        Args:
            model_id: Model name sent as `model` on every completion request.
        """
        from dotenv import load_dotenv
        load_dotenv()

        self.client = openai.OpenAI()
        self.model_id = model_id

    # Rate-limit errors are retried with exponential backoff for up to 300 seconds.
    @backoff.on_exception(backoff.expo, openai.RateLimitError, max_time=300)
    def __call__(self, messages: List[Dict[str, str]], temperature: float = 1.4) -> Tuple[Dict, Any]:
        """Send `messages` to the model and return the decoded JSON reply.

        Args:
            messages: Chat messages, each a dict such as
                `{'role': ..., 'content': ...}`.
            temperature: Sampling temperature forwarded to the API.

        Returns:
            A tuple `(parsed_json, usage)` where `usage` is `completions.usage`.

        Raises:
            IOError: if generation stopped because the output length limit was hit.
            ValueError: if the reply has no content to parse.
            json.JSONDecodeError: if the reply content is not valid JSON.
        """
        completions = self.client.chat.completions.create(
            model=self.model_id,
            response_format={'type': 'json_object'},
            messages=messages,
            temperature=temperature,
        )

        choice = completions.choices[0]

        # A truncated reply is very likely to be truncated JSON, so fail loudly.
        if choice.finish_reason == 'length':
            raise IOError(f'Reached maximum output length, output format is not reliable. {choice.message.content}')

        # Guard the None case explicitly; json.loads(None) would otherwise raise
        # an uninformative TypeError.
        if choice.message.content is None:
            raise ValueError(f'Model returned no content (finish_reason={choice.finish_reason})')

        op = json.loads(choice.message.content)

        return op, completions.usage


## Retrieval dataset
- Load the generated dataset
- Standardize the schema of the dataset
- Schema of the dataset
    - QUESTION (str)
    - TYPE (enum <'fill_in_the_blanks', 'true_false', 'mcq'>)
    - CHOICES (list)
    - TARGET (0-based index of the correct option in the CHOICES list)
    - PASSAGE_HASH (hash of the passage id)

In [None]:
# Input locations, relative to the notebook's working directory.
# raw_articles_path: JSONL file of source articles.
# generated_data_path: JSONL file of generated questions.
raw_articles_path = '../data/retrieval/cleaned_dataset/dataset.jsonl'
generated_data_path = '../data/synthetic_data/20231229-1712/retrieval_questions/dataset.jsonl'

In [None]:
def load_from_jsonl(path):
    """Read a JSON Lines file into a DataFrame, one row per non-blank line.

    Args:
        path: Location of the `.jsonl` file.

    Returns:
        pd.DataFrame built from the parsed records, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data = []

    # Decode explicitly as UTF-8: the platform default encoding is not
    # guaranteed to handle the non-ASCII text stored in these files.
    with open(path, 'r', encoding='utf-8') as fp:
        # Stream line by line instead of loading the whole file at once.
        for ln in fp:
            ln = ln.strip()
            # Skip empty and whitespace-only lines (e.g. a trailing newline).
            if not ln:
                continue
            data.append(json.loads(ln))

    print(f'Loaded dataset with {len(data)} rows')

    return pd.DataFrame(data)

In [None]:
# Read the articles first, then the generated questions, with the same JSONL reader.
articles_ds, synthetic_ds = (
    load_from_jsonl(jsonl_path)
    for jsonl_path in (raw_articles_path, generated_data_path)
)

In [None]:
# Keep one row per article URL (modifies articles_ds in place).
articles_ds.drop_duplicates(subset='link', inplace=True)

# Remove the two right-most columns of the generated-questions frame in place.
# The columns are chosen by position, so re-running this cell removes two more.
trailing_cols = synthetic_ds.columns[-2:]
synthetic_ds.drop(columns=trailing_cols, inplace=True)

In [None]:
# Display the column labels of the generated-questions frame.
synthetic_ds.columns

In [None]:
# Mapping multiple categories to a standard category.
# Raw TYPE labels are listed under the canonical label they collapse to, and the
# flat raw-label -> canonical lookup is derived from that grouping.
_raw_labels_by_type = {
    'fill_in_the_blanks': [
        'Fill in the Blank',
        'FILL_BLANK',
        'Fill in the blank',
        'One word fill in the blanks',
        'Fill in the blanks',
        'FillInBlank',
        'FILL IN THE BLANK',
        'One Word',
        'One word',
        'One_word_fill_in_the_blanks',
        'FILL_IN_THE_BLANKS',
        'One-word',
        'Fill in the Blanks',
        'One word Fill in the blank',
        'FILL_IN_THE_BLANK',
        'Fill-in-the-blank',
        'One Word Fill In The Blanks',
        'FillInTheBlank',
        'FILL IN THE BLANKS',
        'One Word Fill in the blanks',
        'Fill-in-the-Blank',
    ],
    'true_false': [
        'True/False',
        'TRUE_FALSE',
        'TrueFalse',
        'False assumptions premise',
        'Truth/Own assumptions',
        'TRUE/FALSE',
        'Solution_Seeker_TrueFalse',
    ],
    'mcq': [
        'MCQ',
        'Unknown Type',
        'Nonsensical Instructions,(waitlisting_unset.dispatch_profile_update)%  <Give_Statics_of_ShareMarket>',
        'Unintelligible Question',
        'Text',
    ],
}

type_mapper = {
    raw_label: canonical
    for canonical, raw_labels in _raw_labels_by_type.items()
    for raw_label in raw_labels
}

# Strict lookup: a TYPE value missing from the table raises KeyError.
synthetic_ds['CLEANED_TYPE'] = synthetic_ds.TYPE.apply(type_mapper.__getitem__)

In [None]:
# Attach each generated question to its source article by URL. The inner join
# discards questions whose PASSAGE_LINK has no matching article `link`.
merged_ds = pd.merge(
    synthetic_ds,
    articles_ds,
    how='inner',
    left_on='PASSAGE_LINK',
    right_on='link',
)

In [None]:
# Filtering rules applied in this cell:
# - rows with missing CHOICES are dropped for MCQ and fill-in-the-blanks questions
# - rows that are not true/false and have fewer than 2 choices are dropped
print(f'Initial size: {merged_ds.shape}')

# How many rows per question type have no CHOICES at all.
missing_choices = merged_ds[merged_ds.CHOICES.isna()]
print(missing_choices.groupby('CLEANED_TYPE').size())

# Length of the CHOICES array
def arr_len(x):
    """Return `len(x)` when `x` is exactly a `list`; otherwise return -1."""
    return len(x) if type(x) is list else -1

merged_ds['CHOICES_LEN'] = merged_ds['CHOICES'].map(arr_len)

# Keep a row when it has at least two choices, or when it is a true/false question.
has_enough_choices = merged_ds['CHOICES'].notna() & (merged_ds['CHOICES_LEN'] >= 2)
is_true_false = merged_ds['CLEANED_TYPE'] == 'true_false'
merged_ds = merged_ds[has_enough_choices | is_true_false]
print(f'Final size: {merged_ds.shape}')

# Distribution of choice counts per question type after filtering.
print(merged_ds.groupby(['CLEANED_TYPE', 'CHOICES_LEN']).size())

In [None]:
# Client used to standardize the CHOICES/TARGET fields of true/false questions.
standardizer = GPTGenerator(model_id='gpt-4-1106-preview')

In [None]:
# System prompt for the standardization pass: the model receives
# {'CHOICES', 'TARGET'} and must echo them plus CLEAN_CHOICES / CLEAN_TARGET.
# NOTE(review): the third rule reads "If TARGET is not given, ... inferred from
# the given TARGET" — the wording looks self-contradictory; confirm the intent.
sys_prompt = {
    'role': 'system',
    'content': dedent('''
    You are helpful assistant who is an expert in data cleaning
    You will be provided with a JSON with the schema:
    {'CHOICES': <>, 'TARGET': <>}
    
    You have to return a cleaned and standardized JSON with the schema.
    {'CHOICES': <>, 'TARGET': <>, 'CLEAN_CHOICES': <>, 'CLEAN_TARGET': <>}
    
    Clean the JSON provided following the rules:
    - Return [True, False] in the CLEAN_CHOICES key
    - TARGET can either be the correct index in the 0th indexed list CHOICES or not given
    - If TARGET is not given, it needs to be inferred from the given TARGET and CHOICES and populated into CLEAN_TARGET
    - The CLEAN_TARGET might not always be 0 

    Return with the cleaned JSON only
    ''').strip()
}

# One-shot example: a user message followed by the expected assistant reply.
# NOTE(review): both example strings use single quotes and unquoted words, so
# they are not valid JSON, whereas real inputs are produced with json.dumps.
ex1_prompt = {
    'role': 'user',
    'name': 'example_user1',
    'content': "{'CHOICES': [सत्य, असत्य], 'TARGET': 'असत्य'}"
}

# NOTE(review): this message has role 'assistant' but is named 'example_user2'.
ex2_prompt = {
    'role': 'assistant',
    'name': 'example_user2',
    'content': "{'CHOICES': [सत्य, असत्य], 'TARGET': 'असत्य', 'CLEAN_CHOICES': [True, False], 'CLEAN_TARGET': 1}"
}

In [None]:
# Standardize CHOICES/TARGET for every true/false question with the LLM.
standard_data = []   # one dict per successfully standardized question
failed_rows = []     # index labels of rows whose request or parsing failed
ip_tokens, op_tokens = 0, 0

# Evaluate the filter once; it feeds both the iteration and the progress total.
true_false_rows = merged_ds.query('CLEANED_TYPE == "true_false"')

for i, elem in tqdm(enumerate(true_false_rows.itertuples()), total=true_false_rows.shape[0]):
    try:
        usr_prompt = {
            'role': 'user',
            # ensure_ascii=False keeps non-ASCII text readable in the prompt.
            'content': json.dumps({
                'CHOICES': elem.CHOICES,
                'TARGET': elem.TARGET
            }, ensure_ascii=False)
        }
        # Data cleaning should be repeatable, so sample greedily instead of
        # relying on the generator's high default temperature.
        op, tks = standardizer(
            [sys_prompt, ex1_prompt, ex2_prompt, usr_prompt],
            temperature=0
        )
        # The question text is the key used later to join back onto merged_ds.
        op.update({'QUESTION': elem.QUESTION})
        standard_data.append(op)

        ip_tokens += tks.prompt_tokens
        op_tokens += tks.completion_tokens

        if i % 100 == 0:
            print(f'Input tokens: {ip_tokens}\tOutput tokens: {op_tokens}')

    except Exception as err:
        # Best effort: keep going, but remember which rows were lost.
        failed_rows.append(elem.Index)
        print(f'Error: {err} at {elem.Index}')

print(f'Input tokens: {ip_tokens}\tOutput tokens: {op_tokens}')
print(f'Standardized rows: {len(standard_data)}\tFailed rows: {len(failed_rows)}')

In [None]:
# Re-attach the standardized outputs to the full question rows by question text.
# NOTE(review): QUESTION is the only join key, so a question text that occurs
# more than once on either side multiplies rows in the result.
standard_ds = pd.DataFrame(standard_data).merge(merged_ds, how='inner', on='QUESTION')

In [None]:
# CLEAN_TARGET indexes the two-element CLEAN_CHOICES list, so only 0 and 1 are
# valid: drop missing values and anything outside [0, 2).
# .copy() gives an independent frame, so later .loc assignments do not hit
# pandas' chained-assignment warning.
standard_ds = standard_ds.query(
    'CLEAN_TARGET.notna() and CLEAN_TARGET >= 0 and CLEAN_TARGET < 2'
).copy()

In [None]:
# manually clean [27, 98]
# NOTE(review): only row 98 is overridden below; confirm whether row 27 still
# needs a manual fix.
manual_fix_idx = 98

# Assigning through .loc with a label that is not in the index would append a
# new, otherwise-empty row, so only touch the row when it really exists.
if manual_fix_idx in standard_ds.index:
    standard_ds.loc[manual_fix_idx, 'CLEAN_TARGET'] = 0
else:
    print(f'Row {manual_fix_idx} not found in standard_ds; manual fix skipped')

### Verifying data
- Send passage and question to GPT for QnA
- Return one word answer
- Verify with existing data

In [None]:
# Task instruction appended to each question, keyed by the cleaned question type.
prompt_ques_mapper = {
    'fill_in_the_blanks': 'Answer this fill in the blanks question from the choices given only',
    'true_false': 'Answer this True/False question in one word (True or False)',
    'mcq': 'Answer this MCQ question by selecting the answer from the choices given only'
    
}

# System prompt for the answer-verification pass.
# NOTE(review): this rebinds `sys_prompt`, the same name the standardization
# loop earlier in the notebook reads; re-running that loop after this cell
# would send this prompt instead.
sys_prompt = {
    'role': 'system',
    'content': dedent('''
    You are a helpful assistant who answer questions based on the given passage.
    The passage will be in Hindi/Hinglish. The answer needs to be in the same language of question. There can be multiple questions.
    The format of the user query will be:

    Passage
    ""

    Question 1
    ""

    Choices 1
    ""

    Task 1
    ""

    Always output in JSON {'1': <>, '2': <>}
    Answer -1 if the question is nonsensical and choose from Choices if the choices are given.
    ''').strip()
}

# Per-question template. It requires the fields i, question, choices and task,
# and has no {passage} placeholder: the passage header must be added by the caller.
usr_content = dedent('''
    Question {i}
    {question}

    Choices {i}
    {choices}

    Task {i}
    {task}

''')

In [None]:
# Draw one random fill-in-the-blanks row to try the verification prompt on.
fill_blank_rows = merged_ds[merged_ds.CLEANED_TYPE == 'fill_in_the_blanks']
ex = fill_blank_rows.sample(n=1).iloc[0]
ex

In [None]:
# Client used to answer the generated questions during verification.
engine = GPTGenerator(model_id='gpt-3.5-turbo-1106')

In [None]:
# Single-question trial of the verification prompt on the sampled row `ex`.
# `usr_content` only covers the per-question part and requires the question
# number `i`, so the passage header is prepended here and `i=1` is supplied.
# Calling format() without `i` raises KeyError, and a `passage=` argument is
# silently ignored by the template.
usr_prompt = {
    'role': 'user',
    'content': 'Passage\n{passage}\n'.format(passage=ex.content) + usr_content.format(
        i=1,
        question=ex.QUESTION,
        choices=ex.CHOICES,
        task=prompt_ques_mapper[ex.CLEANED_TYPE]
    )
}

op, tkns = engine([
        sys_prompt,
        usr_prompt
    ], temperature=1
)

In [None]:
# Ask the model every question of a passage in one request and collect answers.
validity = {
    'output': [],    # model answer (None when the model returned no entry)
    'link': [],      # article URL the question belongs to
    'question': []   # question text, used to join back onto merged_ds
}

ip_tokens, op_tokens = 0, 0

# Group on the scalar column name so each key is the link string itself; with
# a one-element list the key's shape (scalar vs 1-tuple) depends on the pandas
# version, which makes indexing into it unreliable.
passage_groups = merged_ds.groupby('link')

for i, (link, group) in enumerate(tqdm(passage_groups, total=len(passage_groups))):
    try:
        # The passage header comes first, followed by one numbered block per question.
        all_quest = 'Passage\n{passage}\n'.format(passage=group['content'].iloc[0])

        ques = []
        for idx, q in enumerate(group.itertuples(), start=1):
            # Shuffle real choice lists only. A missing CHOICES value is NaN,
            # which is truthy and cannot be sampled, so it is passed through as is.
            if isinstance(q.CHOICES, list):
                choices = random.sample(q.CHOICES, len(q.CHOICES))
            else:
                choices = q.CHOICES

            all_quest += usr_content.format(
                i=idx,
                question=q.QUESTION,
                choices=choices,
                task=prompt_ques_mapper[q.CLEANED_TYPE]
            )
            ques.append(q.QUESTION)

        usr_prompt = {
            'role': 'user',
            'content': all_quest.strip()
        }

        messages = [sys_prompt, usr_prompt]
        output, tks = engine(messages, temperature=1)

        # Match answers to questions through the numeric keys the system prompt
        # asks for ('1', '2', ...), not through the order or count of values.
        # This keeps the three lists the same length even if an answer is missing.
        answers = [output.get(str(idx)) for idx in range(1, len(ques) + 1)]

        validity['output'].extend(answers)
        validity['link'].extend([link] * len(ques))
        validity['question'].extend(ques)

        ip_tokens += tks.prompt_tokens
        op_tokens += tks.completion_tokens

        if i % 100 == 0:
            print(f'Input tokens: {ip_tokens}\tOutput tokens: {op_tokens}')

    except Exception as err:
        # Best effort: a failing passage is reported and skipped.
        print(f'Error: {err}')

In [None]:
# Convert the collected lists into a frame with output, link and question columns.
# NOTE(review): this rebinds `validity` from a dict of lists to a DataFrame, so
# the collection loop cannot be resumed against it without re-initialising.
validity = pd.DataFrame(validity)

In [None]:
# Join the model answers back onto the question rows by (link, question text)
# and add a CORRECT flag that starts out False on every row.
# NOTE(review): nothing in the visible notebook sets CORRECT to True afterwards.
op = merged_ds.merge(
    validity,
    how='inner',
    left_on=['link', 'QUESTION'],
    right_on=['link', 'question'],
).assign(CORRECT=False)

In [None]:
# Shapes of the rows flagged correct and of the rows flagged incorrect.
op.query('CORRECT == True').shape, op.query('CORRECT == False').shape

In [None]:
# NOTE(review): not referenced anywhere else in the visible notebook; presumably
# row indices noted during manual inspection — confirm or remove.
output_to_target_idx = [2, 37]

### Cleaning and uploading
- Remove incorrectly formatted QnA
- Merge all datasets together
- Push to hub

In [None]:
import random
import hashlib
from copy import deepcopy
from datasets import Dataset

In [None]:
def hash_url(url):
    """Return the hexadecimal SHA-256 digest of `url` (encoded with str.encode())."""
    return hashlib.sha256(url.encode()).hexdigest()

In [None]:
# part1: every question that is not true/false, taken from merged_ds.
# part2: the standardized true/false questions.
# Both are deep copies, so later edits leave the source frames untouched.
part1 = merged_ds[merged_ds.CLEANED_TYPE != 'true_false'].copy(deep=True)
part2 = standard_ds.copy(deep=True)

In [None]:
# Add a SHA-256 hash of the passage URL to both parts.
for qa_part in (part1, part2):
    qa_part['PASSAGE_HASH'] = qa_part['PASSAGE_LINK'].map(hash_url)

In [None]:
# Output column names shared by both parts; assigned positionally to the
# columns selected from part1 and part2.
common_cols = ['PASSAGE', 'QUESTION', 'TYPE', 'CHOICES', 'TARGET', 'PASSAGE_HASH']

In [None]:
# Select the source columns for the non-true/false part and relabel them
# positionally to the shared schema.
# NOTE(review): the output PASSAGE_HASH column is filled from ENCODED_ID, not
# from the PASSAGE_HASH column computed from PASSAGE_LINK — confirm this is intended.
part1_source_cols = ['content', 'QUESTION', 'CLEANED_TYPE', 'CHOICES', 'TARGET', 'ENCODED_ID']
part1 = part1[part1_source_cols].set_axis(common_cols, axis=1)

In [None]:
# Select the source columns for the true/false part, using the cleaned
# choices and target, and relabel them positionally to the shared schema.
# NOTE(review): the output PASSAGE_HASH column is filled from ENCODED_ID, not
# from the PASSAGE_HASH column computed from PASSAGE_LINK — confirm this is intended.
part2_source_cols = ['content', 'QUESTION', 'CLEANED_TYPE', 'CLEAN_CHOICES', 'CLEAN_TARGET', 'ENCODED_ID']
part2 = part2[part2_source_cols].set_axis(common_cols, axis=1)

In [None]:
# Stack both parts; ignore_index=True already yields a fresh 0..n-1 index.
final_df = pd.concat([part1, part2], axis=0, ignore_index=True)

In [None]:
# Split by passage, not by row, so all questions about one passage end up on
# the same side of the split.
TEST_PASSAGES = 10   # number of passages held out for the test set
SPLIT_SEED = 42      # fixed seed: re-running the notebook reproduces the split

unique_hashes = list(final_df.PASSAGE_HASH.unique())

# A dedicated Random instance keeps the split independent of any other use of
# the global `random` state earlier in the notebook.
test_hash = random.Random(SPLIT_SEED).sample(unique_hashes, TEST_PASSAGES)

held_out = set(test_hash)
train_hash = [x for x in unique_hashes if x not in held_out]

In [None]:
# Number of passages in the test and train splits.
len(test_hash), len(train_hash)

In [None]:
# Materialise the two splits from the passage-hash lists, each with a fresh index.
in_test = final_df.PASSAGE_HASH.isin(test_hash)
in_train = final_df.PASSAGE_HASH.isin(train_hash)

test_df = final_df[in_test].reset_index(drop=True)
train_df = final_df[in_train].reset_index(drop=True)

print(test_df.shape, train_df.shape)

In [None]:
# One time
# Drops the row labelled 50 from train_df in place, then renumbers the index.
# NOTE(review): train_df comes from a random passage split, so label 50 may be
# a different row on each run, and re-running this cell drops a further row.
# Confirm which record was meant and select it by content instead.
train_df.drop(50, inplace=True)
train_df.reset_index(inplace=True, drop=True)

In [None]:
# Store every column as a string, except TARGET which is stored as an integer.
for split_df in (train_df, test_df):
    # Parse TARGET from the raw values before stringifying. Casting to str and
    # then to int fails for float-typed targets, because '1.0' is not an int literal.
    target_numeric = pd.to_numeric(split_df['TARGET'])

    # Refuse to truncate silently: every target must be a whole number.
    if not (target_numeric == target_numeric.round()).all():
        raise ValueError('TARGET contains non-integer values')

    for c in split_df.columns:
        split_df[c] = split_df[c].astype(str)

    split_df['TARGET'] = target_numeric.astype(int)

In [None]:
# Wrap both pandas splits as Hugging Face datasets (train first, then test).
train_ds, test_ds = (
    Dataset.from_pandas(split_frame)
    for split_frame in (train_df, test_df)
)

In [None]:
# Upload the train split. The Hub token is read from the HF_TOKEN environment
# variable, so no credential is stored in the notebook. A missing variable
# fails fast with KeyError.
# Any token that was ever committed in a notebook must be treated as leaked and revoked.
train_ds.push_to_hub('cmeraki/hindi_eval_retrieval', token=os.environ['HF_TOKEN'])

In [None]:
# Upload the held-out test split to its separate repository. The Hub token is
# read from the HF_TOKEN environment variable, so no credential is stored in
# the notebook. A missing variable fails fast with KeyError.
# Any token that was ever committed in a notebook must be treated as leaked and revoked.
test_ds.push_to_hub('cmeraki/hindi_eval_retrieval_private', token=os.environ['HF_TOKEN'])