In [1]:
import numpy as np
import matplotlib.pyplot as plt

from openai import OpenAI
from dotenv import load_dotenv
from typing import List, Dict

from datasets import load_dataset

import utils as utils

In [2]:
# Load variables from a local .env file into the process environment first, so the
# client constructed below can see them.
load_dotenv()
# No credentials are passed explicitly; presumably the OpenAI client reads its API key
# from the environment (e.g. OPENAI_API_KEY) — verify against the .env in use.
client = OpenAI()

In [None]:
# Smoke test for the OpenAI client: one system + one user message, single completion.
poet_messages = [
    {
        "role": "system",
        "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.",
    },
    {
        "role": "user",
        "content": "Compose a poem that explains the concept of recursion in programming.",
    },
]

completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=poet_messages,
)

print(completion.choices[0].message)

Hugging Face open datasets

In [3]:
# English multi-turn chat data; the 'train_sft' split is used further down.
ultrachat = load_dataset("HuggingFaceH4/ultrachat_200k")
# English <-> Hindi parallel sentences, used below to compare token counts.
translate_ds = load_dataset("ai4bharat/IN22-Gen", name="eng_Latn-hin_Deva")
# Hindi QA data (not referenced by the cells visible in this notebook).
indic_qa = load_dataset("ai4bharat/IndicQA", name="indicqa.hi")
# Disabled: Hindi-only config of IN22-Gen.
# hindi_ds = load_dataset('ai4bharat/IN22-Gen', 'hin_Deva')

In [4]:
# Two passes over the SFT split, each with 20 worker processes.
# NOTE(review): judging by the helper names, the first pass adds token counts and the
# second adds the flattened conversation string ('str_msg' is read later) — confirm in utils.
train_sft_with_tokens = ultrachat['train_sft'].map(utils.get_batch_tokens, num_proc=20)
raw_ds = train_sft_with_tokens.map(utils.get_stringify_conversation, num_proc=20)

Map (num_proc=20):   0%|          | 0/207865 [00:00<?, ? examples/s]

In [None]:
# Peek at the first row of the 'gen' split: its context and its Hindi sentence.
first_gen_row = translate_ds['gen'][0]
first_gen_row['context'], first_gen_row['sentence_hin_Deva']

In [9]:
# Few-shot example pair for the translation prompts built below: one short
# user/assistant conversation in English and the same conversation in Hindi.
# Both strings are inserted verbatim into the prompts, so any edit to them changes
# what is sent to the model.
example_english = """
user
Which famous landmarks should I visit in London, beyond the usual ones?

assistant
You can visit Leadenhall Market - a beautiful indoor market with stunning Victorian architecture, also used as a filming location in the Harry Potter films.

user
Hmm, this is an interesting suggestion, but I've already seen this landmark in London. Is there something more offbeat that you can recommend? Something that locals might know about?

assistant
Absolutely! Here is an offbeat and lesser-known place in London that locals might recommend: God's Own Junkyard - a neon wonderland filled with vintage and new neon signs.
There are many other hidden gems in London, and a quick Google search for ‘offbeat things in London’ will bring up many blogs and resources with more options.
"""

# Hindi counterpart of example_english, with the role labels translated as well.
example_hindi = """
उपयोगकर्ता
लंदन में मुझे कौन से प्रसिद्ध स्थल देखने चाहिए, जो आमतौर पर नहीं होते हैं?

सहायक
आप लेडनहॉल मार्केट देख सकते हैं - एक सुंदर इंडोर मार्केट जिसमें आकर्षक विक्टोरियन वास्तुकला है, जिसे हैरी पॉटर फिल्मों में फिल्मांकन स्थल के रूप में भी इस्तेमाल किया गया है।

उपयोगकर्ता
हम्म, यह एक दिलचस्प सुझाव है, लेकिन मैंने लंदन में इस स्थल को पहले ही देख लिया है। क्या आप कुछ और अनोखा सुझाव दे सकते हैं? कुछ ऐसा जो स्थानीय लोगों को पता हो?

सहायक
बिल्कुल! यहां लंदन में एक अनोखा और कम जाना-पहचाना स्थल है जिसकी स्थानीय लोग सिफारिश कर सकते हैं: गॉड्स ओन जंकयार्ड - एक नियॉन वंडरलैंड जो पुराने और नए नियॉन साइन्स से भरा हुआ है।
लंदन में कई अन्य छुपे हुए रत्न हैं, और ‘लंदन में अनोखी चीजें’ के लिए गूगल सर्च करने पर आपको कई ब्लॉग्स और संसाधन मिल जाएंगे जिनमें और भी विकल्प होंगे।
"""

In [10]:
def get_translate_query(example: Dict, example_english, example_hindi):
    """
    Attach a one-shot English-to-Hindi translation chat to a dataset row.

    The chat is stored under ``example["text"]`` as four messages: a system
    instruction, the worked example (user request + assistant answer), and a
    final user request asking to translate ``example["str_msg"]``.

    Args:
        example: Row to annotate; must contain the key ``"str_msg"``.
        example_english: English side of the worked example.
        example_hindi: Hindi side of the worked example.

    Returns:
        The same ``example`` object, mutated in place with the new ``"text"`` key.

    Raises:
        KeyError: If ``example`` has no ``"str_msg"`` entry.
    """
    # NOTE(review): the system prompt contains typos ("tranlator", "traslates");
    # it is kept verbatim here because the text is sent to the model as-is.
    instruction = "You are an expert tranlator who traslates given text in English to Devnagri Hindi"

    few_shot_request = f"Translate {example_english} to Devnagri Hindi"
    few_shot_reply = f"{example_hindi}"
    target_request = f'Translate {example["str_msg"]} to Devnagri Hindi'

    example["text"] = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": few_shot_request},
        {"role": "assistant", "content": few_shot_reply},
        {"role": "user", "content": target_request},
    ]
    return example

In [11]:
# Add the chat-formatted translation request ('text') to every row, using the shared
# few-shot example pair; the extra arguments are forwarded through fn_kwargs.
raw_ds = raw_ds.map(
    get_translate_query,
    fn_kwargs={
        "example_english": example_english,
        "example_hindi": example_hindi,
    },
    num_proc=20,
)

Map (num_proc=20):   0%|          | 0/207865 [00:00<?, ? examples/s]

In [12]:
# Index the row first: `raw_ds['text'][0]` asks for the whole 'text' column
# (207,865 rows) only to display its first element.
print(raw_ds[0]['text'])

[{'content': 'You are an expert tranlator who traslates given text in English to Devnagri Hindi', 'role': 'system'}, {'content': "Translate \nuser\nWhich famous landmarks should I visit in London, beyond the usual ones?\n\nassistant\nYou can visit Leadenhall Market - a beautiful indoor market with stunning Victorian architecture, also used as a filming location in the Harry Potter films.\n\nuser\nHmm, this is an interesting suggestion, but I've already seen this landmark in London. Is there something more offbeat that you can recommend? Something that locals might know about?\n\nassistant\nAbsolutely! Here is an offbeat and lesser-known place in London that locals might recommend: God's Own Junkyard - a neon wonderland filled with vintage and new neon signs.\nThere are many other hidden gems in London, and a quick Google search for ‘offbeat things in London’ will bring up many blogs and resources with more options.\n to Devnagri Hindi", 'role': 'user'}, {'content': '\nउपयोगकर्ता\nलंदन 

In [None]:
# First 1000 parallel sentence pairs of the 'gen' split, fetched with one row slice.
gen_head = translate_ds['gen'][0:1000]
raw_eng_ds = gen_head['sentence_eng_Latn']
raw_hind_ds = gen_head['sentence_hin_Deva']

In [None]:
# Token count of each sentence, English and Hindi, in matching order.
eng_toks = [utils.get_tokens_from_messages([sentence]) for sentence in raw_eng_ds]
hin_toks = [utils.get_tokens_from_messages([sentence]) for sentence in raw_hind_ds]

# Element-wise ratio: Hindi tokens per English token for each sentence pair.
mult = np.divide(hin_toks, eng_toks)

In [None]:
# Overlay both token-count distributions. Semi-transparent bars keep the one drawn
# first visible, and the title/axis labels let the figure stand on its own.
fig, ax = plt.subplots()
ax.hist(eng_toks, alpha=0.6, label='eng tokens')
ax.hist(hin_toks, alpha=0.6, label='hin tokens')
ax.set_title('Tokens per sentence: English vs Hindi')
ax.set_xlabel('tokens per sentence')
ax.set_ylabel('number of sentences')
ax.legend()
plt.show()

In [None]:
# Summary of the Hindi/English token ratio: mean, 95th percentile, median.
mult_mean = np.mean(mult)
mult_p95 = np.percentile(mult, 95)
mult_median = np.percentile(mult, 50)
mult_mean, mult_p95, mult_median

In [None]:
# Distribution of the per-sentence Hindi/English token ratio, labelled so the figure
# can be read without the surrounding cells.
fig, ax = plt.subplots()
ax.hist(mult)
ax.set_title('Hindi / English token ratio per sentence')
ax.set_xlabel('hindi tokens / english tokens')
ax.set_ylabel('number of sentences')
plt.show()

On average, the Hindi translation uses 4-5x as many tokens as the English source. So the total number of tokens in a single request is the sum of:
- prompt length (p)
- conversation length in English (x)
- expected response length in Hindi (5x)

p + x + 5x < 4096

In [None]:
def prepare_gpt_translate_prompts(ex_eng, ex_hi, src_msg: List, token_limit: int = 4069,
                                  hindi_token_ratio: int = 5):
    """
    Split a conversation into few-shot translation prompts that fit a token budget.

    Each yielded prompt is the few-shot template (instructions plus the example
    pair) followed by one or more stringified messages from ``src_msg``. A chunk is
    closed as soon as adding the next message would break the budget

        p + x + hindi_token_ratio * x <= token_limit

    where ``p`` is the template length and ``x`` is the total length of ALL
    conversation text in the chunk, since the model has to translate all of it.

    Args:
        ex_eng: English side of the few-shot example.
        ex_hi: Hindi side of the few-shot example.
        src_msg: Messages of one conversation; each is passed to ``utils.stringify``.
        token_limit: Budget for prompt plus expected response, in tokens.
        hindi_token_ratio: Expected Hindi response tokens per English input token.

    Yields:
        str: One prompt per chunk. Nothing is yielded for an empty ``src_msg``.
        A single message that is over budget on its own is still yielded as its own
        chunk, because it cannot be split here.
    """
    # NOTE(review): the default token_limit of 4069 looks like a transposition of 4096
    # (the value the notebook passes explicitly); left unchanged to keep the interface.

    translate_prompt = """This is a conversation between the User and the Assistant in English.
    
    {example_english}
    
    This is the translation of the above conversation in Devnagri Hindi
    
    {example_hindi}
    
    Similary translate the following conversation from English to Devnagri Hindi
    
    """.format(
        example_english=ex_eng,
        example_hindi=ex_hi
    )

    completion_prompt = translate_prompt
    p = utils.get_tokens_from_messages([completion_prompt])
    print(f'Prompt length: {p}')

    # True once the current chunk holds at least one message; avoids yielding the
    # bare template with nothing to translate.
    has_content = False

    for elem in src_msg:
        flat_elem = utils.stringify(elem)
        x = utils.get_tokens_from_messages([flat_elem])
        c = utils.get_tokens_from_messages([completion_prompt])

        print(f'Conversation length until now: {x+c}')

        # Conversation tokens the chunk would hold after adding this message:
        # what is already packed (c - p) plus the new message (x).
        chunk_conv_tokens = max(c - p, 0) + x
        needed = p + (1 + hindi_token_ratio) * chunk_conv_tokens

        if has_content and needed > token_limit:
            yield completion_prompt
            completion_prompt = translate_prompt
            has_content = False

        completion_prompt += flat_elem
        has_content = True

    if has_content:
        yield completion_prompt

In [None]:
# Build the (lazy) prompt generator for the third conversation of the SFT split.
# The row is indexed first: `ultrachat['train_sft']['messages'][2]` asks for the whole
# 'messages' column just to pick a single conversation.
msg_to_gpt = prepare_gpt_translate_prompts(
    ex_eng=example_english,
    ex_hi=example_hindi,
    src_msg=ultrachat['train_sft'][2]['messages'],
    token_limit=4096
)

In [None]:
# Exhaust the generator and keep every prompt it produces, in order.
all_msgs = list(msg_to_gpt)

In [None]:
# Token count of each generated prompt. Every other call in this notebook passes a
# one-element list of text, so the bare call was missing its argument.
# NOTE(review): assumed intent is to check the prompts against the token budget — confirm.
[utils.get_tokens_from_messages([msg]) for msg in all_msgs]