In [1]:
import json
import pickle
from tqdm import tqdm
from spacy.lang.hi import Hindi
from spacy.lang.en import English
from collections import Counter
import pickle

In [2]:

w2v_hi=pickle.load(open("./models/w2v_hi.bin","rb"))
w2v_en=pickle.load(open("./models/w2v_en.bin","rb"))

In [64]:
def generate_stem_words(word):
    suffixes = {
        1: [u"ो",u"े",u"ू",u"ु",u"ी",u"ि",u"ा"],
        2: [u"कर",u"ाओ",u"िए",u"ाई",u"ाए",u"ने",u"नी",u"ना",u"ते",u"ीं",u"ती",u"ता",u"ाँ",u"ां",u"ों",u"ें"],
        3: [u"ाकर",u"ाइए",u"ाईं",u"ाया",u"ेगी",u"ेगा",u"ोगी",u"ोगे",u"ाने",u"ाना",u"ाते",u"ाती",u"ाता",u"तीं",u"ाओं",u"ाएं",u"ुओं",u"ुएं",u"ुआं"],
        4: [u"ाएगी",u"ाएगा",u"ाओगी",u"ाओगे",u"एंगी",u"ेंगी",u"एंगे",u"ेंगे",u"ूंगी",u"ूंगा",u"ातीं",u"नाओं",u"नाएं",u"ताओं",u"ताएं",u"ियाँ",u"ियों",u"ियां"],
        5: [u"ाएंगी",u"ाएंगे",u"ाऊंगी",u"ाऊंगा",u"ाइयाँ",u"ाइयों",u"ाइयां"],
    }
    
    for L in suffixes:
        if len(word) > L + 1:
            for suf in suffixes[L]:
                #print type(suf),type(word),word,suf
                if word.endswith(suf):
                    #print 'h'
                    return word[:-L]
    return word

In [3]:
len(w2v_en)

2518768

In [4]:
len(w2v_hi)

157947

In [5]:
nlp_en = English()
nlp_hi = Hindi()

tok_hi = nlp_hi.tokenizer
tok_en = nlp_en.tokenizer

nlp_en.add_pipe('sentencizer')
nlp_hi.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7fd322260c80>

In [6]:
vocab_hi=pickle.load(open("./models/vocab_hi.bin","rb"))
vocab_en=pickle.load(open("./models/vocab_en.bin","rb"))

eng_stories=json.load(open("/Users/ppuser/kllama/tinystories/stories_unique_50.json"))
hin_stories=json.load(open("/Users/ppuser/kllama/hinllama/data/hindi_dataset/stories.json","r"))

print(f"""
English vocabulary size: {len(vocab_en)}
Hindi vocabulary size: {len(vocab_hi)}
""")


English vocabulary size: 51225
Hindi vocabulary size: 22934



In [7]:
len(eng_stories)

41346

In [22]:
# Find unique words in English corpus using spacy tokenizer
# from spacy.lang.en import English
# from collections import Counter

# nlp_en = English()
# tok_en = nlp_en.tokenizer
# vocab_en=Counter()

# for k in tqdm(eng_stories):
#     story=eng_stories[k]
#     doc=tok_en(story)
#     vocab=[x.text for x in doc]
#     vocab_en.update(vocab)

In [8]:
class Kun:
    def __init__(self):
        self.maps={}
        self.docs={}
    
    def update(self,k,v):
        if k not in self.maps:
            self.maps[k]=Counter()
            self.docs[k]=0
        self.maps[k].update(set(v))
        self.docs[k]= self.docs[k]+1

In [98]:
mapping_dict_en=Kun()
mapping_dict_hi=Kun()
consistent_ids=[]
eng_tokens_idf=Counter()
hi_tokens_idf=Counter()
for idx in tqdm(hin_stories):
    hi_story=hin_stories[idx]
    en_story=eng_stories[idx]
    
    en_sentences=[x.text for x in list(nlp_en(en_story).sents)]
    hi_sentences=[x.text for x in list(nlp_hi(hi_story).sents)]

    if len(en_sentences)==len(hi_sentences):
        consistent_ids.append(idx)
        for e,h in zip(en_sentences,hi_sentences):
            etokens=[x.text for x in tok_en(e)]
            htokens=[x.text for x in tok_hi(h)]
            for t_ in htokens:
                mapping_dict_hi.update(t_,etokens)
            for t_ in etokens:
                mapping_dict_en.update(t_,htokens)
            eng_tokens_idf.update(list(set(etokens)))
            hi_tokens_idf.update(list(set(htokens)))

  0%|          | 0/19999 [00:00<?, ?it/s]

100%|██████████| 19999/19999 [03:16<00:00, 101.72it/s]


In [102]:
len(eng_tokens_idf)

26970

In [100]:
len(hi_tokens_idf)

22075

In [95]:
truncated_dict_en={}
for k in tqdm(mapping_dict_en.maps):
    truncated_dict_en[k]=mapping_dict_en.maps[k].most_common(15)
    # truncated_dict[k]=[(x[0],x[1]*1./eng_tokens_idf[x[0]]) for x in truncated_dict[k]]
    # truncated_dict[k]=sorted(truncated_dict[k],key=lambda x: -x[1])[:15]

truncated_dict_hi={}
for k in tqdm(mapping_dict_hi.maps):
    truncated_dict_hi[k]=mapping_dict_hi.maps[k].most_common(15)

  0%|          | 31/26970 [00:00<01:26, 309.70it/s]

100%|██████████| 26970/26970 [00:05<00:00, 5047.18it/s] 
100%|██████████| 22075/22075 [00:01<00:00, 11534.85it/s]


In [97]:
truncated_dict_en

{'One': [('एक', 15141),
  ('।', 14477),
  ('दिन', 14317),
  (',', 12446),
  ('में', 5240),
  ('के', 4619),
  ('और', 3709),
  ('ने', 3628),
  ('की', 2915),
  ('का', 2755),
  ('पर', 2637),
  ('को', 2561),
  ('माँ', 2466),
  ('उसने', 2257),
  ('से', 2225)],
 'day': [('दिन', 26070),
  ('।', 24928),
  (',', 18490),
  ('एक', 18101),
  ('और', 9998),
  ('के', 8698),
  ('में', 8474),
  ('से', 6199),
  ('ने', 5380),
  ('को', 5053),
  ('वह', 4976),
  ('था', 4879),
  ('की', 4633),
  ('का', 4256),
  ('उस', 4069)],
 ',': [(',', 161515),
  ('।', 146043),
  ('"', 70274),
  ('और', 67325),
  ('एक', 57931),
  ('ने', 49665),
  ('है', 47144),
  ('कहा', 39536),
  ('के', 35488),
  ('की', 34943),
  ('वह', 30007),
  ('में', 29993),
  ('से', 29812),
  ('था', 28491),
  ('को', 26501)],
 'a': [('।', 99165),
  ('एक', 86431),
  (',', 59066),
  ('है', 40690),
  ('की', 38257),
  ('और', 36045),
  ('था', 30765),
  ('बात', 25661),
  ('थी', 24090),
  ('बार', 23928),
  ('के', 22467),
  ('में', 20926),
  ('नाम', 20881),
  (

In [96]:
truncated_dict_hi

{'एक': [('.', 105607),
  ('a', 92373),
  (',', 70356),
  ('was', 41669),
  ('\n', 35343),
  ('and', 35079),
  ('the', 31731),
  ('there', 28024),
  ('Once', 27548),
  ('day', 27259),
  ('to', 26263),
  ('time', 26059),
  ('upon', 25289),
  ('One', 24251),
  ('named', 20154)],
 'दिन': [('day', 26114),
  ('.', 25459),
  (',', 21151),
  ('One', 14357),
  ('the', 13232),
  ('a', 11731),
  ('to', 10778),
  ('and', 10086),
  ('\n', 9259),
  ('that', 5155),
  ('was', 5117),
  ('her', 4894),
  ('on', 4883),
  ('in', 4513),
  ('she', 3976)],
 ',': [(',', 157273),
  ('.', 130806),
  ('"', 67200),
  ('\n', 61883),
  ('and', 61293),
  ('the', 54840),
  ('a', 50231),
  ('to', 45014),
  ('said', 42559),
  ('was', 35941),
  ('!', 22566),
  ('day', 21774),
  ('it', 20326),
  ('Lily', 19519),
  ('you', 18070)],
 'नोसी': [('.', 8),
  ('Nosy', 7),
  (',', 5),
  ('bird', 4),
  ('a', 3),
  ('cat', 3),
  ('\n', 3),
  ('the', 3),
  ('was', 2),
  ('nosy', 2),
  ('big', 2),
  ('she', 2),
  ('saw', 2),
  ('to',

In [78]:
eng_words_idf=Counter()
for k in truncated_dict:
    eng_words_idf.update([x[0] for x in truncated_dict[k]])

In [79]:
len(eng_words_idf)

15375

In [80]:
len(truncated_dict)

22075

In [88]:
hin_stem_words=Counter()
for k in truncated_dict:
    k_=generate_stem_words(k)
    hin_stem_words.update([k_])

In [89]:
len(hin_stem_words)

15422

In [90]:
hin_stem_words

Counter({'एक': 1,
         'दिन': 2,
         ',': 1,
         'नोस': 1,
         'नाम': 2,
         'की': 1,
         'छोट': 4,
         'बिल्ल': 3,
         'टहलन': 2,
         'निकल': 8,
         '।': 1,
         'बड़': 4,
         'चौड़': 4,
         'आँख': 3,
         'वाल': 4,
         'नासमझ': 2,
         'थी': 1,
         'उस': 3,
         'यह': 5,
         'देखन': 3,
         'पसंद': 1,
         'था': 1,
         'कि': 2,
         'हर': 5,
         'कोई': 1,
         'क्य': 2,
         'कर': 10,
         'रह': 9,
         'है': 1,
         'जैस': 4,
         'ही': 1,
         'चल': 14,
         'उसन': 1,
         'पेड़': 2,
         'देख': 9,
         'पर': 3,
         'पक्ष': 3,
         'रहत': 4,
         'बहुत': 2,
         'सुंदर': 1,
         '.': 1,
         'को': 1,
         'नमस्त': 1,
         'कहन': 3,
         'चाहत': 5,
         'वह': 5,
         'के': 2,
         'करीब': 2,
         'गय': 4,
         'और': 2,
         'ऊपर': 2,
         'ने': 1,
         'डर': 10,

In [49]:
most_common_words=Counter()
for k in truncated_dict:
    most_common_words.update([x[0] for x in truncated_dict[k]])
mcw=[x[0] for x in most_common_words.most_common(20)]+["there"]

filtered_dict={}
for k in truncated_dict:
    filtered_dict[k]=[x for x in truncated_dict[k] if x[0] not in mcw]


final_mapping={}
for k in truncated_dict:
    final_mapping[k]=filtered_dict[k][0][0] if filtered_dict[k] else ""

In [68]:
truncated_dict['कि']

[('.', 46355),
 ('and', 21251),
 ('to', 20905),
 ('was', 19854),
 ('the', 19677),
 (',', 18181),
 ('that', 17497),
 ('it', 12830),
 ('he', 10775),
 ('\n', 10672),
 ('a', 9867),
 ('she', 8762),
 ('She', 7739),
 ('her', 7563),
 ('He', 7344)]

In [66]:
final_mapping


{'एक': 'Once',
 'दिन': 'day',
 ',': 'day',
 'नोसी': 'Nosy',
 'नाम': 'named',
 'की': 'time',
 'छोटी': 'little',
 'बिल्ली': 'cat',
 'टहलने': 'walk',
 'निकली': 'out',
 '।': '',
 'बड़ी': 'big',
 'चौड़ी': 'wide',
 'आँखों': 'eyes',
 'वाली': 'with',
 'नासमझ': 'nosy',
 'थी': 'she',
 'उसे': 'he',
 'यह': 'It',
 'देखना': 'see',
 'पसंद': 'loved',
 'था': 'he',
 'कि': 'he',
 'हर': 'day',
 'कोई': 'no',
 'क्या': '?',
 'कर': 'could',
 'रहा': 'he',
 'है': 'is',
 'जैसे': 'As',
 'ही': 'soon',
 'चली': 'went',
 'उसने': 'she',
 'बड़ा': 'big',
 'पेड़': 'tree',
 'देखा': 'saw',
 'पर': 'on',
 'पक्षी': 'bird',
 'रहता': 'lived',
 'बहुत': 'very',
 'सुंदर': 'beautiful',
 '.': 'not',
 'को': 'his',
 'नमस्ते': 'hello',
 'कहना': 'say',
 'चाहती': 'wanted',
 'वह': 'he',
 'के': 'with',
 'करीब': 'closer',
 'गयी': 'went',
 'और': '',
 'ऊपर': 'up',
 'ने': 'Lily',
 'डर': 'scared',
 'गया': 'he',
 'डराना': 'scare',
 'नहीं': "n't",
 'तो': 'When',
 'बिल्कुल': 'just',
 'शांत': 'quiet',
 'खड़ी': 'standing',
 'रही': 'she',
 'फुसफुसाई'

In [50]:
len(final_mapping)

22075

In [51]:
filtered_dict

{'एक': [('Once', 27548),
  ('day', 27259),
  ('time', 26059),
  ('upon', 25289),
  ('One', 24251),
  ('named', 20154)],
 'दिन': [('day', 26114), ('One', 14357), ('on', 4883), ('she', 3976)],
 ',': [('day', 21774), ('Lily', 19519), ('you', 18070)],
 'नोसी': [('Nosy', 7),
  ('bird', 4),
  ('cat', 3),
  ('nosy', 2),
  ('big', 2),
  ('she', 2),
  ('saw', 2)],
 'नाम': [('named', 10556),
  ('Once', 9192),
  ('time', 8656),
  ('upon', 8646),
  ('little', 7184),
  ('girl', 5307),
  ('Lily', 3497),
  ('boy', 2407)],
 'की': [('time', 18299),
  ('Once', 18005),
  ('upon', 17651),
  ('named', 13890),
  ('little', 13117),
  ('Lily', 11649)],
 'छोटी': [('little', 9399),
  ('girl', 7900),
  ('Once', 4969),
  ('time', 4737),
  ('upon', 4683),
  ('named', 4574),
  ('Lily', 3280)],
 'बिल्ली': [('cat', 2209), ('named', 325), ('kitten', 266), ('day', 249)],
 'टहलने': [('walk', 475),
  ('for', 442),
  ('day', 344),
  ('One', 306),
  ('went', 291),
  ('park', 126),
  ('go', 125)],
 'निकली': [('out', 176),
 

In [54]:
len(set([x[0] for x in final_mapping.items()]))

22075

In [56]:
mapc=Counter()
mapc.update([x[1] for x in final_mapping.items()])

In [63]:
count=0
for x in mapc.keys():
    if mapc[x]==2:
        print(x)


tree
standing
plan
now
corner
puppy
food
hugged
mule
idea
Bella
bug
invited
adventure
delicate
someone
matter
kicked
surrounded
place
shouts
since
himself
magical
directions
rainbow
improve
asks
nodded
pedal
practice
suggested
mistakes
confident
responsible
respectful
promised
Squeaky
Blue
Hootie
speak
sick
days
situations
grandma
ready
flour
brilliant
accidentally
dropped
middle
argue
apologized
candles
backyard
agreement
clothes
snack
climbed
twist
truck
our
Sara
sandwich
cookies
chew
flies
polite
creature
There
ma'am
plate
smart
brother
still
favorite
Halloween
ears
contest
pie
independent
these
oranges
lunch
castle
minutes
boring
reached
guilty
forgive
barber
orange
east
compass
Benny
chipmunk
bull
butterfly
scolds
careless
pretend
drawers
ruler
joke
excitement
chose
offer
real
takes
oven
curious
finger
blood
sink
band
dangerous
warned
smelly
drops
finally
forest
pond
wine
hope
soldier
exclaimed
Stop
twice
quiz
stage
river
rocks
awe
view
journal
parts
stunned
lid
running
scolded
a

In [62]:
count

5536

In [9]:
class Kun:
    def __init__(self):
        self.maps={}
    
    def update(self,k,v):
        if k not in self.maps:
            self.maps[k]=Counter()
        self.maps[k].update(v)

def nearest_neighbors(values, all_values, nbr_neighbors=10):
    from sklearn.neighbors import NearestNeighbors
    nn = NearestNeighbors(n_neighbors=nbr_neighbors, 
                          metric='cosine', 
                          algorithm='brute').fit(all_values)
    dists, idxs = nn.kneighbors(values, n_neighbors=nbr_neighbors)
    return zip(*dists, *idxs)

In [10]:
from map_hi_to_en import map_hindi_to_english

  from .autonotebook import tqdm as notebook_tqdm


{
    "एक": "One",
    "दिन": "day",
    "नोसी": "Nosy",
    "नाम": "named",
    "छोटी": "little",
    "बिल्ली": "cat",
    "टहलने": "Nosy",
    "निकली": "Nosy"
}


In [11]:
mapping_dict=Kun()
consistent_ids=[]
for idx in tqdm(hin_stories):
    hi_story=hin_stories[idx]
    en_story=eng_stories[idx]
    
    en_sentences=[x.text for x in list(nlp_en(en_story).sents)]
    hi_sentences=[x.text for x in list(nlp_hi(hi_story).sents)]

    if len(en_sentences)==len(hi_sentences):
        consistent_ids.append(idx)
        for e,h in zip(en_sentences,hi_sentences):
            etokens=[x.text for x in tok_en(e)]
            htokens=[x.text for x in tok_hi(h)]
            print(etokens)
            print(htokens)
            mapping = map_hindi_to_english(htokens, etokens)
            print(mapping)
            # for t_ in htokens:
            #     mapping_dict.update(t_,etokens)
        
    break

  0%|          | 0/19999 [00:00<?, ?it/s]

['One', 'day', ',', 'a', 'little', 'cat', 'named', 'Nosy', 'went', 'for', 'a', 'walk', '.']
['एक', 'दिन', ',', 'नोसी', 'नाम', 'की', 'एक', 'छोटी', 'बिल्ली', 'टहलने', 'निकली', '।']
{'एक': 'One', 'दिन': 'day', ',': ',', 'नोसी': 'Nosy', 'नाम': 'named', 'की': ',', 'छोटी': 'little', 'बिल्ली': 'cat', 'टहलने': ',', 'निकली': '.', '।': '.'}
['Nosy', 'was', 'a', 'nosy', 'cat', 'with', 'big', ',', 'wide', 'eyes', '.']
['नोसी', 'बड़ी', ',', 'चौड़ी', 'आँखों', 'वाली', 'एक', 'नासमझ', 'बिल्ली', 'थी', '।']
{'नोसी': 'Nosy', 'बड़ी': 'big', ',': ',', 'चौड़ी': 'wide', 'आँखों': 'eyes', 'वाली': ',', 'एक': 'a', 'नासमझ': '.', 'बिल्ली': 'cat', 'थी': ',', '।': '.'}
['She', 'liked', 'to', 'see', 'what', 'everyone', 'was', 'doing', '.']
['उसे', 'यह', 'देखना', 'पसंद', 'था', 'कि', 'हर', 'कोई', 'क्या', 'कर', 'रहा', 'है', '।']
{'उसे': '.', 'यह': 'to', 'देखना': 'see', 'पसंद': '.', 'था': '.', 'कि': '.', 'हर': '.', 'कोई': '.', 'क्या': 'what', 'कर': '.', 'रहा': '.', 'है': '.', '।': '.'}
['\n', 'As', 'Nosy', 'walked', ',', 

  0%|          | 0/19999 [00:04<?, ?it/s]

{'वे': 'They', 'दोस्त': '.', 'बन': '.', 'गए': '.', 'और': 'and', 'पूरे': '.', 'दिन': 'day', 'खेलते': '.', 'रहे': '.', '।': '.'}





In [16]:
def generate_stem_words(word):
    suffixes = {
        1: [u"ो",u"े",u"ू",u"ु",u"ी",u"ि",u"ा"],
        2: [u"कर",u"ाओ",u"िए",u"ाई",u"ाए",u"ने",u"नी",u"ना",u"ते",u"ीं",u"ती",u"ता",u"ाँ",u"ां",u"ों",u"ें"],
        3: [u"ाकर",u"ाइए",u"ाईं",u"ाया",u"ेगी",u"ेगा",u"ोगी",u"ोगे",u"ाने",u"ाना",u"ाते",u"ाती",u"ाता",u"तीं",u"ाओं",u"ाएं",u"ुओं",u"ुएं",u"ुआं"],
        4: [u"ाएगी",u"ाएगा",u"ाओगी",u"ाओगे",u"एंगी",u"ेंगी",u"एंगे",u"ेंगे",u"ूंगी",u"ूंगा",u"ातीं",u"नाओं",u"नाएं",u"ताओं",u"ताएं",u"ियाँ",u"ियों",u"ियां"],
        5: [u"ाएंगी",u"ाएंगे",u"ाऊंगी",u"ाऊंगा",u"ाइयाँ",u"ाइयों",u"ाइयां"],
    }
    
    for L in suffixes:
        if len(word) > L + 1:
            for suf in suffixes[L]:
                #print type(suf),type(word),word,suf
                if word.endswith(suf):
                    #print 'h'
                    return word[:-L]
    return word

In [108]:
vocab_en.most_common(100)

[('.', 70526628),
 (',', 41832652),
 ('and', 33148110),
 ('the', 33124873),
 ('to', 27632353),
 ('a', 26766696),
 ('"', 19851176),
 ('was', 19674419),
 ('\n', 15305411),
 ('her', 10118830),
 ('it', 9717552),
 ('Lily', 9577758),
 ('The', 8398327),
 ('day', 7849312),
 ('said', 7463332),
 ('with', 7412651),
 ('She', 7311004),
 ('They', 6971119),
 ('his', 6700814),
 ('He', 6669465),
 ('in', 6507626),
 ('that', 6213623),
 ('!', 6098233),
 ('he', 6076177),
 ('she', 5655245),
 ("'s", 5552659),
 ('they', 5337118),
 ('on', 5099382),
 ('big', 4985665),
 ('little', 4904007),
 ('time', 4800325),
 ('had', 4741506),
 ('there', 4590471),
 ('One', 4548474),
 ('of', 4452867),
 ('Tim', 4385316),
 ('you', 4370274),
 ('mom', 4348641),
 ('play', 4344985),
 ('I', 4330125),
 ('happy', 4298420),
 ('saw', 4276044),
 ('named', 4221400),
 ('for', 4028325),
 ('very', 3964061),
 ('Timmy', 3955630),
 ('Once', 3944082),
 ('upon', 3854053),
 ('so', 3698552),
 ('but', 3593136),
 ("n't", 3560840),
 ('not', 3242028),
 (

In [107]:
vocab_hi.most_common(100)

[('।', 310375),
 ('और', 171339),
 (',', 170241),
 ('"', 117187),
 ('एक', 116990),
 ('है', 96185),
 ('था', 81564),
 ('के', 75764),
 ('वह', 74544),
 ('की', 67993),
 ('ने', 66057),
 ('को', 63352),
 ('में', 58598),
 ('से', 56461),
 ('कि', 51520),
 ('थी', 49150),
 ('उसने', 46814),
 ('बहुत', 42284),
 ('हैं', 42250),
 ('नहीं', 39895),
 ('उसे', 38879),
 ('कहा', 37616),
 ('.', 36154),
 ('का', 35845),
 ('वे', 35252),
 ('!', 34577),
 ('यह', 34001),
 ('लिली', 31893),
 ('माँ', 31220),
 ('पर', 30795),
 ('लिए', 29599),
 ('अपने', 27511),
 ('दिन', 27223),
 ('लेकिन', 25920),
 ('थे', 23163),
 ('हो', 22406),
 ('साथ', 22048),
 ('गया', 21713),
 ('उन्होंने', 20940),
 ('अपनी', 20904),
 ('कर', 20564),
 ('देखा', 19492),
 ('-', 19291),
 ('पसंद', 18397),
 ('दिया', 18356),
 ('उन्हें', 17698),
 ('क्या', 17662),
 ('उसकी', 17222),
 ('बात', 17140),
 ('कुछ', 17060),
 ('भी', 15902),
 ('किया', 15750),
 ('रहा', 14803),
 ('?', 14765),
 ('उसके', 14757),
 ('बार', 14570),
 ('हुआ', 14249),
 ('नाम', 13750),
 ('मैं', 13606),
 ('

In [2]:
import os
import openai
OAI_KEY="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"
openai.api_key = os.getenv(OAI_KEY)

In [3]:
import os
os.environ["OPENAI_API_KEY"]=OAI_KEY


In [6]:
openai.Model.list(api_key=OAI_KEY)

<OpenAIObject list at 0x7fe0659866d0> JSON: {
  "object": "list",
  "data": [
    {
      "id": "text-search-babbage-doc-001",
      "object": "model",
      "created": 1651172509,
      "owned_by": "openai-dev",
      "permission": [
        {
          "id": "modelperm-s9n5HnzbtVn7kNc5TIZWiCFS",
          "object": "model_permission",
          "created": 1695933794,
          "allow_create_engine": false,
          "allow_sampling": true,
          "allow_logprobs": true,
          "allow_search_indices": true,
          "allow_view": true,
          "allow_fine_tuning": false,
          "organization": "*",
          "group": null,
          "is_blocking": false
        }
      ],
      "root": "text-search-babbage-doc-001",
      "parent": null
    },
    {
      "id": "curie-search-query",
      "object": "model",
      "created": 1651172509,
      "owned_by": "openai-dev",
      "permission": [
        {
          "id": "modelperm-8aqdyZaKtD3MD831mGbqh1MD",
          "object": "

In [109]:
count=0
for h in vocab_hi:
    if generate_stem_words(h)==h:
        count+=1

In [27]:
count

10226

In [15]:
import tiktoken

In [16]:
# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [75]:
PROMPT="""
Map the Hindi words to the corresponding English words (having the same meaning or attributes) from the given lists of words for English and Hindi:
English List: ['One', 'day', ',', 'a', 'little', 'cat', 'named', 'Nosy', 'went', 'for', 'a', 'walk', '.']
Hindi List: ['एक', 'दिन', ',', 'नोसी', 'नाम', 'की', 'एक', 'छोटी', 'बिल्ली', 'टहलने', 'निकली', '।']
Output should be in the form of a json where the key is the Hindi word and the value is the corresponding English word. 
Wherever unsure return null value
Give only a valid json as output
"""

PROMPT="""
Pick a  synonym  for   "," in English from the following list only:
['One', 'day', ',', 'a', 'little', 'cat', 'named', 'Nosy', 'went', 'for', 'a', 'walk', '.']
"""

PROMPT="""Pick a  synonym  for the Hindi word 'बिल्ली' in English from choices below:
1. 'One'
2. 'day' 
3. ','
4. 'a'
5. 'little'
6. 'tiger'
7. 'named'
8.'Nosy'
9. 'went'
10. 'for'
11.'a'
12. 'walk' 
"""

In [83]:
import openai
openai.api_key = OAI_KEY

context = "You are chatting with a customer service representative."
message = "Hi, I have a problem with my account."
response = openai.ChatCompletion.create(
  engine="gpt-3.5-turbo",
  prompt=f"Chat:\n{context}\nUser: {message}\n",
  max_tokens=50
)

reply = response.choices[0].text.strip()
print(reply)

InvalidRequestError: Invalid URL (POST /v1/engines/gpt-3.5-turbo/chat/completions)

In [86]:
messages=[
        {"role": "user", "content": PROMPT}
]

In [87]:
import openai
completion = openai.ChatCompletion.create(model="gpt-3.5-turbo-0613", messages=messages,api_key=OAI_KEY,max_tokens=10)

In [90]:
completion['usage']

<OpenAIObject at 0x7fe06b1b3770> JSON: {
  "prompt_tokens": 92,
  "completion_tokens": 6,
  "total_tokens": 98
}

In [95]:
completion['choices'][0]['message']['content']

"6. 'tiger'"

In [17]:
len(enc.encode(PROMPT))

54

In [14]:
from openai.embeddings_utils import get_embedding

In [18]:
import openai
completion = openai.Embedding.create(model="text-embedding-ada-002", input="Once",api_key=OAI_KEY)

In [39]:
completion['usage']['total_tokens']

1

In [65]:
from sentence_transformers import  util
def find_mapping(hindi_list,hindi_embeddings,english_list,english_embeddings):
    mapping = {}
    for h_word, h_embedding in zip(hindi_list, hindi_embeddings):
            max_similarity = float('-inf')
            mapped_word = None
            for e_word, e_embedding in zip(english_list, english_embeddings):
                similarity = util.pytorch_cos_sim(h_embedding, e_embedding).item()
                if similarity > max_similarity:
                    max_similarity = similarity
                    mapped_word = e_word
            mapping[h_word] = mapped_word

    return mapping

In [66]:
emb_hi=pickle.load(open("./models/emb_hi.bin","rb"))
emb_en=pickle.load(open("./models/emb_en.bin","rb"))

In [67]:
import torch
hindi_list=[]
hindi_embeddings=[]
for x in emb_hi:
    hindi_list.append(x)
    hindi_embeddings.append(torch.Tensor(emb_hi[x]['data'][0]['embedding']))

english_list=[]
english_embeddings=[]

for x in emb_en:
    english_list.append(x)
    english_embeddings.append(torch.Tensor(emb_en[x]['data'][0]['embedding']))
find_mapping(hindi_list,hindi_embeddings,english_list,english_embeddings)

{'एक': 'Once',
 'दिन': 'time',
 ',': ',',
 'नोसी': '.',
 'नाम': 'named',
 'की': 'upon',
 'छोटी': 'little',
 'बिल्ली': 'girl',
 'टहलने': '.',
 'निकली': 'named',
 '।': '.',
 'बड़ी': 'little',
 'चौड़ी': 'upon',
 'आँखों': 'upon',
 'वाली': 'She',
 'नासमझ': 'named',
 'थी': 'She',
 'उसे': 'She',
 'यह': 'a',
 'देखना': 'explore',
 'पसंद': 'upon'}

In [58]:
emb_en["Once"]['data'][0]['embedding']

[0.005845752544701099,
 -0.013331946916878223,
 0.0102393114939332,
 -0.004645675886422396,
 -0.0013530278811231256,
 0.0012395752128213644,
 -0.01992732658982277,
 -0.012652911245822906,
 0.017090171575546265,
 0.007005490828305483,
 0.03041539341211319,
 0.006511341780424118,
 0.03434169664978981,
 -0.0068407743237912655,
 0.004410366527736187,
 0.006551680155098438,
 0.044372592121362686,
 0.018219655379652977,
 0.009721631184220314,
 -0.005092763341963291,
 -0.029312802478671074,
 0.028990093618631363,
 -0.02222663350403309,
 -0.018219655379652977,
 -0.01704983226954937,
 -0.0038389014080166817,
 -0.004578444641083479,
 -0.020371053367853165,
 -0.005203694570809603,
 -0.006585295777767897,
 0.021608106791973114,
 -0.0409572459757328,
 -0.0016253142384812236,
 2.936112468887586e-05,
 -0.03420723229646683,
 -0.01606825739145279,
 -0.0045952522195875645,
 -0.00975524727255106,
 -0.01762801967561245,
 -0.015301820822060108,
 0.019160890951752663,
 0.004339773673564196,
 0.0054087499156