In [1]:
import pickle
import numpy as np
# from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from tqdm import tqdm
import json,os
# import openai

In [2]:
def load_pkl_data(path):
    with open(path, 'rb') as f:
        data = pickle.load(f)
    return data

def load_json_data(path):
    with open(path, 'r') as fcc_file:
        fcc_data = json.load(fcc_file)
    return fcc_data

def filter_duplicated_translation(dataset_all_cs):
    filtered_dataset_all_cs = []
    
    for item in tqdm(dataset_all_cs):
        if item['Translation-0'] != item['Translation-1']:
            filtered_dataset_all_cs.append(item)
            
    return filtered_dataset_all_cs

# Part-1 process the MixMT-2022 training set

## Part-1.1 load the data

In [3]:
data_train = load_pkl_data('../MixMT-2022/train_human_generated.pkl')

## Part-1.2 check if sample only has one golden translation

In [4]:
new=[] # datapoints has 2 or more golden translation
error=[] # datapoints has only one golden translation
for item in tqdm(data_train):
    tmp={}

    if len(data_train[item])<2:
        error.append(item)
    else:
        text= item.strip('\n')
        tmp['English'] = item
        tmp['Gold_Hinglish'] = data_train[item]
        new.append(tmp)

100%|███████████████████████████████████| 1800/1800 [00:00<00:00, 401262.14it/s]


In [5]:
print("The number of datapoints which has 2 or more golden translations: {}".format(len(new)))
print("The number of datapoints which has only 1 golden translations: {}".format(len(error)))

The number of datapoints which has 2 or more golden translations: 1778
The number of datapoints which has only 1 golden translations: 22


In [6]:
new[0]

{'English': 'Program module is a file that contains instructions which are either in the form of source code or machine language.\n',
 'Gold_Hinglish': ['Program module ek sanchika hoti hai, jisme ya to source code ya machine language ke rup me anudesh nihit hote hai.\n',
  'Program module ek sanchika hoti hai, that contains instructions which are either in the form of source code or machine language.\n',
  'Program module is a file jisme source code or machine language ke rup me instructions nihit hote he.\n',
  'Program module ak file hoti he that contains instructions which are either in source code or machine language. \n']}

## Part-1.3 transform into the new format

In [7]:
dataset_MixMT=[]
for item in tqdm(new):
    num_gold=len(item['Gold_Hinglish'])
    for i in range(num_gold):
        for j in range(i+1,num_gold):
            tmp={}
            tmp['English'] = item['English'].strip('\n')
            tmp['Translation-0'] = item['Gold_Hinglish'][i].strip('\n')
            tmp['Translation-1'] = item['Gold_Hinglish'][j].strip('\n')
            dataset_MixMT.append(tmp)

100%|███████████████████████████████████| 1778/1778 [00:00<00:00, 171321.94it/s]


In [8]:
# delete the data which has same Translation-0 and Translation-1
filtered_dataset_MixMT = filter_duplicated_translation(dataset_MixMT)

100%|███████████████████████████████████| 3874/3874 [00:00<00:00, 737071.16it/s]


In [9]:
print("The number of datapoints in new preference set (after transformation from MixMT-2022): {}".format(len(dataset_MixMT)))
print("The number of datapoints in new preference set (after filtering duplicated translation): {}".format(len(filtered_dataset_MixMT)))

The number of datapoints in new preference set (after transformation from MixMT-2022): 3874
The number of datapoints in new preference set (after filtering duplicated translation): 3873


In [10]:
filtered_dataset_MixMT[1]

{'English': 'Program module is a file that contains instructions which are either in the form of source code or machine language.',
 'Translation-0': 'Program module ek sanchika hoti hai, jisme ya to source code ya machine language ke rup me anudesh nihit hote hai.',
 'Translation-1': 'Program module is a file jisme source code or machine language ke rup me instructions nihit hote he.'}

In [23]:
filtered_dataset_MixMT

[{'English': 'Program module is a file that contains instructions which are either in the form of source code or machine language.',
  'Translation-0': 'Program module ek sanchika hoti hai, jisme ya to source code ya machine language ke rup me anudesh nihit hote hai.',
  'Translation-1': 'Program module ek sanchika hoti hai, that contains instructions which are either in the form of source code or machine language.'},
 {'English': 'Program module is a file that contains instructions which are either in the form of source code or machine language.',
  'Translation-0': 'Program module ek sanchika hoti hai, jisme ya to source code ya machine language ke rup me anudesh nihit hote hai.',
  'Translation-1': 'Program module is a file jisme source code or machine language ke rup me instructions nihit hote he.'},
 {'English': 'Program module is a file that contains instructions which are either in the form of source code or machine language.',
  'Translation-0': 'Program module ek sanchika hoti

# Part-2 process the ALL-CS dataset

## Part-2.1 load the data and sumarize the statistic info

In [11]:
fcc_data = load_json_data(r'../All-CS/All-CS-With-Synthetic.json')

In [12]:
error_2=[]
for item in fcc_data:
    if item['dataset'] == 'moviecs':
        num_gold = len(item['mturk'])
        if num_gold <2:
            error_2.append(item)

In [13]:
res2=[]
for item in fcc_data:
    if item['dataset'] == 'moviecs':
        if 'gold' not in item.keys():
            res2.append(item)

In [14]:
res3=[]
for item in fcc_data:
    if item['dataset'] == 'moviecs':
        res3.append(item)

In [15]:
print("The number of datapoints in all-cs dataset: {}".format(len(fcc_data)))
print("The number of datapoints belonging to the movie part: {}".format(len(res3)))
print("In the movie part, the number of datapoints which have 0 gold code-mixed sentence: {}".format(len(res2)))
print("In the movie part,the number of datapoints which don't have 2(or more) Mturk code_mixed sentence: {}".format(len(error_2)))
# For movie part, there are 9290 data points in total
# For movie part, 939 datapoints don't have gold code-mixed sentence
# For movie part, 4872 dataponts don't have 2(or more) Mturk code_mixed sentence

The number of datapoints in all-cs dataset: 14582
The number of datapoints belonging to the movie part: 9290
In the movie part, the number of datapoints which have 0 gold code-mixed sentence: 939
In the movie part,the number of datapoints which don't have 2(or more) Mturk code_mixed sentence: 4872


## Part-2.2 preprocess the dataset (and only select datapoints containing at least two Mturk translation)

In [16]:
procssed_fcc_data = []
for item in tqdm(fcc_data):
    
    if item['dataset'] == 'moviecs':
        tmp={}
        tmp['English'] = item['eng_google'].replace('/NE/','')
        tmp['Gold_Hinglish'] = []
        if 'gold' in item.keys():
            proc_sent = item['gold'].replace('/NE/','')
            tmp['Gold_Hinglish'].append(proc_sent)
        for sent in item['mturk']:
            proc_sent_2 = sent.replace('/NE/','')
            tmp['Gold_Hinglish'].append(proc_sent_2)
        procssed_fcc_data.append(tmp)
#         if len(tmp['Gold_Hinglish'])>=2:
#             new.append(tmp)

100%|█████████████████████████████████| 14582/14582 [00:00<00:00, 230815.80it/s]


In [17]:
filtered_and_procssed_fcc_data=[]
for item in procssed_fcc_data:
    if len(item['Gold_Hinglish'])>=2:
        filtered_and_procssed_fcc_data.append(item)

## Part-2.3 transform into the new format

In [18]:
dataset_all_cs=[]
for item in tqdm(filtered_and_procssed_fcc_data):
    num_gold=len(item['Gold_Hinglish'])
    for i in range(num_gold):
        for j in range(i+1,num_gold):
            tmp={}
            tmp['English'] = item['English'].strip('\n')
            tmp['Translation-0'] = item['Gold_Hinglish'][i].strip('\n')
            tmp['Translation-1'] = item['Gold_Hinglish'][j].strip('\n')
            dataset_all_cs.append(tmp)

100%|███████████████████████████████████| 6716/6716 [00:00<00:00, 234569.20it/s]


In [19]:
# delete the data which has same Translation-0 and Translation-1
filtered_dataset_all_cs = filter_duplicated_translation(dataset_all_cs)

100%|█████████████████████████████████| 14923/14923 [00:00<00:00, 816078.63it/s]


In [20]:
print("The number of datapoints in new preference set (after transformation from all_cs): {}".format(len(dataset_all_cs)))
print("The number of datapoints in new preference set (after filtering duplicated translation): {}".format(len(filtered_dataset_all_cs)))

The number of datapoints in new preference set (after transformation from all_cs): 14923
The number of datapoints in new preference set (after filtering duplicated translation): 11317


In [21]:
filtered_dataset_all_cs[1]

{'English': "and this look is beautiful isn't it?",
 'Translation-0': 'और ये देखो beautiful है ना',
 'Translation-1': 'और ये look beautiful है ना'}

In [22]:
filtered_dataset_all_cs

[{'English': 'say anything priyanka gandhi is cool thing',
  'Translation-0': 'कुछ भी बोलो priyanka gandhi है मस्त item',
  'Translation-1': 'say anything priyanka gandhi है मस्त चीज'},
 {'English': "and this look is beautiful isn't it?",
  'Translation-0': 'और ये देखो beautiful है ना',
  'Translation-1': 'और ये look beautiful है ना'},
 {'English': 'apun was very intoxicated',
  'Translation-0': 'अपुन ने बहुत drink किया हुआ था',
  'Translation-1': 'मैने बहुत नशा किया हुआ था'},
 {'English': 'apun was very intoxicated',
  'Translation-0': 'अपुन ने बहुत drink किया हुआ था',
  'Translation-1': 'मैने बहुत नशा किया हुआ था'},
 {'English': 'all boys are ready',
  'Translation-0': 'सब लड़के तैयारी में है ages',
  'Translation-1': 'सब boys तैयारी में है'},
 {'English': 'the plateau nearby',
  'Translation-0': 'उधर ही पास में field',
  'Translation-1': 'उधर ही close में rock'},
 {'English': "you ain't serious",
  'Translation-0': 'आप जरा भी serious नहीं है',
  'Translation-1': 'आप जरा भी not seriou

# Part-3 merge two part dataset

In [42]:
dataset= filtered_dataset_MixMT+filtered_dataset_all_cs

In [51]:
print("The number of datapoints in new preference set: {}".format(len(dataset)))

The number of datapoints in new preference set: 15190


In [44]:
# take notice of encoding problem of Hindi,ensure_ascii=False
# reference: https://stackoverflow.com/questions/55459768/how-to-write-data-to-a-file-in-hindi-language
with open('preference_dataset_2024.json', 'w') as f:
    json.dump(dataset, f,indent=4,ensure_ascii=False)