# Data Processing for Choi et al. (2022)'s Soft Constraint Method

We first filter the clean glossary - which already contains only one-to-one translations (saving us one preprocessing step) - to remove terms that occur at high frequency in a general-domain corpus. We use our Lexique data for this. After that, we sample 15% of our training data and save the other 85% to Hugging Face. The sampled data will then be tokenised and word-aligned with spaCy and SimAlign, but that step is run on the HPC.

In [2]:
from datasets import load_dataset, Dataset
import pandas as pd

#Converts data in src [TAB] tgt [NEWLINE] format to a format suitable for model training
def convertToDictFormat(data):
    """Split tab-separated sentence pairs into a two-column Dataset.

    Each element of `data` is stripped of surrounding whitespace and split on
    tabs. The first field goes into the "en" column and the second into the
    "fr" column; any further fields are ignored. An element without a tab
    raises IndexError.
    """
    split_lines = [line.strip().split("\t") for line in data]
    english = [fields[0] for fields in split_lines]
    french = [fields[1] for fields in split_lines]
    return Dataset.from_dict({"en": english, "fr": french})

In [20]:
#Load in clean glossary and convert to Dataset object
#Each entry of the "text" column is expected to be one "en [TAB] fr" line, which convertToDictFormat splits into "en" and "fr" columns
term_candidates = load_dataset("ethansimrm/MeSpEn_enfr_cleaned_glossary", split = "train")
terms_ready = convertToDictFormat(term_candidates['text'])

Found cached dataset text (C:/Users/ethan/.cache/huggingface/datasets/ethansimrm___text/ethansimrm--MeSpEn_enfr_cleaned_glossary-e1fb9a6b67efd03c/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


In [5]:
#Load the Lexique table used as our general-domain reference; the frequency lookup reads its "Word", "lemme" and "freqfilms2" columns
most_common_fr = pd.read_excel("Lexique_FR_PoS.xlsx")

In [26]:
#Check the frequency of each word in Lexique (this is by no means perfect; Choi et al. had a massive OOD corpus to check against). 
def word_frequency(row, lexicon=None):
    """Record the general-domain frequency of a glossary term's French side.

    Parameters:
        row: mapping with an "fr" string entry; it is modified in place.
        lexicon: DataFrame with "Word", "lemme" and "freqfilms2" columns.
            Defaults to the notebook-level `most_common_fr` table, so
            `dataset.map(word_frequency)` keeps working unchanged.

    Returns:
        The same `row`. If the lowercased term matches a lexicon entry,
        row["general_freq"] is set to the highest "freqfilms2" among the
        matches; otherwise the row is returned untouched, so any placeholder
        the caller stored in "general_freq" survives.
    """
    if lexicon is None:
        lexicon = most_common_fr #Fall back to the notebook-level Lexique table
    query = row["fr"].lower() #All our Lexique words are in lowercase
    matches = lexicon[lexicon["Word"] == query] #Prefer a surface-form match...
    if matches.empty:
        matches = lexicon[lexicon["lemme"] == query] #...and only then try a lemma match
    if matches.empty:
        return row
    #If we have more than one match, assume the most frequent form (due to plurals, etc.) - we are looking for the prevalence of this concept in the general domain.
    #max() over a single match is just that match's value, so no separate one-match case is needed.
    row["general_freq"] = matches["freqfilms2"].max()
    return row

In [34]:
#Create a new placeholder column; -1.0 marks terms for which no Lexique match is found.
#A float placeholder keeps the column the same type as the float frequencies that word_frequency writes into it.
general_freq = [-1.0] * len(terms_ready)
terms_ready_new = terms_ready.add_column("general_freq", general_freq)
terms_ready_with_freq = terms_ready_new.map(word_frequency)

Map:   0%|          | 0/5084 [00:00<?, ? examples/s]

In [54]:
#There are 39 terms which occur with frequency > 20 per 1,000,000 words of our French film subtitles corpus. Choi et al. did not specify what "high frequency" meant, so I use 
#20 per 1,000,000 here as a first pass. All terms here seem relatively general in nature; going down to 10 per 1,000,000 incorporates translations of more medical terms like "bunion", 
#which is "oignon" (also the translation of "onion") in French. The implicit assumption that general translation is equivalent to the medical translation may not hold in that case - 
#the baseline model might only emit an "oignon" if it is provided with "onion", rather than "bunion".
total = 0
for term in terms_ready_with_freq:
    if not (term["general_freq"] > 20):
        continue #Not above the threshold, so not counted as high-frequency
    total += 1
    print(term)
print(total)

{'en': 'breast', 'fr': 'sein', 'general_freq': 27.97}
{'en': 'Bruise', 'fr': 'Bleu', 'general_freq': 21.63}
{'en': 'buttock', 'fr': 'fesse', 'general_freq': 34.52}
{'en': 'child', 'fr': 'enfant', 'general_freq': 448.33}
{'en': 'color', 'fr': 'couleur', 'general_freq': 24.79}
{'en': 'consideration', 'fr': 'fait', 'general_freq': 27.36}
{'en': 'data', 'fr': 'données', 'general_freq': 20.05}
{'en': 'death', 'fr': 'mort', 'general_freq': 78.09}
{'en': 'drugs', 'fr': 'médicaments', 'general_freq': 29.8}
{'en': 'ear', 'fr': 'oreille', 'general_freq': 39.08}
{'en': 'Error', 'fr': 'Erreur', 'general_freq': 22.82}
{'en': 'Family', 'fr': 'Famille', 'general_freq': 27.16}
{'en': 'fasting', 'fr': 'jeune', 'general_freq': 62.12}
{'en': 'film', 'fr': 'film', 'general_freq': 57.56}
{'en': 'finger', 'fr': 'doigt', 'general_freq': 45.86}
{'en': 'Finger', 'fr': 'Doigt', 'general_freq': 45.86}
{'en': 'hair', 'fr': 'cheveu', 'general_freq': 116.16}
{'en': 'Hazard', 'fr': 'Risque', 'general_freq': 23.36}
{

In [55]:
#Keep only terms at or below 20 occurrences per 1,000,000 words; terms without a Lexique match still carry the negative placeholder and are therefore kept
terms_ready_filtered = terms_ready_with_freq.filter(lambda x: x["general_freq"] <= 20)

Filter:   0%|          | 0/5084 [00:00<?, ? examples/s]

In [56]:
len(terms_ready_filtered) #Initially, we had 5084 terms; after removing the 39 high-frequency ones, we now have 5045.

5045

In [66]:
#Write the filtered glossary back out in en [TAB] fr [NEWLINE] format.
#The with-block closes the file even if a write fails part-way through.
with open("filtered_cleaned_glossary_choi.txt", "w", encoding = "utf8") as f:
    for term in terms_ready_filtered:
        f.write(term['en'] + '\t' + term['fr'] + '\n')

In [60]:
#Now, load in our training data for splitting
train_data = load_dataset("ethansimrm/wmt_16_19_22_biomed_train_processed", split = "train") 

Found cached dataset text (C:/Users/ethan/.cache/huggingface/datasets/ethansimrm___text/ethansimrm--wmt_16_19_22_biomed_train_processed-8662b34233d7661e/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


In [63]:
#Seed = 42 for reproducibility. Now, we will store these splits separately in HF. Happily, they are already in our SRC [TAB] TGT [NEWLINE] format.
train_data_split = train_data.train_test_split(train_size = 0.15, seed = 42)
data_for_choi = train_data_split["train"] #The 15% sample, to be tokenised and word-aligned
unchanged_data = train_data_split["test"] #The remaining 85%, left as-is

Loading cached split indices for dataset at C:\Users\ethan\.cache\huggingface\datasets\ethansimrm___text\ethansimrm--wmt_16_19_22_biomed_train_processed-8662b34233d7661e\0.0.0\cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2\cache-5f4731a6d62e6ea7.arrow and C:\Users\ethan\.cache\huggingface\datasets\ethansimrm___text\ethansimrm--wmt_16_19_22_biomed_train_processed-8662b34233d7661e\0.0.0\cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2\cache-cf343edd631ba6c0.arrow


In [68]:
#Save the sampled split, one SRC [TAB] TGT pair per line.
#The with-block closes the file even if a write fails part-way through.
with open("train_data_to_modify_choi.txt", "w", encoding = "utf8") as f:
    for line in data_for_choi['text']:
        f.write(line + '\n')

In [69]:
#Save the split that is left unchanged, one SRC [TAB] TGT pair per line.
#The with-block closes the file even if a write fails part-way through.
with open("train_data_leave_unchanged_choi.txt", "w", encoding = "utf8") as f:
    for line in unchanged_data['text']:
        f.write(line + '\n')