In [1]:
!pip install transformers==4.28.0 sentencepiece sacremoses datasets 

Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=7c951d65011b107e663469ac633723486fbb0b03a5c36a91090354f0811a00ab
  Stored in directory: /root/.cache/pip/wheels/00/24/97/a2ea5324f36bc626e1ea0267f33db6aa80d157ee977e9e42fb
Successfully built sacremoses
Installing collected packages: sacremoses, transformers
  Attempting uninstall: transformers
    Fo

In [2]:
import wandb
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Both API keys are read from Kaggle's secrets client, so no credential is
# written into the notebook itself.
user_secrets = UserSecretsClient()

# Hugging Face Hub login.
secret_value = user_secrets.get_secret("HF_KEY")
login(secret_value)

# Weights & Biases login. As the last expression of the cell, the return
# value of wandb.login is what the cell displays.
secret_value1 = user_secrets.get_secret("WANDB_KEY")
wandb.login(key=secret_value1)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
from datasets import load_dataset

# use_auth_token=True makes the download use the stored Hugging Face token.
training_data = load_dataset(
    "ethansimrm/wmt_16_19_22_biomed_train",
    split="train",
    use_auth_token=True,
)

Downloading and preparing dataset text/ethansimrm--wmt_16_19_22_biomed_train to /root/.cache/huggingface/datasets/text/ethansimrm--wmt_16_19_22_biomed_train-3dd4d6328c3dbab1/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/48.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/ethansimrm--wmt_16_19_22_biomed_train-3dd4d6328c3dbab1/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8. Subsequent calls will reuse this data.


In [4]:
#len(training_data) #741300 sentence pairs prior to pre-processing
#training_data['text'][1:10] #Adopting the SOURCE \t TARGET \n format is convenient.

In [5]:
from datasets import Dataset

#Each raw line is expected to look like "SOURCE<TAB>TARGET". A line that does
#not yield two fields after stripping (no tab, or an empty source/target) is
#skipped and counted instead of raising IndexError on sentences[1].
source = []
target = []
num_malformed = 0
for example in training_data['text']:
    example = example.strip()
    sentences = example.split("\t")
    if len(sentences) < 2:
        num_malformed += 1
        continue
    source.append(sentences[0])
    target.append(sentences[1])
#Stay silent on clean data; only report when something was actually dropped.
if num_malformed:
    print(f"Skipped {num_malformed} line(s) without a tab-separated sentence pair")
ready_data = Dataset.from_dict({"en":source, "fr":target})
#This dataset can now be used for training or validation per early experiments

In [6]:
#The {"en", "fr"} record structure looks right.
#NOTE(review): the two sentences shown for this first record do not read as
#translations of each other, so the corpus does contain misaligned pairs.
ready_data[0]

{'en': 'Global Health: Where Do Physiotherapy and Rehabilitation Research Fit?',
 'fr': 'La place des cheveux et des poils dans les rituels et le sacré.'}

In [7]:
import pandas as pd
#Let's start preprocessing according to Wu et al. (2022).
#First, remove duplicate sentence pairs.
#We can't do this in datasets, so we must convert it to a pandas DataFrame.
#Dataset.to_pandas() is the library's own conversion; passing the Dataset to
#the DataFrame constructor instead makes pandas iterate it record by record.
tempDataset = ready_data.to_pandas()

In [8]:
#tempDataset[tempDataset.duplicated()] 
#Wow, there are a lot of duplicate rows, 39540 in total.

In [9]:
#Keep the first occurrence of every (en, fr) row and drop later exact repeats.
tempDataset_dedup = tempDataset.drop_duplicates(keep="first")

In [10]:
#Shared setup for the filtering helpers, declared separately
import string

import langid
from tqdm import tqdm

#Constrain language identification to a choice between English and French
langid.set_languages(["en", "fr"])
#Enable progress_apply on pandas objects so long passes show their progress
tqdm.pandas()
def count(l1, l2):
    """Return how many items of l1 are contained in l2 (repeats count each time)."""
    return sum(1 for x in l1 if x in l2)

In [11]:
#Having verified that we can convert this back into a Dataset object, we'll just 
#do everything else in a pandas DataFrame. There are seven per-sentence checks
#we can do.

def retain_sentence(row):
    """Decide whether an (en, fr) sentence pair passes every quality filter.

    `row` is indexed with the keys "en" and "fr". Returns True when the pair
    should be kept and False as soon as any single check rejects it.
    """
    english, french = row["en"], row["fr"]
    #Exclude repeats and empty source or target sentences
    if english == french or english == "" or french == "":
        return False

    punctuation = set(string.punctuation)
    words_per_side = []
    for text in (english, french):
        #Exclude sentences with mismatched () or '' or ""
        balanced = (
            text.count("(") == text.count(")")
            and text.count("'") % 2 == 0
            and text.count('"') % 2 == 0
        )
        if not balanced:
            return False

        #Exclude sentences with punctuation percentage > 0.4
        n_chars = len(text)
        n_punct = sum(1 for ch in text if ch in punctuation)
        if n_punct > 0.4 * n_chars:
            return False

        #Exclude sentences with > 150 words (words are split on single spaces)
        n_words = len(text.split(" "))
        if n_words > 150:
            return False
        words_per_side.append(n_words)

        #Exclude sentences with char-to-word ratio > 12 or < 1.5
        chars_per_word = n_chars / n_words
        if not (1.5 <= chars_per_word <= 12):
            return False

    #Heuristic "alignment" filtering on the en/fr word-count ratio
    en_to_fr = words_per_side[0] / words_per_side[1]
    if en_to_fr >= 9 or 1 / en_to_fr >= 9:
        return False

    #Expensive language-determining step, so it runs last; the French side is
    #only classified when the English side was identified as English
    if langid.classify(english)[0] != "en":
        return False
    return langid.classify(french)[0] == "fr"
"""
We will skip full alignment-checking, due to infeasibility and time constraints.
SOTA-used options such as fast-align and eflomal require a large parallel training 
corpus, which we lack. AWESOME-align cannot run, even from the command line, 
due to an issue with the multiprocessing module and pickling. We can take another
crack at this in the future.

However, we can filter out obviously bad alignments by looking at word ratio;
we filter out word ratios greater than 9 or smaller than 1/9.
This (heuristic) method comes from Del et al. (2021).
"""
#print() returns None, so ending the cell with it keeps the notes string above
#from being echoed as the cell's output value; only a blank line is emitted.
print()




In [12]:
#Row-wise filter pass over the de-duplicated pairs; the result holds
#retain_sentence's True/False verdict for every row.
tempDataset_nearlyAligned = tempDataset_dedup.progress_apply(
    retain_sentence,
    axis=1,
)

100%|██████████| 701760/701760 [02:44<00:00, 4253.13it/s]


In [13]:
#Keep only the rows whose entry in the boolean mask is True
filtered_set = tempDataset_dedup.loc[tempDataset_nearlyAligned]

In [14]:
#Renumber the surviving rows 0..n-1; drop=True discards the old row labels
#instead of keeping them as an extra column.
filtered_set = filtered_set.reset_index(drop=True)

In [15]:
#Display the filtered pairs as the cell output for a visual sanity check
filtered_set

Unnamed: 0,en,fr
0,Global Health: Where Do Physiotherapy and Reha...,La place des cheveux et des poils dans les rit...
1,Carabin.,Les Carabins.
2,"Anti-aging medicine, a science-based, essentia...","La médecine anti-âge, une médecine scientifiqu..."
3,"Doping, sport and addiction--any links?","Dopage, pratique sportive et addiction--quels ..."
4,Fibromatosis colli or congenital torticollis: ...,Le fibromatosis colli ou torticolis congénital...
...,...,...
411662,Microbiota in affected pasterns appeared to ha...,Le microbiote des paturons affectés semble avo...
411663,Beta diversity analyses demonstrated that bact...,Les analyses de diversité bêta ont démontré qu...
411664,Meteorological factors also had considerable i...,Les facteurs météorologiques ont également eu ...
411665,Our study provides preliminary observations of...,Notre étude fournit des observations prélimina...


In [16]:
#Write one "SOURCE<TAB>TARGET" line per retained pair.
source = filtered_set["en"]
target = filtered_set["fr"]
#The with-block closes the file even if a write fails part-way through.
#zip pairs the two columns by position, so the loop does not depend on the
#row labels being exactly 0..n-1.
with open("wmt_train_filtered.txt", "w", encoding = "utf-8") as f:
    for en_sentence, fr_sentence in zip(source, target):
        f.write(en_sentence + "\t" + fr_sentence + "\n")