## Install VS code environment 

In [None]:
!pip install colabcode

In [None]:
import getpass
import os

from colabcode import ColabCode

# Do not commit the code-server password to the notebook: read it from the
# COLABCODE_PASSWORD environment variable, or prompt for it when unset.
colabcode_password = os.environ.get("COLABCODE_PASSWORD") or getpass.getpass("code-server password: ")
ColabCode(port=10000, password=colabcode_password)

## Downloading and Extracting data sets

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# get the crawled data from the smaller languages (yue)
!wget http://dumps.wikimedia.org/zh_yuewiki/20211020/zh_yuewiki-20211020-page.sql.gz
!wget http://dumps.wikimedia.org/zh_yuewiki/20211020/zh_yuewiki-20211020-langlinks.sql.gz

In [None]:
# download the dataset
!wget https://dumps.wikimedia.org/zh_yuewiki/20211020/zh_yuewiki-20211020-pages-articles.xml.bz2

In [None]:
!git clone https://github.com/clab/wikipedia-parallel-titles

In [None]:
#extract parallel titles
!/content/wikipedia-parallel-titles/build-corpus.sh zh zh_yuewiki-20211020 > titles.txt

In [None]:
#extracts and cleans text from a Wikipedia database backup dump
!pip install wikiextractor

In [None]:
!pip install gensim

### YUE

In [None]:
!python make_wiki_corpus.py zh_yuewiki-20211020-pages-articles.xml.bz2 wiki_yue.txt
# 15,410 articles

In [None]:
# see the word count of the wiki corpus of yue
!wc -c wiki_yue.txt | awk '{print $1}'

### ZH

In [None]:
# get the crawled data from the Chinese language
!wget https://dumps.wikimedia.org/zhwiki/20211020/zhwiki-20211020-pages-articles.xml.bz2

In [None]:
!python make_wiki_corpus.py zhwiki-20211020-pages-articles.xml.bz2 wiki_zh.txt

In [None]:
!tar -czvf zhwiki_corpus.tar.gz wiki_zh.txt

In [None]:
!wc -c wiki_zh.txt | awk '{print $1}'

## Data loading and preprocessing

In [None]:
# unzip the data file of ZH
!tar -xvf /content/drive/MyDrive/rd_data/zhwiki_corpus.tar.gz

In [None]:
def read_txt(in_file):
    """Read a UTF-8 text file and return its lines with trailing whitespace stripped."""
    stripped_lines = []
    with open(in_file, 'r', encoding="utf-8") as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.rstrip())
    return stripped_lines

def article_to_list_of_words(in_list):
    """Split one article string on single spaces and return the tokens as a list."""
    tokens = in_list.split(" ")
    return tokens

def batch_load_lists_of_words(input_data):
    """Tokenise every article string; returns one list of words per article, in input order."""
    tokenised_articles = []
    for article in input_data:
        tokenised_articles.append(article_to_list_of_words(article))
    return tokenised_articles

def fetch_titles(inlist):
    """Return the title of every article string.

    The title is the text before the first space of each string; a string
    without a space is returned whole, and an empty string yields "".

    Args:
        inlist: iterable of article strings.

    Returns:
        A list with one title per input string, in input order.
    """
    # maxsplit=1 stops at the first space, so a long article is not fully
    # tokenised just to read its first token.
    return [article.split(" ", 1)[0] for article in inlist]

In [None]:
!ls /content/drive/MyDrive/rd_data

In [None]:
# read the data files
datadir = "/content/drive/MyDrive/rd_data/"
zh_path = datadir+"wiki_zh.txt"
yue_path = datadir+"wiki_yue.txt"

# NOTE(review): zh_path is defined above but not used; the ZH corpus is read
# from the current working directory instead (presumably where the tar
# extraction cell put wiki_zh.txt) - confirm this is intended.
zhwiki = read_txt("wiki_zh.txt")
yuewiki = read_txt(yue_path)

# load the files from lists of strings to lists of lists of words
yue_list = batch_load_lists_of_words(yuewiki)
zh_list = batch_load_lists_of_words(zhwiki)
print(yue_list[0])   #spotcheck
print(zh_list[0])  #spotcheck

# Get a list of only the titles in both languages
yue_titles = fetch_titles(yuewiki)
zh_titles = fetch_titles(zhwiki)
print(yue_titles[0])   #spotcheck
print(zh_titles[0])  #spotcheck

In [None]:
print(f"There are {len(yuewiki)} articles in YUE wiki")
print(f"There are {len(zhwiki)} articles in ZH wiki")

In [None]:
!pip install opencc

In [None]:
yue_list[9222]

In [None]:
# need to translate the titles in ZH from SC to TC
import opencc
converter = opencc.OpenCC('s2hk.json') # Simplified to Traditional Chinese (Hong Kong variant)
translated_ZHtitles = [converter.convert(title) for title in zh_titles]
# Spot-check only: there is one title per ZH article, so printing the whole list floods the output.
print(translated_ZHtitles[:10])

# find intersection between translated_ZHtitles and yue_titles
intersected_titles = set(translated_ZHtitles).intersection(set(yue_titles))
print(sorted(intersected_titles)[:10])  # small sample of the shared titles
print(len(intersected_titles))

In [None]:
print(intersected_titles)
print(len(intersected_titles))

In [None]:
def find_title_idx_in_texts(check, list_to_check):
  """Return the index of the first article whose title equals `check`.

  The title of an article is its first token (position 0 of its word list).

  Args:
    check: the title string to look for.
    list_to_check: list of articles, each a list of word tokens.

  Returns:
    The integer index of the first matching article, or None if no article
    has `check` as its title.
  """
  for article_idx, words in enumerate(list_to_check):
    # Only position 0 is the title. The previous string-based test
    # ('0' in [article_idx, word_idx]) also accepted any occurrence of
    # `check` inside article 0, reporting index 0 as a false positive.
    if words and words[0] == check:
      return article_idx
  return None

find_title_idx_in_texts(check="0", list_to_check=yue_list)

In [None]:
def create_dict(tgtstrings, tgtlist, tgtlang_code, srcstrings, srclist, srclang_code,
                titles=None, src_title_converter=None):
    """Build a {lang_code: {page_title: article_text}} mapping for shared titles.

    For each language, an article is kept when its title (the first token of
    its word list) is contained in `titles`.

    Args:
        tgtstrings: target-language articles as raw strings.
        tgtlist: the same articles as lists of tokens (parallel to tgtstrings).
        tgtlang_code: key under which the target articles are stored.
        srcstrings: source-language articles as raw strings.
        srclist: the same articles as lists of tokens (parallel to srcstrings).
        srclang_code: key under which the source articles are stored.
        titles: collection of titles to keep. Defaults to the notebook-level
            `intersected_titles`, which was the previous hidden dependency.
        src_title_converter: optional callable applied to each source title
            before the membership test and before it is used as a key.
            NOTE(review): `intersected_titles` is built from script-converted
            ZH titles, so unconverted source titles presumably only match when
            both scripts coincide - consider passing `converter.convert` here.

    Returns:
        A dict with one inner dict per language code.
    """
    if titles is None:
        # Backward-compatible default: fall back to the notebook global.
        titles = intersected_titles

    def _collect(strings, token_lists, convert=None):
        # Map each kept title to the raw article string at the same index.
        collected = {}
        for idx, tokens in enumerate(token_lists):
            if not tokens:
                continue  # an empty token list has no title to look up
            page_name = tokens[0]
            if convert is not None:
                page_name = convert(page_name)
            if page_name in titles:
                collected[page_name] = strings[idx]
        return collected

    lang_dict = {}
    lang_dict[tgtlang_code] = _collect(tgtstrings, tgtlist)
    lang_dict[srclang_code] = _collect(srcstrings, srclist, src_title_converter)
    return lang_dict
  
yuezh_dict = create_dict(yuewiki, yue_list, "yue", zhwiki, zh_list, "zh")

In [None]:
print(yuezh_dict.get("zh", {}).get('愛因斯坦'))
print(yuezh_dict.get("yue", {}).get('愛因斯坦'))

In [None]:
# pandas is needed here already; on a fresh kernel this cell runs before the
# notebook's later `import pandas as pd` cell.
import pandas as pd

pdd1 = pd.DataFrame.from_dict(yuezh_dict)
pdd1.head(5)

In [None]:
pdd1 = pdd1.dropna(subset=["zh"])
pdd1 #5140 rows

In [None]:
converter = opencc.OpenCC('s2hk.json') # Convert texts to Traditioanl HK Chinese if any in the ZH column

def script_converter(x):
  """Pass `x` through the notebook-level OpenCC `converter` and return the result."""
  return converter.convert(x)

pdd1["zh_converted"] = pdd1["zh"].apply(script_converter)

In [None]:
pdd1

In [None]:
import numpy as np
import pandas as pd

def df_to_csv(some_df, save_as="zhyue_comparable_wiki.csv"):
    """Write `some_df` to `save_as` as a UTF-8 CSV file without the index column."""
    frame = pd.DataFrame(some_df)
    outcome = frame.to_csv(save_as, index=False, encoding='utf-8')
    return outcome

def df_to_txt(some_df, lang="yue", save_as="comparable_wiki.txt"):
    """Write one column of `some_df` to a plain-text file, one row per line.

    Rows are written verbatim. The previous `to_csv`-based export applied CSV
    quoting, so any row containing a comma or a double quote was wrapped in
    quotes and had its inner quotes doubled, corrupting the corpus text.

    Args:
        some_df: a DataFrame (or mapping of column name to values).
        lang: name of the column to export.
        save_as: path of the UTF-8 text file to write.

    Returns:
        None.
    """
    column = pd.Series(some_df[lang])
    # newline="" keeps the "\n" terminator as written on every platform.
    with open(save_as, "w", encoding="utf-8", newline="") as out_file:
        for value in column:
            # Missing values become empty lines so row alignment is preserved.
            line = "" if pd.isna(value) else str(value)
            out_file.write(line + "\n")

In [None]:
df_to_csv(pdd1, "zhyue_comparable_wiki.csv")
df_to_txt(pdd1, "yue", "yue_comparable_wiki.txt")
df_to_txt(pdd1, "zh", "zh_comparable_wiki.txt")

In [None]:
pdd1.to_pickle("ComparableWIKICorpus-yue-zh.pkl")
# unpickled_df = pd.read_pickle("./dummy.pkl")

In [None]:
# pickle them
import pickle

# NOTE(review): `mylist` is not defined anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel - confirm which object was
# meant to be saved to wiki_corpus.pkl.
with open('wiki_corpus.pkl', 'wb') as f:
    pickle.dump(mylist, f)

In [None]:
import pickle

# load the pickled files
with open('../input/rd-nmt/ComparableWIKICorpus-yue-zh.pkl', 'rb') as f:
   myDF = pickle.load(f)

#myDF

In [None]:
for_split_df = myDF.copy()

In [None]:
for_split_df = for_split_df.drop("zh", axis=1)

In [None]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(for_split_df, test_size=0.15, random_state=1)

In [None]:
print(len(train))
print(len(test))

In [None]:
train_t, val = train_test_split(train, test_size=0.15, random_state=1)

In [None]:
print(len(train_t))
print(len(test))
print(len(val))

In [None]:
# Export every split as a pair of line-aligned text files:
# the YUE column and the script-converted ZH column.
split_exports = [
    (train_t, "CW_yue_train.txt", "CW_zh_train.txt"),
    (test, "CW_yue_test.txt", "CW_zh_test.txt"),
    (val, "CW_yue_val.txt", "CW_zh_val.txt"),
]
for split_df, yue_file, zh_file in split_exports:
    df_to_txt(split_df, "yue", yue_file)
    df_to_txt(split_df, "zh_converted", zh_file)

In [None]:
!rm zhwiki_corpus.tar.gz

In [None]:
# Archive only the six split files. A bare *.txt glob would also pick up every
# other .txt file in the working directory (e.g. the extracted wiki corpora).
!tar -czvf cwc_yue_zh_splited.tar.gz CW_yue_train.txt CW_zh_train.txt CW_yue_test.txt CW_zh_test.txt CW_yue_val.txt CW_zh_val.txt

In [None]:
# `yue_val` was never defined (NameError); presumably the intent was to
# inspect the YUE side of the validation split.
val["yue"].head()

## Load data

In [None]:
import os

def load_file(datapath, filename):
    """Read a UTF-8 text file and return its lines without line terminators.

    Args:
        datapath: directory containing the file.
        filename: name of the file inside `datapath`.

    Returns:
        A list of strings, one per line of the file.
    """
    # os.path.join avoids a doubled separator when datapath already ends in "/".
    filepath = os.path.join(datapath, filename)
    with open(filepath, "r", encoding="utf-8") as inf:
        return inf.read().splitlines()

def load_batch(datapath, batchnames):
    """Load each file named in `batchnames` from `datapath`; returns one list of lines per file, in order."""
    loaded = []
    for name in batchnames:
        loaded.append(load_file(datapath, name))
    return loaded

# List the tokenized split files and load each one as a list of lines.
files = os.listdir("../input/rd-nmt/zh-yue_data_tokenized/content/zh-yue-toks")
list_of_files = [f for f in files]

batch = load_batch("../input/rd-nmt/zh-yue_data_tokenized/content/zh-yue-toks", list_of_files)

# Derive identifier-like names from the file names ("." replaced by "_").
list_of_names = [dataname.replace(".", "_") for dataname in list_of_files]

assert len(batch) == len(list_of_files) == len(list_of_names) 
print(list_of_names)
print(len(batch))

# Set variable name for the each data file
# NOTE(review): os.listdir returns entries in arbitrary order, so this
# positional mapping is only valid for the listing order observed when the
# cell was written - verify against the printed list_of_names before relying
# on it (a name-keyed lookup would be safer).
val_yue = batch[0]
test_yue = batch[1]
test_zh = batch[2]
train_zh = batch[3]
train_yue = batch[4]
val_zh = batch[5]

In [None]:
import pandas as pd
import numpy as np

""" Pre-requsite for MNMT models:
    read dataset to dataframe
"""

def df_generate(first_list, second_list, column_first="input_text", column_second="target_text"):
    """Pair two sequences element-wise into a two-column DataFrame."""
    column_names = [column_first, column_second]
    paired_rows = zip(first_list, second_list)
    return pd.DataFrame(paired_rows, columns=column_names)
    
train = df_generate(train_zh, train_yue, 'input_text', 'target_text')
test = df_generate(test_zh, test_yue, 'input_text', 'target_text')
val = df_generate(val_zh, val_yue, 'input_text', 'target_text')

In [None]:
#train.head(5)
test.head(5)
#val.head(5)

## NEW APPROACH: MNMT FINETUNING

In [None]:
!git clone https://github.com/pytorch/fairseq
%cd fairseq
!pip install --editable ./
%cd ..

!pip install transformers
!pip install simpletransformers

#For tokenization
!pip install sentencepiece 

## Training M2M model
Fairseq library is more CLI oriented rather than pythonic. To fine-tune M2M model, we need to:
1. Download the 418M parameters model first, alongside the tokenizer and vocabulary files.
1. Export the training and validation sentence pairs to text files.
1. Tokenize sentences using the script under fairseq/scripts/spm_encode.py
1. Binarize sentences for faster data loading and training.
1. Fine-tune the model!

In [None]:
#Download  pretrained model, vocabulary and tokenizer
!wget -qq "https://dl.fbaipublicfiles.com/m2m_100/spm.128k.model"
!wget -qq "https://dl.fbaipublicfiles.com/m2m_100/data_dict.128k.txt"
!wget -qq "https://dl.fbaipublicfiles.com/m2m_100/model_dict.128k.txt"
!wget -qq "https://dl.fbaipublicfiles.com/m2m_100/language_pairs_small_models.txt"
!wget "https://dl.fbaipublicfiles.com/m2m_100/418M_last_checkpoint.pt"

In [None]:
# Join each side of the train/validation frames into newline-separated text
# (one sentence per line, no trailing newline).
train_txt = "\n".join(train.input_text.values.tolist())
train_target_txt = "\n".join(train.target_text.values.tolist())
validation_txt = "\n".join(val.input_text.values.tolist())
validation_target_txt = "\n".join(val.target_text.values.tolist())

# Write with an explicit UTF-8 encoding (the text is Chinese, so a non-UTF-8
# platform default would fail or corrupt it) and a context manager so each
# handle is closed even if a write raises.
for out_path, payload in (
    ("zh_txt_train.txt", train_txt),
    ("yue_txt_train.txt", train_target_txt),
    ("zh_txt_validation.txt", validation_txt),
    ("yue_txt_validation.txt", validation_target_txt),
):
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.write(payload)

In [None]:
#Tokenize text
!python fairseq/scripts/spm_encode.py \
        --model spm.128k.model \
        --output_format=piece \
        --inputs=zh_txt_train.txt \
        --outputs=train.zh
        
!python fairseq/scripts/spm_encode.py \
        --model spm.128k.model \
        --output_format=piece \
        --inputs=yue_txt_train.txt \
        --outputs=train.yue
        
!python fairseq/scripts/spm_encode.py \
        --model spm.128k.model \
        --output_format=piece \
        --inputs=zh_txt_validation.txt \
        --outputs=val.zh
        
!python fairseq/scripts/spm_encode.py \
        --model spm.128k.model \
        --output_format=piece \
        --inputs=yue_txt_validation.txt \
        --outputs=val.yue
        
#Binarize tokenized text
!fairseq-preprocess \
    --source-lang zh --target-lang yue \
    --trainpref train \
    --validpref val \
    --thresholdsrc 0 --thresholdtgt 0 \
    --destdir data_bin \
    --srcdict model_dict.128k.txt --tgtdict model_dict.128k.txt
    

#Store checkpoints
#!mkdir 

!fairseq-train data_bin \
  --finetune-from-model  "./418M_last_checkpoint.pt"\
  --save-dir ./ \
  --task translation_multi_simple_epoch \
  --encoder-normalize-before \
  --lang-pairs 'zh-yue' \
  --batch-size 10 \
  --decoder-normalize-before \
  --encoder-langtok src \
  --decoder-langtok \
  --criterion cross_entropy \
  --optimizer adafactor \
  --lr-scheduler cosine \
  --lr 3e-05 \
  --max-update 40000 \
  --update-freq 2 \
  --save-interval 1 \
  --save-interval-updates 5000 \
  --keep-interval-updates 10 \
  --no-epoch-checkpoints \
  --log-format simple \
  --log-interval 2 \
  --patience 10 \
  --arch transformer_wmt_en_de_big \
  --encoder-layers 12 --decoder-layers 12 \
  --share-decoder-input-output-embed --share-all-embeddings \
  --ddp-backend no_c10d \
  --max-epoch 10 \
  --wandb-project "ZH-YUE-M2M"

## Generate synthetic data 

In [None]:
!pip install numpy requests nlpaug
# Quote the version specifiers: unquoted, the shell parses ">" as an output
# redirection, so pip never sees the version constraint.
!pip install "torch>=1.6.0" "transformers>=4.11.3" sentencepiece
!pip install jieba

In [None]:
from nlpaug.util.file.download import DownloadUtil
DownloadUtil.download_fasttext(model_name='wiki-news-300d-1M', dest_dir='.') # Download fasttext model

# Quoted so the shell does not treat ">" as an output redirection.
!pip install "gensim>=4.1.2"

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_yue.vec

In [None]:
# Word Embeddings Augmenter (fasttext, Cantonese)

import jieba

#jieba.enable_paddle()

def tokenizer(x):
    """Segment the string `x` with jieba (`cut_all=False`) and return the tokens as a list."""
    # Segment the argument itself - the previous body segmented the
    # notebook-level `text` regardless of the input - and return a list
    # rather than a one-shot generator.
    return jieba.lcut(x, cut_all=False)

# Imported here because on a fresh kernel this cell runs before the later
# cell that imports `naw`.
import nlpaug.augmenter.word as naw

text = '如何遇上對的人又是故事的一部份呢'
aug = naw.WordEmbsAug(model_type='fasttext', tokenizer=tokenizer,
                      model_path="./wiki.zh_yue.vec") #wiki.zh.vec

augmented_text = aug.augment(text)

print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

In [None]:
import nlpaug.augmenter.word as naw

text = 'The quick brown fox jumped over the lazy dog'
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de', 
    to_model_name='facebook/wmt19-de-en'
)
back_translation_aug.augment(text)

## Bitext mining (Mine parallel sentences)

### Environment setup

#### One-time environment setup for conda

In [None]:
#Download and install miniconda to /content/miniconda3 directory:
%env PYTHONPATH=
! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.9.2-Linux-x86_64.sh
! chmod +x Miniconda3-py37_4.9.2-Linux-x86_64.sh
! bash ./Miniconda3-py37_4.9.2-Linux-x86_64.sh -b -f -p /content/miniconda3

In [None]:
#Add miniconda to the system PATH:
import os
path = '/content/miniconda3/bin:' + os.environ['PATH']
%env PATH=$path

In [None]:
# install faiss (similarity search) via conda

# -y answers conda's confirmation prompt, which a notebook cell cannot do interactively.
!conda install -y -c pytorch faiss-gpu # GPU(+CPU) version

In [None]:
#print the version of the packagexyz and its location within the conda directory
import sys
_ = sys.path.append("/content/miniconda3/lib/python3.7/site-packages")
import faiss
print(faiss.__version__, faiss.__file__)

In [None]:
#Copy everything over to Google Drive
!tar -zcf conda_colab.tar.gz miniconda3
!cp conda_colab.tar.gz /content/drive/MyDrive/

#### Copy conda back to Colab (run whenever restarting a notebook)

In [None]:
# copy back the conda installation, and re-setup the environment:
#from google.colab import drive 
#drive.mount('/content/drive')

# The archive stores paths relative to `miniconda3/`, and PATH below expects
# /content/miniconda3, so extract into /content rather than a cwd-relative `../`.
!tar -xf /content/drive/MyDrive/conda_colab.tar.gz -C /content

import os
path = '/content/miniconda3/bin:' + os.environ['PATH']
%env PATH=$path
%env PYTHONPATH=
import sys
_ = sys.path.append("/content/miniconda3/lib/python3.7/site-packages")

#### Set up LASER

In [None]:
!git clone https://github.com/facebookresearch/LASER

In [None]:
#set the environment variable 'LASER' to the root of the installation
import os
os.environ['LASER'] = "LASER"

In [None]:
# `setenv` is a csh builtin and is not available in the shell `!` uses; a
# variable set in a `!` subshell would not persist either. Set it on the
# kernel's environment, as an absolute path to the cloned LASER directory so
# it stays valid if the working directory changes later.
import os
os.environ['LASER'] = os.path.abspath("LASER")

In [None]:
!export | grep LASER

In [None]:
# download encoders from Amazon s3
!bash ../input/laser_set/install_models.sh

In [None]:
#download third party software
!bash ./LASER/install_external_tools.sh

In [None]:
## # Tokenize and Embed # ##
#  calculate sentence embeddings for a text file
# The input will be tokenized, using the mode of the specified language, 
# BPE will be applied and the sentence embeddings will be calculated.
# embed.py is a Python script, so run it with python (not bash), and address
# it relative to the clone location (`/./LASER` resolved to `/LASER`).
# NOTE(review): no input file is passed here - confirm how embed.py should
# receive the YUE sentences (e.g. stdin redirection) and where the encoder /
# BPE-code files actually live.
!python ./LASER/source/embed.py --encoder bilstm.93langs.2018-12-26.pt --bpe-codes 93langs.fcodes --token-lang yue  --verbose --output yue_embeddings.raw

In [None]:
# The embeddings are stored in float32 matrices in raw binary format. 
# They can be read in Python by:
import numpy as np
dim = 1024
X = np.fromfile("yue_embeddings.raw", dtype=np.float32, count=-1)                                                                          
X.resize(X.shape[0] // dim, dim) # X is a N x 1024 matrix where N is the number of lines in the text file.                                              

In [None]:
## a joint sentence embedding for all the considered languages

In [None]:
# Mine for bitexts
# NOTE(review): ${bn}, ${l1} and ${l2} are not defined anywhere in the visible
# notebook, so the --src-embeddings / --trg-embeddings arguments presumably
# expand to ".enc." - replace them with the real embedding file paths.

!python /content/LASER/source/mine_bitexts.py \
  zh_para yue_para \
  --src-lang="zh" --trg-lang="yue" \
  --output mined.out --src-embeddings ${bn}.enc.${l1} --trg-embeddings ${bn}.enc.${l2} \
  --mode mine \
  --verbose #--gpu

In [None]:
!git clone https://github.com/pytorch/fairseq

In [None]:
# frequency cleaning
!wget https://dl.fbaipublicfiles.com/m2m_100/histograms.tar.gz 
!tar -xvzf histograms.tar.gz
!python /content/fairseq/examples/m2m_100/process_data/clean_histogram.py --src "zh" --tgt "yue" --src-file zh_comparable_wiki.txt --tgt-file yue_comparable_wiki.txt  --src-output-file source_output.zh --tgt-output-file target_output.yue --histograms /content/histograms

In [None]:
# apply SPM
!wget https://dl.fbaipublicfiles.com/m2m_100/spm.128k.model
!python /content/fairseq/scripts/spm_encode.py \
    --model spm.128k.model \
    --output_format=piece \
    --inputs=/path/to/input/file/here \
    --outputs=/path/to/output/file/here

# Language codes used by the commands below. IPython substitutes {src} / {tgt}
# from these Python variables; each `!` line runs in its own shell, so shell
# variables would not carry over between lines.
src, tgt = "zh", "yue"

# length ratio cleaning
# NOTE(review): the /path/to/... arguments are still placeholders.
!perl mosesdecoder/scripts/training/clean-corpus-n.perl --ratio 3 /path/to/training/data/train.spm.{src}-{tgt} {src} {tgt} /path/to/output/directory/train.spm.{src}-{tgt} 1 250

# binarize data
!wget https://dl.fbaipublicfiles.com/m2m_100/data_dict.128k.txt
# The leading "!" is required: without it this command is parsed as Python
# and the whole cell fails with a SyntaxError.
!fairseq-preprocess \
    --source-lang {src} --target-lang {tgt} \
    --testpref spm.{src}.{tgt} \
    --thresholdsrc 0 --thresholdtgt 0 \
    --destdir data_bin \
    --srcdict data_dict.128k.txt --tgtdict data_dict.128k.txt

In [None]:
# pretrained MNMT model from zero
# #Ours + 12 layer + RoBT (transformer + merged attention + LaLn + LaLT)
! wget http://data.statmt.org/bzhang/acl2020_multilingual/Ours-L12-RoBT.tar.gz 
! tar xfvz Ours-L12-RoBT.tar.gz 

In [None]:
# download our preprocessed subword models
!wget http://data.statmt.org/bzhang/acl2020_multilingual/submodels.tar.gz
!tar xfvz submodels.tar.gz

In [None]:
# download the evaluation script
!wget http://data.statmt.org/bzhang/acl2020_multilingual/example_evaluation.sh

# install sacrebleu, sentencepiece if necessary
!pip3 install sacrebleu sentencepiece --user
# notice that we use tensorflow, so install tensorflow if necessary
# pip install tensorflow_gpu==1.13.1 --user


# perform decoding
# The leading "!" runs this as a shell command; without it the line is parsed
# as Python and the cell fails with a SyntaxError.
!bash example_evaluation.sh

In [None]:
!git clone https://github.com/hangyav/UnsupPSE.git

In [None]:
! ./UnsupPSE/get_third_party.sh

In [None]:
!./UnsupPSE/install_requirements.sh