#### Mount Google Drive (datasets are stored there):

In [None]:
from google.colab import drive
# mount Google Drive at /content/drive; the dataset files opened further down are read from there
drive.mount('/content/drive')

#### Load the relevant (yes/no) questions from the dataset and download the NLTK `punkt` tokenizer resource

In [None]:
import json
import nltk
# fetch the 'punkt' resource up front; presumably it is what the nltk tokenizers used in this notebook rely on
nltk.download('punkt')

# we use nltk to tokenize multi-lingual sequences
def tokenize_at_word_level(input):
  """Split the given text into word-level tokens using nltk's word tokenizer."""
  word_tokens = nltk.tokenize.word_tokenize(input)
  return word_tokens

# languages whose questions are kept; each one becomes a key of the dict built by import_questions
supported_languages = ['english', 'arabic', 'finnish', 'korean']

# upper-cased answer values that qualify a question as a binary (yes/no) question
binary_labels = ['YES', 'NO']

# helper function to return all relevant properties
def relevant_properties(question):
  """Reduce a raw dataset entry to the three fields used in this notebook.

  Returns a dict with the question text, the plain-text document and the
  upper-cased yes/no answer taken from the entry's first annotation.
  """
  text = question['question_text']
  document = question['document_plaintext']

  first_annotation = question['annotations'][0]
  answer = first_annotation['yes_no_answer'].upper()

  return dict(question=text, document=document, answer=answer)

# helper function to import questions from given file
def import_questions(file):
  """Collect the yes/no questions of a JSON-lines source, grouped by language.

  Args:
    file: iterable of strings (e.g. an open text file) holding one JSON
      object per line; each object must provide a 'language' key plus the
      keys read by relevant_properties.

  Returns:
    dict with one key per entry of supported_languages, each mapping to a
    list of the dicts produced by relevant_properties for the questions of
    that language whose answer is one of binary_labels.
  """
  questions = {lang: [] for lang in supported_languages}

  for line in file:
    # tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads
    if not line.strip():
      continue

    question = json.loads(line)
    lang = question['language']

    # ignore languages we do not support before looking at the annotations
    if lang not in questions:
      continue

    # extract the properties once and reuse them for both the check and the result
    properties = relevant_properties(question)
    if properties['answer'] in binary_labels:
      questions[lang].append(properties)

  return questions

# questions used for training our classifier(s)
# (the files contain non-ASCII text, so the encoding is stated explicitly
# instead of depending on the locale default)
with open("/content/drive/My Drive/NLP 2020W/tydiqa-v1.0-train.jsonl", encoding="utf-8") as file:
  train_questions = import_questions(file)

# questions used to evaluate our classifier(s)
with open("/content/drive/My Drive/NLP 2020W/tydiqa-v1.0-dev.jsonl", encoding="utf-8") as file:
  dev_questions = import_questions(file)

In [None]:
! pip install transformers

In [None]:
# https://huggingface.co/transformers/model_doc/marian.html
import torch
from transformers import MarianMTModel, MarianTokenizer

# target languages for translating the english questions: finnish and arabic
# (no pre-trained korean model available)
translate_to = ['fi', 'ar']

# run on the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# per-language containers: collected translations, models and tokenizers
translated_questions, models, tokenizers = {}, {}, {}

for lang in translate_to:
  translated_questions[lang] = []

  # every target language has its own pre-trained english -> lang checkpoint
  model_name = 'Helsinki-NLP/opus-mt-en-' + lang
  marian_model = MarianMTModel.from_pretrained(model_name)

  models[lang] = marian_model.to(device)
  tokenizers[lang] = MarianTokenizer.from_pretrained(model_name)

In [None]:
%%time

# we use nltk to tokenize multi-lingual sentences
def tokenize_at_sentence_level(input):
  """Split the given text into sentences using nltk's sentence tokenizer."""
  sentences = nltk.tokenize.sent_tokenize(input)
  return sentences

# helper: translate a batch of english texts with the model/tokenizer of one language
def _translate(texts, lang):
  """Translate the english ``texts`` into ``lang`` and return the decoded strings.

  Args:
    texts: list of english strings that are translated together as one batch.
    lang: key into ``models`` / ``tokenizers`` (an entry of ``translate_to``).

  Returns:
    list with one decoded string per sequence generated by the model.
  """
  # calling the tokenizer directly replaces the deprecated prepare_seq2seq_batch;
  # tensors are requested explicitly so the batch can be moved to `device`,
  # and truncation keeps over-long inputs within the tokenizer's maximum length
  batch = tokenizers[lang](
    texts, return_tensors="pt", padding=True, truncation=True
  ).to(device)

  return [
    tokenizers[lang].decode(t, skip_special_tokens=True)
    for t in models[lang].generate(**batch)
  ]

# translate all english questions to every language in translate_to
for q in train_questions['english']:
  for lang in translate_to:
    question_translated = _translate([q['question']], lang)

    # translate the document sentence by sentence (sentences are split using nltk)
    doc_translated = []
    for sent in tokenize_at_sentence_level(q['document']):
      doc_translated.extend(_translate([sent], lang))

    translated_questions[lang].append({
      # NOTE(review): 'question' is stored as a one-element list (unchanged),
      # while 'document' is a plain string - confirm this is what consumers expect
      'question': question_translated,
      'document': " ".join(doc_translated)
    })

    # progress: translations finished so far / translations expected in total
    print("{}/{}".format(
      sum(len(done) for done in translated_questions.values()),
      len(translate_to) * len(train_questions['english'])
    ))



In [None]:
# last expression of the cell: shows the collected translations for inspection
translated_questions