#### Mount Google Drive (datasets are stored there):

In [1]:
# make the Google Drive contents available under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Load the relevant questions from the dataset and download the NLTK punkt tokenizer data

In [2]:
import json
import nltk
# fetch the 'punkt' tokenizer data; presumably required by the nltk tokenize
# helpers used in this notebook (TODO confirm for the installed nltk version)
nltk.download('punkt')

def tokenize_at_word_level(input):
  """Split a text into word-level tokens with nltk's `word_tokenize`.

  Used in this notebook to tokenize multi-lingual sequences.
  """
  word_tokens = nltk.tokenize.word_tokenize(input)
  return word_tokens

# languages for which questions are collected from the dataset
supported_languages = [
  'english',
  'arabic',
  'finnish',
  'korean',
]

# answer labels that count as a binary (yes/no) answer
binary_labels = [
  'YES',
  'NO',
]

def relevant_properties(question):
  """Reduce a raw dataset entry to the fields used in this notebook.

  Returns a dict with the question text, the plain-text document and the
  upper-cased yes/no answer taken from the entry's first annotation.
  """
  text = question['question_text']
  document = question['document_plaintext']
  first_annotation = question['annotations'][0]

  return dict(
    question=text,
    document=document,
    answer=first_annotation['yes_no_answer'].upper(),
  )

def import_questions(file):
  """Read a JSON-lines dataset and collect its yes/no questions per language.

  Args:
    file: iterable of strings, one JSON-encoded dataset entry per line
      (e.g. an open text file). Blank lines are skipped.

  Returns:
    dict mapping every language in `supported_languages` to a list of
    `relevant_properties` dicts, in the order the entries were read.
    Entries in other languages, or whose answer is not in `binary_labels`,
    are left out.
  """
  questions = {lang: [] for lang in supported_languages}

  for line in file:
    # tolerate blank lines (e.g. a trailing newline at the end of the file)
    if not line.strip():
      continue

    question = json.loads(line)
    lang = question['language']

    # check the language first, so unsupported entries are not processed at all
    if lang not in questions:
      continue

    # extract the properties once and reuse them for the filter and the result
    properties = relevant_properties(question)
    if properties['answer'] in binary_labels:
      questions[lang].append(properties)

  return questions

# the dataset files contain multi-lingual text, so they are read explicitly as
# UTF-8 instead of relying on the platform's default encoding

# questions used for training our classifier(s)
with open("/content/drive/My Drive/NLP 2020W/tydiqa-v1.0-train.jsonl", encoding="utf-8") as file:
  train_questions = import_questions(file)

# questions used to evaluate our classifier(s)
with open("/content/drive/My Drive/NLP 2020W/tydiqa-v1.0-dev.jsonl", encoding="utf-8") as file:
  dev_questions = import_questions(file)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
! pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 12.6MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 60.6MB/s 
Collecting tokenizers==0.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/7c/a5/78be1a55b2ac8d6a956f0a211d372726e2b1dd2666bb537fea9b03abd62c/tokenizers-0.9.2-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 56.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)


In [4]:
# https://huggingface.co/transformers/model_doc/marian.html
import torch
from transformers import MarianMTModel, MarianTokenizer

# target languages for translating the english questions: finnish and arabic
# (no pre-trained korean model available)
translate_to = ['fi', 'ar']

# run the models on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# per target language: the pre-trained model, its tokenizer and the (still
# empty) list that collects the translated questions
models = {}
tokenizers = {}
translated_questions = {}

for lang in translate_to:
  model_name = 'Helsinki-NLP/opus-mt-en-' + lang

  translated_questions[lang] = []
  models[lang] = MarianMTModel.from_pretrained(model_name).to(device)
  tokenizers[lang] = MarianTokenizer.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1113.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=312087523.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=803110.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=842303.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1592256.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=42.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1146.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=307574661.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=801074.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=916890.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2117975.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=44.0, style=ProgressStyle(description_w…




In [None]:
def tokenize_at_sentence_level(input):
  """Split a text into sentences with nltk's `sent_tokenize`.

  Used in this notebook to split multi-lingual documents into sentences.
  """
  sentences = nltk.tokenize.sent_tokenize(input)
  return sentences

def _translate(texts, lang):
  """Translate a list of english texts with the model for target language `lang`.

  The texts are batched with the language's tokenizer, moved to `device`,
  passed to the model's `generate` and decoded again. Returns one decoded
  string (special tokens skipped) per generated sequence.
  """
  batch = tokenizers[lang].prepare_seq2seq_batch(texts).to(device)
  return [
    tokenizers[lang].decode(t, skip_special_tokens=True)
    for t in models[lang].generate(**batch)
  ]

# start from empty lists so that re-running this cell does not append duplicates
translated_questions = {lang: [] for lang in translate_to}

# translate all english questions to finnish and arabic
for q in train_questions['english']:
  # split the document into sentences with nltk once; the sentences are the
  # same for every target language
  sentences = tokenize_at_sentence_level(q['document'])

  for lang in translate_to:
    # note: the translated question is kept as a list of decoded strings
    question_translated = _translate([q['question']], lang)

    # translate the document sentence by sentence
    doc_translated = []
    for sent in sentences:
      doc_translated.extend(_translate([sent], lang))

    translated_questions[lang].append({
      'question': question_translated,
      'document': " ".join(doc_translated)
    })

In [14]:
# show the first two translated questions per target language, followed by the
# english originals, so they can be compared (e.g. using google translate)
for lang in translate_to:
  translated_sample = translated_questions[lang][:2]
  print(translated_sample)

english_sample = train_questions['english'][:2]
print(english_sample)

[{'question': ['Onko Creole ranskankielinen pidgin?'], 'document': "Osa ranskankielistä Langues d'oïl Dialects Creoles Francophonie History Phonologinen historia Villers-Cotterêts Anglo-Norman Grammer Adverbs Articles and deterers Pronouns (persoonallinen) Verbs (konjugaatiomorfologia) Ortografia Alphabet Reforms Circumuflex Braille Phonology Elision Contact Aspirated h Help:IPA/Franchvt Ranskan kreoli eli ranskankielinen kreolikieli on kreolikieli (kontaktikieli syntyperäisten puhujien kanssa), jolle ranska on lexifier. Useimmiten tämä ei ole nykyranskalainen vaan 1700-luvun ranskalaiskoiné Pariisista, Ranskan Atlantin satamista ja Ranskan orastavista siirtomaista. Miljoonat ihmiset eri puolilla maailmaa puhuvat ranskankielisiä kreolikieliä pääasiassa Amerikassa ja saaristoissa koko Intian valtamerellä. Tässä artikkelissa on myös tietoa ranskan kielen pidgin-kielistä, yhteyskielistä, joilla ei ole äidinkieliä. Näitä yhteyskieliä ei pidä sekoittaa nykyisiin (ei-kreolisiin) ranskankieli