# DIKU NLP Course 2020/2021: Group Project

## Preparations

#### Mount Google Drive (datasets are stored there):

In [1]:
# Mount Google Drive at /content/drive; the dataset files opened later in this
# notebook are read from paths below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Enable reproducibility

Taken from https://nbviewer.jupyter.org/github/copenlu/stat-nlp-book/blob/master/labs/lab_2.ipynb

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from collections import defaultdict
import numpy as np
import random
import torch

def enforce_reproducibility(seed=42):
  """Seed every random number generator this notebook relies on.

  Seeds PyTorch (CPU and all CUDA devices), Python's ``random`` module and
  NumPy's global generator with the same value, and switches cuDNN to its
  deterministic mode.

  Args:
    seed: integer seed shared by all generators (default 42).
  """
  # PyTorch: CPU generator first, then the generators of all CUDA devices.
  for seed_torch in (torch.manual_seed, torch.cuda.manual_seed_all):
    seed_torch(seed)

  # cuDNN: ask for deterministic kernels and turn off the benchmark mode.
  # Atomic operations may still be non-deterministic, because the order of
  # parallel operations is not known.
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

  # Python's and NumPy's global generators.
  for seed_system in (random.seed, np.random.seed):
    seed_system(seed)

# seed all generators with the default seed (42) before any data is loaded or any model is trained
enforce_reproducibility()

#### Load relevant questions from the dataset and download the NLTK punkt tokenizer data

In [3]:
import json
import nltk
# download NLTK's 'punkt' tokenizer data, which nltk.tokenize.word_tokenize (used below) relies on
nltk.download('punkt')

# we use nltk to tokenize multi-lingual sequences
def tokenize_at_word_level(input):
  """Split a text into word-level tokens.

  Delegates to ``nltk.tokenize.word_tokenize`` with its default arguments,
  i.e. no language is passed regardless of the language of ``input``.

  Args:
    input: the text to tokenize.

  Returns:
    The tokens produced by ``word_tokenize``, in order of appearance.
  """
  tokens = nltk.tokenize.word_tokenize(input)
  return tokens

# define supported languages
# only questions whose 'language' field is one of these are kept by import_questions
supported_languages = ['english', 'arabic', 'finnish', 'korean']

# answers (upper-cased) that make a question a binary yes/no question
binary_labels = ['YES', 'NO']

# helper function to return all relevant properties
def relevant_properties(question):
  """Reduce a raw question record to the fields used in this notebook.

  Args:
    question: dict with the keys ``question_text``, ``document_plaintext``
      and ``annotations``; only the first annotation is looked at.

  Returns:
    dict with the keys ``question``, ``document`` and ``answer``, where
    ``answer`` is the first annotation's ``yes_no_answer`` in upper case.
  """
  # look the fields up in a fixed order, then assemble the result
  question_text = question['question_text']
  document_text = question['document_plaintext']
  first_annotation = question['annotations'][0]

  return dict(
    question=question_text,
    document=document_text,
    answer=first_annotation['yes_no_answer'].upper(),
  )

# helper function to import questions from given file
def import_questions(file):
  """Group the yes/no questions of a JSON-lines dataset by language.

  Args:
    file: iterable of strings, each holding one JSON-encoded question
      record (for example an open text file).

  Returns:
    dict with one key per entry of ``supported_languages``; each value is
    the list of raw question records in that language whose answer (as
    returned by ``relevant_properties``) is in ``binary_labels``. Records
    in any other language are dropped.
  """
  by_language = {language: [] for language in supported_languages}

  for raw_line in file:
    record = json.loads(raw_line)
    language = record['language']

    # skip questions in languages we do not support
    if language not in supported_languages:
      continue

    # skip questions that have no yes/no answer
    if relevant_properties(record)['answer'] not in binary_labels:
      continue

    by_language[language].append(record)

  return by_language

# questions used for training our classifier(s)
# The dataset is multilingual, so the files are opened explicitly as UTF-8
# instead of relying on the platform's default text encoding.
with open("/content/drive/My Drive/NLP 2020W/tydiqa-v1.0-train.jsonl", encoding="utf-8") as file:
  train_questions = import_questions(file)

# questions used to evaluate our classifier(s)
with open("/content/drive/My Drive/NLP 2020W/tydiqa-v1.0-dev.jsonl", encoding="utf-8") as file:
  dev_questions = import_questions(file)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## 1 Introduction to NLP

### 1.1 Preprocessing and dataset analysis

#### (a) Preprocessing pipeline

In [4]:
# quick check of the tokenizer on an English example sentence
tokenize_at_word_level("This method may be used to tokenize sentences.")

['This', 'method', 'may', 'be', 'used', 'to', 'tokenize', 'sentences', '.']

#### (b) Most common first tokens and common question words

In [5]:
def first_tokens(questions):
  """Count, per language, how often each token opens a question.

  Args:
    questions: dict mapping a language to a list of question records,
      each with a ``question_text`` entry.

  Returns:
    dict mapping each language to a dict ``{first token: number of
    questions starting with it}``.

  Raises:
    IndexError: if a question text yields no tokens at all.
  """
  counts_by_language = {}

  for lang, lang_questions in questions.items():
    token_counts = counts_by_language.setdefault(lang, {})
    for question in lang_questions:
      opening_token = tokenize_at_word_level(question['question_text'])[0]
      # unseen tokens start at 0, so the first sighting is stored as 1
      token_counts[opening_token] = token_counts.get(opening_token, 0) + 1

  return counts_by_language

# NOTE(review): this displays the complete nested dictionary, which is very long for some languages
first_tokens(train_questions)

{'arabic': {'أصبحت': 1,
  'الستيرويد': 1,
  'ما': 1,
  'متى': 1,
  'من': 2,
  'نشأت': 1,
  'هل': 1369,
  'هي': 3,
  'يشجع': 1},
 'english': {'Are': 58,
  'Can': 49,
  'Did': 65,
  'Do': 49,
  'Does': 74,
  'Has': 16,
  'Have': 3,
  'How': 1,
  'IS': 1,
  'In': 1,
  'Is': 201,
  'Was': 33,
  'Were': 5,
  'When': 1,
  'Which': 1,
  'Will': 3,
  'can': 1},
 'finnish': {'Aiheuttaako': 4,
  'Alentavatko': 1,
  'Asui': 1,
  'Asuiko': 1,
  'Asuivatko': 1,
  'Asuuko': 4,
  'Asuvatko': 1,
  'Auttoiko': 1,
  'Elääkö': 8,
  'Erikoistuivatko': 1,
  'Johtuuko': 1,
  'Julistivatko': 1,
  'Juontuvatko': 1,
  'Järjestetäänkö': 1,
  'Kaatuiko': 1,
  'Kannattaako': 1,
  'Kasvaako': 5,
  'Kertooko': 1,
  'Kiduttivatko': 1,
  'Kiinnitetäänkö': 1,
  'Kuinka': 1,
  'Kuljettaako': 1,
  'Kun': 1,
  'Kuoleeko': 1,
  'Kuolivatko': 1,
  'Kuuluuko': 17,
  'Käytettiinkö': 1,
  'Käytetäänkö': 3,
  'Lasketaanko': 1,
  'Laulavatko': 1,
  'Liikkuvatko': 1,
  'Lopettiko': 1,
  'Luetaanko': 1,
  'Löytyykö': 1,
  'Makset

### 1.2 Binary Question Classification

We chose a simple logistic regression model. Its input is a bag-of-words representation (token counts) of the question concatenated with the document.

In [6]:
# return number of occurrences of words as feature dictionary
def features(text, lang):
  """Build a bag-of-words feature dictionary for a text.

  Args:
    text: the text to turn into features.
    lang: accepted for interface symmetry but not used by this function.

  Returns:
    ``defaultdict(float)`` mapping each token of ``text`` to the number
    of times it occurs (as a float).
  """
  token_counts = defaultdict(float)
  for token in tokenize_at_word_level(text):
    token_counts[token] = token_counts[token] + 1.0
  return token_counts

In [7]:
# train logistic regression classifier
def train_binary_log_reg(lang):
  """Train a logistic regression classifier for one language and predict its dev set.

  The classifier is trained on token-count features of the concatenated
  question and document text of ``train_questions[lang]``.

  Args:
    lang: language key into ``train_questions`` and ``dev_questions``.

  Returns:
    The predicted answer labels (in their original string form) for
    ``dev_questions[lang]``, in the same order as the dev questions.
  """
  train_data = [relevant_properties(q) for q in train_questions[lang]]
  dev_data = [relevant_properties(q) for q in dev_questions[lang]]

  # the model input is the question followed by the document text
  def text_features(example):
    return features(example['question'] + " " + example['document'], lang)

  vectorizer = DictVectorizer()
  train_x = vectorizer.fit_transform([text_features(q) for q in train_data])
  dev_x = vectorizer.transform([text_features(q) for q in dev_data])

  # Fit the label encoder on the TRAINING labels only. Re-fitting it on the
  # dev labels would replace the label <-> integer mapping the model was
  # trained with, so inverse_transform below could decode predictions to the
  # wrong label (or fail) whenever the dev set's label set differs from the
  # training set's.
  label_encoder = LabelEncoder()
  train_y = label_encoder.fit_transform([q['answer'] for q in train_data])

  lr = LogisticRegression(C=1000, penalty="l1", random_state=1, solver='liblinear')
  lr.fit(train_x, train_y)

  # inverse_transform transforms labels back to original encoding
  return label_encoder.inverse_transform(lr.predict(dev_x))

In [8]:
%%time
# NOTE(review): accuracy_score and f1_score are already imported in the reproducibility cell above
from sklearn.metrics import accuracy_score, f1_score

# train binary classifier for all languages and evaluate the predictions
for lang in supported_languages:
  # predicted labels for this language's dev questions
  predictions = train_binary_log_reg(lang)
  # gold labels, built from the same dev questions in the same order
  actual = [relevant_properties(q)['answer'] for q in dev_questions[lang]]

  print('Accuracy for language {}: {}'.format(lang, accuracy_score(actual, predictions)))
  # average='weighted': per-class F1 scores averaged with class support as weights
  print('F1 score for language {}: {}'.format(lang, f1_score(actual, predictions, average='weighted')))

Accuracy for language english: 0.5194805194805194
F1 score for language english: 0.503806081787733
Accuracy for language arabic: 0.813953488372093
F1 score for language arabic: 0.813953488372093
Accuracy for language finnish: 0.723404255319149
F1 score for language finnish: 0.6412424813717568
Accuracy for language korean: 0.9354838709677419
F1 score for language korean: 0.9043010752688171
CPU times: user 1min 35s, sys: 409 ms, total: 1min 35s
Wall time: 1min 35s
