# DIKU NLP Course 2020/2021: Group Project

## Preparations

#### Mount Google Drive (datasets are stored there):

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Enable reproducability

Taken from https://nbviewer.jupyter.org/github/copenlu/stat-nlp-book/blob/master/labs/lab_2.ipynb

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from collections import defaultdict
import numpy as np
import random
import torch

def enforce_reproducibility(seed=42):
  # Sets seed manually for both CPU and CUDA
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

  # For atomic operations there is currently no simple way to enforce 
  # determinism, as the order of parallel operations is not known.
  # CUDNN
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
  
  # System based
  random.seed(seed)
  np.random.seed(seed)

enforce_reproducibility()

#### Load relevant questions from dataset, also download nltk word tokens

In [3]:
import json
import nltk
nltk.download('punkt')

# we use nltk to tokenize multi-lingual sequences
def tokenize_at_word_level(input):
  return nltk.tokenize.word_tokenize(input)

# define supported languages
supported_languages = ['english', 'arabic', 'finnish', 'korean']

binary_labels = ['YES', 'NO']

# helper function to return all relevant properties
def relevant_properties(question):
  return {
    "question": question['question_text'],
    "document": question['document_plaintext'],
    "answer": question['annotations'][0]['yes_no_answer'].upper()
  }

# helper function to import questions from given file
def import_questions(file):
  questions = {}

  for lang in supported_languages:
    questions[lang] = []

  for line in file:
    question = json.loads(line)
    lang = question['language']

    # add question if dict contains key for it and it has yes/no answer 
    if (lang in list(questions.keys()) and
        relevant_properties(question)['answer'] in binary_labels
      ):
      questions[lang].append(question)

  return questions

# questions used for training our classifier(s)
with open("/content/drive/My Drive/NLP 2020W/tydiqa-v1.0-train.jsonl") as file:
  train_questions = import_questions(file)

# questions used to evaluate our classifier(s)
with open("/content/drive/My Drive/NLP 2020W/tydiqa-v1.0-dev.jsonl") as file:
  dev_questions = import_questions(file)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## 2 Representation Learning

#### (a) Vector representations

In assignment 2 (a) we extend the classifier to use features based on the continous vector representation of words. For that we train a Word2Vec model on all the words in the training dataset. We then use the vector representations of the individual words as inputs to the classifier.

In [4]:
%%time
from gensim.models import Word2Vec

w2v_models = {}

for lang in supported_languages:
  print("Started training language: " + lang)
  question_list = [relevant_properties(q) for q in train_questions[lang]]

  inputs = [tokenize_at_word_level(q["question"] + " " + q['document']) for q in question_list]

  # Create CBOW model
  w2v_models[lang] = Word2Vec(inputs, min_count = 1, size = 4, window = 5)

  # Create Skip Gram model
  # w2v_models[lang] = Word2Vec(data, min_count = 1, size = 32, window = 5, sg = 1)
  
  print("Finished training language: " + lang)

Started training language: english
Finished training language: english
Started training language: arabic
Finished training language: arabic
Started training language: finnish
Finished training language: finnish
Started training language: korean
Finished training language: korean
CPU times: user 5min 40s, sys: 2.04 s, total: 5min 42s
Wall time: 4min 49s


In [5]:
# get the features as vector representations
def features_vr(text, lang):
  features = defaultdict(float)
  for w in tokenize_at_word_level(text):
    try:
      # take max value of word2vec as representation
      vector_repr = max(w2v_models[lang][w])
    except:
      vector_repr = 0
    features[vector_repr] += 1.0
  return features

In [6]:
# train logistic regression classifier using vector word representations
def train_binary_log_reg_vec(lang):
  train_data = [relevant_properties(q) for q in train_questions[lang]]
  dev_data = [relevant_properties(q) for q in dev_questions[lang]]

  vectorizer = DictVectorizer()

  # we again use the concatenated question and document text as features
  train_x = vectorizer.fit_transform([features_vr(q['question'] + " " + q['document'], lang) for q in train_data])
  dev_x = vectorizer.transform([features_vr(q['question'] + " " + q['document'], lang) for q in dev_data])

  label_encoder = LabelEncoder()
  train_y = label_encoder.fit_transform([q['answer'] for q in train_data])
  dev_y = label_encoder.fit_transform([q['answer'] for q in dev_data])

  lr = LogisticRegression(C=1000, penalty="l1", random_state=1, solver='liblinear')
  lr.fit(train_x, train_y)

  # inverse_transform transforms labels back to original encoding
  return label_encoder.inverse_transform(lr.predict(dev_x))

In [7]:
%%time
for lang in supported_languages:
  predictions = train_binary_log_reg_vec(lang)
  actual = [relevant_properties(q)['answer'] for q in dev_questions[lang]]

  print('Accuracy for language {}: {}'.format(lang, accuracy_score(actual, predictions)))
  print('F1 score for language {}: {}'.format(lang, f1_score(actual, predictions, average='weighted')))

  import sys


Accuracy for language english: 0.5064935064935064
F1 score for language english: 0.494175711567016
Accuracy for language arabic: 0.8488372093023255
F1 score for language arabic: 0.8383852929176566
Accuracy for language finnish: 0.7340425531914894
F1 score for language finnish: 0.6737796634845364
Accuracy for language korean: 0.9354838709677419
F1 score for language korean: 0.9043010752688171
CPU times: user 2min 31s, sys: 220 ms, total: 2min 31s
Wall time: 2min 31s


#### (b) Feature + vector representations

In assignment 2 (b) we combine features and vector representations. Specifically we are going to use the vector representation of the entire question text instead of a list of the vector representations of the words in the text. 

In [20]:
# represent the entire text as the maximum of its word vectors
def features_vr_max(text, lang):
  max = 0
  for w in tokenize_at_word_level(text):
    try:
      # take max value of word2vec as representation
      vector_repr = max(w2v_models[lang][w])
      if (vector_repr > max):
        max = vector_repr
    except:
      continue
    
  return {max: 1.0}

In [21]:
# train logistic regression using features and vector representations
def train_binary_log_reg_vec_max(lang):
  train_data = [relevant_properties(q) for q in train_questions[lang]]
  dev_data = [relevant_properties(q) for q in dev_questions[lang]]

  vectorizer = DictVectorizer()

  # note that the following two lines are different to 2(a)
  train_x = vectorizer.fit_transform([features_vr_max(q['question'] + " " + q['document'], lang) for q in train_data])
  dev_x = vectorizer.transform([features_vr_max(q['question'] + " " + q['document'], lang) for q in dev_data])

  label_encoder = LabelEncoder()
  train_y = label_encoder.fit_transform([q['answer'] for q in train_data])
  dev_y = label_encoder.fit_transform([q['answer'] for q in dev_data])

  lr = LogisticRegression(C=1000, penalty="l1", random_state=1, solver='liblinear')
  lr.fit(train_x, train_y)

  # inverse_transform transforms labels back to original encoding
  return label_encoder.inverse_transform(lr.predict(dev_x))

In [22]:
%%time
for lang in supported_languages:
  predictions = train_binary_log_reg_vec_max(lang)
  actual = [relevant_properties(q)['answer'] for q in dev_questions[lang]]

  print('Accuracy for language {}: {}'.format(lang, accuracy_score(actual, predictions)))
  print('F1 score for language {}: {}'.format(lang, f1_score(actual, predictions, average='weighted')))

  import sys


Accuracy for language english: 0.6753246753246753
F1 score for language english: 0.5444478002617537
Accuracy for language arabic: 0.872093023255814
F1 score for language arabic: 0.8125090278780874
Accuracy for language finnish: 0.723404255319149
F1 score for language finnish: 0.6073023377987917
Accuracy for language korean: 0.9354838709677419
F1 score for language korean: 0.9043010752688171
CPU times: user 2min 6s, sys: 22.3 ms, total: 2min 6s
Wall time: 2min 6s
