Data Fetch

In [7]:
import pandas as pd
import regex as re
from datasets import load_dataset

# Pull the TyDi XOR reading-comprehension dataset and restrict it to the
# languages studied in this notebook.
dataset = load_dataset("coastalcph/tydi_xor_rc")
languages = ['ar', 'ko', 'te']


def _is_target_language(example):
    """Return True when the example's `lang` field is one of `languages`."""
    return example['lang'] in languages


# Filter each split first, then convert it to a DataFrame.
train = dataset["train"].filter(_is_target_language).to_pandas()
val = dataset["validation"].filter(_is_target_language).to_pandas()

# One DataFrame per language code, for the training and validation splits.
train_data = dict((code, train.loc[train['lang'] == code]) for code in languages)
val_data = dict((code, val.loc[val['lang'] == code]) for code in languages)


Try three classifiers: Logistic Regression / Linear SVM / Naive Bayes (the first two also with balanced class weights)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

# Seed for estimators that rely on a pseudo-random generator while fitting.
RANDOM_STATE = 42

# Classifier configurations compared in the experiments, keyed by the label
# used in the printed score tables. Insertion order is the print order.
# The "w Bal" variants pass class_weight="balanced" to the same estimator.
# LinearSVC receives a fixed random_state so that repeated runs of the
# notebook produce the same scores.
models = {
    "LogReg": LogisticRegression(max_iter=1000),
    "SVM": LinearSVC(random_state=RANDOM_STATE),
    "LogReg w Bal": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "SVM w Bal": LinearSVC(class_weight="balanced", random_state=RANDOM_STATE),
    "NB": MultinomialNB()
}

Training and Evaluation for word n-gram based models

In [26]:
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter

# Word-level TF-IDF experiment: for every language, fit each classifier in
# `models` on the concatenated "context question" text and score it on the
# validation split.
#
# The scores are stored as {lang: {model name: {"accuracy", "macro_f1"}}}.
# They are kept under `word_results` in addition to `results`, because
# `results` is rebound by the character n-gram experiment further down.
word_results = {}
results = word_results

for lang in languages:
    print(f"{lang}: ")
    train_df, val_df = train_data[lang], val_data[lang]

    # TF-IDF weights over word unigrams, bigrams and trigrams
    vec = TfidfVectorizer(analyzer="word", ngram_range=(1,3), max_features=20000)

    # The vocabulary is fitted on the training text only; the validation text
    # is projected onto that same vocabulary.
    x_train = vec.fit_transform(train_df["context"] + " " + train_df["question"])
    y_train = train_df["answerable"].astype(int).values

    x_val = vec.transform(val_df["context"] + " " + val_df["question"])
    y_val = val_df["answerable"].astype(int).values

    print("Train class distribution:", Counter(y_train))
    print("Val class distribution:", Counter(y_val))
    print("-----------------------------------------------")

    results[lang] = {}

    for name, model in models.items():
        # The estimator instances in `models` are shared, so each call to
        # fit() re-trains them on the current language's data.
        model.fit(x_train, y_train)
        y_pred = model.predict(x_val)

        acc = accuracy_score(y_val, y_pred)
        # Macro-averaged F1 weighs both classes equally.
        f1 = f1_score(y_val, y_pred, average="macro")

        print(f"{name:>12} | Accuracy: {acc:.4f} | F1: {f1:.4f}")

        results[lang][name] = {"accuracy": acc, "macro_f1": f1}


ar: 
Train class distribution: Counter({1: 2303, 0: 255})
Val class distribution: Counter({1: 363, 0: 52})
-----------------------------------------------
      LogReg | Accuracy: 0.8747 | F1: 0.4666
         SVM | Accuracy: 0.9060 | F1: 0.6745
LogReg w Bal | Accuracy: 0.9229 | F1: 0.8115
   SVM w Bal | Accuracy: 0.9325 | F1: 0.8019
          NB | Accuracy: 0.8747 | F1: 0.4666
ko: 
Train class distribution: Counter({1: 2359, 0: 63})
Val class distribution: Counter({1: 337, 0: 19})
-----------------------------------------------
      LogReg | Accuracy: 0.9466 | F1: 0.4863
         SVM | Accuracy: 0.9466 | F1: 0.4863
LogReg w Bal | Accuracy: 0.9382 | F1: 0.5257
   SVM w Bal | Accuracy: 0.9466 | F1: 0.4863
          NB | Accuracy: 0.9466 | F1: 0.4863
te: 
Train class distribution: Counter({1: 1310, 0: 45})
Val class distribution: Counter({1: 291, 0: 93})
-----------------------------------------------
      LogReg | Accuracy: 0.7578 | F1: 0.4311
         SVM | Accuracy: 0.7552 | F1: 0.47

Training and Evaluation for character n-grams

In [33]:
# Character-level TF-IDF experiment: for every language, fit each classifier
# in `models` on the concatenated "context question" text and score it on the
# validation split.
#
# The scores are stored as {lang: {model name: {"accuracy", "macro_f1"}}}.
# They are available as `char_results` as well as `results`, so they keep a
# stable name if `results` is reused by another experiment.
char_results = {}
results = char_results

for lang in languages:
    print(f"{lang}: ")
    train_df, val_df = train_data[lang], val_data[lang]

    # TF-IDF weights over character n-grams of length 2 to 40
    # NOTE(review): such a wide range enumerates a very large number of
    # candidate n-grams before max_features trims the vocabulary; consider
    # narrowing it if fitting is slow.
    vec = TfidfVectorizer(analyzer="char", ngram_range=(2,40), max_features=20000)

    # The vocabulary is fitted on the training text only; the validation text
    # is projected onto that same vocabulary.
    x_train = vec.fit_transform(train_df["context"] + " " + train_df["question"])
    y_train = train_df["answerable"].astype(int).values

    x_val = vec.transform(val_df["context"] + " " + val_df["question"])
    y_val = val_df["answerable"].astype(int).values

    print("-----------------------------------------------")

    results[lang] = {}

    for name, model in models.items():
        # The estimator instances in `models` are shared, so each call to
        # fit() re-trains them on the current language's data.
        model.fit(x_train, y_train)
        y_pred = model.predict(x_val)

        acc = accuracy_score(y_val, y_pred)
        # Macro-averaged F1 weighs both classes equally.
        f1 = f1_score(y_val, y_pred, average="macro")

        print(f"{name:>12} | Accuracy: {acc:.4f} | F1: {f1:.4f}")

        results[lang][name] = {"accuracy": acc, "macro_f1": f1}


ar: 
-----------------------------------------------
      LogReg | Accuracy: 0.8747 | F1: 0.4666
         SVM | Accuracy: 0.9133 | F1: 0.7117
LogReg w Bal | Accuracy: 0.8988 | F1: 0.7765
   SVM w Bal | Accuracy: 0.9349 | F1: 0.8301
          NB | Accuracy: 0.8747 | F1: 0.4666
ko: 
-----------------------------------------------
      LogReg | Accuracy: 0.9466 | F1: 0.4863
         SVM | Accuracy: 0.9466 | F1: 0.4863
LogReg w Bal | Accuracy: 0.9185 | F1: 0.5643
   SVM w Bal | Accuracy: 0.9466 | F1: 0.5339
          NB | Accuracy: 0.9466 | F1: 0.4863
te: 
-----------------------------------------------
      LogReg | Accuracy: 0.7578 | F1: 0.4311
         SVM | Accuracy: 0.7578 | F1: 0.4870
LogReg w Bal | Accuracy: 0.7318 | F1: 0.5989
   SVM w Bal | Accuracy: 0.7630 | F1: 0.5812
          NB | Accuracy: 0.7578 | F1: 0.4311
