Import Test Data

In [9]:
import pandas as pd
import json

with open("../week-41/question.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

df_test = pd.DataFrame(test_data["questions"])

In [10]:
# other imports
import nltk
nltk.download('punkt_tab')      
nltk.download('wordnet')    
nltk.download('omw-1.4') 
nltk.download('averaged_perceptron_tagger_eng')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter
import torch
import re
from tqdm import tqdm
import numpy as np


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/sarene/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sarene/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/sarene/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/sarene/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


## Part 1: Rule-based Classifier 

In [11]:
lemmatizer = WordNetLemmatizer()

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

device = "cpu"
model.to(device)

lang_codes = {
    "pt": "por_Latn",
    "ja": "jpn_Jpan",
    "hi": "hin_Deva"
}

def translate_to_en(texts, src_lang):
    tokenizer.src_lang = lang_codes[src_lang]
    outputs = []
    
    for i in tqdm(range(0, len(texts), 8), desc=f"Translating {src_lang}"):
        batch = texts[i:i+8]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)
        
        bos_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")
        
        out = model.generate(**inputs, forced_bos_token_id=bos_token_id)
        outputs.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
    
    return outputs

def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    tokens = text.split()
    stopwords = {"the","a","an","this","that","those",      # articles
    "is","was","has","have","had","be",                     # verbs
    "been","do","does","did","are","were"                   
    "in","on","at","of","to","for","with",                  # prepositions
    "into","from","above","below","before","after",
    "and","or","but",                                       # conjunctions
    "there","their","its","it",                             # possessive/pronouns
    "who","what","when","where","why","how","which"         # question words
    }
    return set(lemmatizer.lemmatize(t) for t in tokens if t not in stopwords)

def predict_answerable(question, context):
    a = preprocess(question)
    b = preprocess(context)
    
    return 1 if set(a) <= set(b) else 0

results = {}

for lang in df_test["lang"].unique():
    subset = df_test[df_test["lang"] == lang].copy()
    
    subset["question_en"] = translate_to_en(subset["question"].tolist(), lang)
    subset["context_en"] = translate_to_en(subset["context"].tolist(), lang)
    
    subset["pred"] = [predict_answerable(q, c) for q, c in tqdm(zip(subset["question_en"], subset["context_en"]), total=len(subset), desc=f"Predicting {lang}")]
    
    acc = accuracy_score(subset["answerable"], subset["pred"])
    prec, rec, f1, _ = precision_recall_fscore_support(subset["answerable"], subset["pred"], average="binary")
    
    results[lang] = {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}

print("Performance by language:")
for lang, metrics in results.items():
    print(lang, metrics)

Translating pt: 100%|██████████| 2/2 [00:01<00:00,  1.18it/s]
Translating pt: 100%|██████████| 2/2 [00:15<00:00,  7.96s/it]
Predicting pt: 100%|██████████| 10/10 [00:00<00:00, 8718.15it/s]
Translating ja: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
Translating ja: 100%|██████████| 2/2 [00:15<00:00,  7.83s/it]
Predicting ja: 100%|██████████| 10/10 [00:00<00:00, 8533.68it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
Translating hi: 100%|██████████| 2/2 [00:01<00:00,  1.19it/s]
Translating hi: 100%|██████████| 2/2 [00:14<00:00,  7.09s/it]
Predicting hi: 100%|██████████| 10/10 [00:00<00:00, 8943.08it/s]

Performance by language:
pt {'accuracy': 0.6, 'precision': 1.0, 'recall': 0.5, 'f1': 0.6666666666666666}
ja {'accuracy': 0.2, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
hi {'accuracy': 0.3, 'precision': 0.6666666666666666, 'recall': 0.25, 'f1': 0.36363636363636365}





## Part 3: Trained Answerability Classifier

## Part 4: Open QA

## Part 5: Sequence Labeler 

In [12]:
#