## Import Test Data

In [2]:
import pandas as pd
import json

# Load the evaluation set. The encoding is pinned to UTF-8 because the questions
# and contexts are non-English text (the "lang" column is later mapped to
# Portuguese/Japanese/Hindi codes); without it open() falls back to the
# platform's locale encoding and may fail or mis-decode the file.
with open("../week-41/question.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Build the frame from the list stored under the top-level "questions" key.
df_test = pd.DataFrame(test_data["questions"])

## Part 1: Rule-based Classifier 

In [None]:
import re

import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Single source of truth for the checkpoint so tokenizer and model cannot drift apart.
MODEL_NAME = "facebook/nllb-200-distilled-600M"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Choose the best available accelerator instead of hard-coding "mps", which
# makes model.to(device) fail on machines without Apple's Metal backend.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

# Dataset language tag -> language code assigned to tokenizer.src_lang
# by translate_to_en.
lang_codes = {
    "pt": "por_Latn",
    "ja": "jpn_Jpan",
    "hi": "hin_Deva"
}

def translate_to_en(texts, src_lang, batch_size=8):
    """Translate a list of texts into English with the module-level NLLB model.

    Args:
        texts: Sliceable sequence of strings written in ``src_lang``.
        src_lang: Key of ``lang_codes`` identifying the source language
            (e.g. ``"pt"``).
        batch_size: Number of texts sent to ``model.generate`` per call.
            Defaults to 8, the value previously hard-coded.

    Returns:
        A list with the decoded output of every batch, in batch order.
        An empty ``texts`` yields an empty list.

    Raises:
        KeyError: If ``src_lang`` is not a key of ``lang_codes``.
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return [].
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Side effect: reconfigures the shared tokenizer for this source language.
    tokenizer.src_lang = lang_codes[src_lang]

    # The target-language token is identical for every batch, so resolve it once
    # instead of on each loop iteration.
    bos_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")

    outputs = []
    for start in tqdm(range(0, len(texts), batch_size), desc=f"Translating {src_lang}"):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)

        # Force the first generated token to the English language tag.
        out = model.generate(**inputs, forced_bos_token_id=bos_token_id)
        outputs.extend(tokenizer.batch_decode(out, skip_special_tokens=True))

    return outputs

def preprocess(text):
    """Reduce a text to its set of distinct content tokens.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is deleted (not replaced by a space, so "well-known" becomes
    "wellknown"), the result is split on whitespace, and stopwords are dropped.

    Args:
        text: Input string.

    Returns:
        A set of the remaining tokens; empty when nothing survives.
    """
    stopwords = {
        "the", "is", "was", "did", "does", "a", "an",
        "in", "on", "at", "and", "of", "to", "for", "with",
        "who", "what", "when", "where", "why", "how",
    }
    cleaned = re.sub(r"[^a-z0-9\s]", "", text.lower())
    return {token for token in cleaned.split() if token not in stopwords}

def predict_answerable(question, context):
    """Rule-based answerability guess from token overlap.

    Args:
        question: Question text.
        context: Context passage text.

    Returns:
        1 when the preprocessed question and context share at least one
        token, otherwise 0.
    """
    question_tokens = preprocess(question)
    context_tokens = preprocess(context)
    return 0 if question_tokens.isdisjoint(context_tokens) else 1

# Evaluate the rule-based classifier separately for each language in the test set.
results = {}

for lang in df_test["lang"].unique():
    # Independent copy so the columns added below never touch df_test.
    subset = df_test.loc[df_test["lang"] == lang].copy()

    # preprocess() only keeps a-z/0-9 tokens and English stopwords, so both
    # fields are translated to English before comparing them.
    questions_src = subset["question"].tolist()
    subset["question_en"] = translate_to_en(questions_src, lang)
    contexts_src = subset["context"].tolist()
    subset["context_en"] = translate_to_en(contexts_src, lang)

    pairs = zip(subset["question_en"], subset["context_en"])
    subset["pred"] = [
        predict_answerable(q, c)
        for q, c in tqdm(pairs, total=len(subset), desc=f"Predicting {lang}")
    ]

    y_true = subset["answerable"]
    y_pred = subset["pred"]
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")

    results[lang] = {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}

# Report one line of metrics per language.
print("Performance by language:")
for lang, metrics in results.items():
    print(lang, metrics)

Translating pt: 100%|██████████| 2/2 [00:01<00:00,  1.85it/s]
Translating pt: 100%|██████████| 2/2 [00:09<00:00,  4.52s/it]
Predicting pt: 100%|██████████| 10/10 [00:00<00:00, 57772.78it/s]
Translating ja: 100%|██████████| 2/2 [00:01<00:00,  1.65it/s]
Translating ja: 100%|██████████| 2/2 [00:08<00:00,  4.38s/it]
Predicting ja: 100%|██████████| 10/10 [00:00<00:00, 63743.22it/s]
Translating hi: 100%|██████████| 2/2 [00:00<00:00,  2.27it/s]
Translating hi: 100%|██████████| 2/2 [00:08<00:00,  4.02s/it]
Predicting hi: 100%|██████████| 10/10 [00:00<00:00, 50412.31it/s]

Performance by language:
pt {'accuracy': 0.8, 'precision': 0.8, 'recall': 1.0, 'f1': 0.8888888888888888}
ja {'accuracy': 0.8, 'precision': 0.875, 'recall': 0.875, 'f1': 0.875}
hi {'accuracy': 0.9, 'precision': 0.8888888888888888, 'recall': 1.0, 'f1': 0.9411764705882353}





## Part 3: Trained Answerability Classifier

## Part 4: Open QA

## Part 5: Sequence Labeler 

In [None]:
#