In [9]:
import regex as re
import pandas as pd

splits = {'train': 'train.parquet', 'validation': 'validation.parquet'}
df_train = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["train"])
df_val = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["validation"])

print(f"Train size:",df_train["lang"].value_counts())
print(f"Validation size:",df_val["lang"].value_counts())

Train size: lang
bn    2598
ar    2558
ko    2422
ja    2301
fi    2126
ru    1983
te    1355
Name: count, dtype: int64
Validation size: lang
fi    528
bn    476
ja    456
ar    415
ru    396
te    384
ko    356
Name: count, dtype: int64


In [10]:
print("First Columns", df_train.head(1))

First Columns                                             question  \
0  উইকিলিকস কত সালে সর্বপ্রথম ইন্টারনেটে প্রথম তথ...   

                                             context lang  answerable  \
0  WikiLeaks () is an international non-profit or...   bn        True   

   answer_start answer answer_inlang  
0           182   2006          None  


#### TO DO

#### GENERAL
- [✅] Shape
- [✅] Word Count
- [✅] Token Count

#### SPECIFIC (For each language)🙈🙈🙈🙈
- [✅] 5 Most common words + English translation
- [✅] Analyze type of words
- [] Rule based classifier (answerable or not)
- [] Performance Evaluation (answerable field) 

In [11]:
l = ["ar", "ko", "te", "en"]
df_train = df_train[df_train["lang"].isin(l)]
df_val = df_val[df_val["lang"].isin(l)]

In [12]:
print(f"Train shape: {df_train.shape}")
print(f"Validation shape: {df_val.shape}")

train_lan = df_train['lang'].value_counts()
val_len = df_val['lang'].value_counts()
print(train_lan)
print(val_len) 

Train shape: (6335, 7)
Validation shape: (1155, 7)
lang
ar    2558
ko    2422
te    1355
Name: count, dtype: int64
lang
ar    415
te    384
ko    356
Name: count, dtype: int64


### Word Count

In [13]:
ar, ko, te = df_train[df_train["lang"] == "ar"], df_train[df_train["lang"] == "ko"], df_train[df_train["lang"] == "te"]

def word_list(df):
    words = [re.findall(r'\w+', quest) for quest in df["question"]]
    
    return [w for q in words for w in q]

ar_words = word_list(ar)
ko_words = word_list(ko)
te_words = word_list(te)
print(f"Word counts for Arabic: {len(ar_words)}")
print(f"Word counts for Korean: {len(ko_words)}")
print(f"Word counts for Telugu: {len(te_words)}")


Word counts for Arabic: 16202
Word counts for Korean: 11863
Word counts for Telugu: 7690


### English Word Count

In [14]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from tqdm import tqdm
import pandas as pd 

torch.set_default_device('cuda')

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

print(ko["question"].tolist()[:5])  

tokenizer.src_lang = "kor_Hang"
forced_bos_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")

translations = []
batch_size = 16 

for i in tqdm(range(0, len(ko), batch_size), desc="Translating"):
    batch = ko["question"].iloc[i:i+batch_size].tolist()
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to("cuda")
    outputs = model.generate(**inputs, forced_bos_token_id=forced_bos_token_id)
    trans = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    translations.extend(trans)

ko["translation"] = translations  

print(pd.Series(translations).value_counts())


['30년 전쟁의 승자는 누구인가?', '엑스선은 누가 발견하였는가?', '아테네에서 언제 가장 최근의 올림픽이 올렸나요?', '세상에서 가장 오래된 방송사는 무엇인가?', '팔레스타인 수도는 어딘가요?']


Translating: 100%|██████████| 152/152 [01:27<00:00,  1.73it/s]

What is the largest star on Earth?                 3
How many countries have served in World War II?    3
What is the largest bone in the human body?        3
How many communist countries are there in 2019?    3
When did the lead singer of N.EX.T. die?           2
                                                  ..
When was the first year that PSP was released?     1
What is the biggest dinosaur?                      1
How many World Cups has France hosted?             1
What's the biggest city in China?                  1
Who was the founder of the Nazi Party?             1
Name: count, Length: 2367, dtype: int64



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ko["translation"] = translations


### Token Count

In [15]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")

def token_count(text):
    return len(tokenizer.tokenize(text))

results = []
for lang_name, df in [("ar", ar), ("ko", ko), ("te", te)]:
    results.append({
        "Language": lang_name,
        "Token Count": df["question"].apply(token_count).sum(),
    })

token_counts = pd.DataFrame(results)
print(token_counts)

  Language  Token Count
0       ar        33733
1       ko        25829
2       te        18365


### Word Count

In [None]:
# 5 Most common words 🙈
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import arabic_reshaper
from bidi.algorithm import get_display


ar_text = " ".join(ar_words)
reshaped_text = arabic_reshaper.reshape(ar_text)
bidi_text = get_display(reshaped_text)
ar_wordcloud = WordCloud(font_path='arial.ttf', background_color='white').generate(bidi_text)

ko_text = " ".join(ko_words)
ko_wordcloud = WordCloud(font_path='malgun.ttf', background_color='white').generate(ko_text)

te_text = " ".join(te_words)
te_wordcloud = WordCloud(font_path='gautami.ttf', background_color='white').generate(te_text)

plt.figure(figsize=(15, 5), dpi=600)

plt.subplot(1, 3, 1)
plt.imshow(ar_wordcloud, interpolation='bilinear')
plt.title('Arabic')
plt.axis('off')

plt.subplot(1, 3, 2)
plt.imshow(ko_wordcloud, interpolation='bilinear')
plt.title('Korean')
plt.axis('off')

plt.subplot(1, 3, 3)
plt.imshow(te_wordcloud, interpolation='bilinear')
plt.title('Telugu')
plt.axis('off')

plt.show()

### Translation

In [19]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

ko = ko.sample(5)
print(ko["question"].tolist())

tokenizer.src_lang = "kor_Hang"
inputs = tokenizer(ko["question"].tolist(), return_tensors="pt", padding=True)
forced_bos_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")

outputs = model.generate(**inputs, forced_bos_token_id=forced_bos_token_id)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))


['드래곤 신화는 어느 나라에서 시작 되었는가?', '저영향 개발의 장점은 무엇인가?', '역사상 가장 많은 사상자를 불러온 전투는 무엇인가?', '미국 남북전쟁은 언제 끝났는가?', '오스만 제국의 1대 왕은 누구인가요?']
['In which country did the myth of the dragon originate?', "What's the advantage of low-level development?", 'What is the most deadly battle in history?', 'When did the American Civil War end?', 'Who was the first king of the Ottoman Empire?']


In [18]:
# RULES
# 
# Who             (Proper Nouns)
# What            (???)
# When            (Dates)
# Where           (Proper Nouns)
# Why             (Because)
# How             (Bet on Yes)
# Whose           (Proper Nouns)
# Which           (Bet on Yes)
# How many/much   (Quantity)
#

In [30]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import re
from tqdm import tqdm

splits = {'train': 'train.parquet', 'validation': 'validation.parquet'}
df_val = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["validation"])

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

lang_codes = {"ar": "ara_Arab", "ko": "kor_Hang", "te": "tel_Telu"}

def translate_to_en(texts, src_lang):
    tokenizer.src_lang = lang_codes[src_lang]
    outputs = []
    
    for i in tqdm(range(0, len(texts), 8), desc=f"Translating {src_lang}"):
        batch = texts[i:i+8]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)
        
        bos_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")
        
        out = model.generate(**inputs, forced_bos_token_id=bos_token_id)
        outputs.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
    
    return outputs

def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    tokens = text.split()
    stopwords = {"the","is","a","an","in","on","at","and","of","to","for","with"}
    return set(t for t in tokens if t not in stopwords)

def predict_answerable(question, context):
    return 1 if len(preprocess(question) & preprocess(context)) > 0 else 0

results = {}

for lang in ["ar", "ko", "te"]:
    subset = df_val[df_val["lang"] == lang].copy()
    
    subset["question_en"] = translate_to_en(subset["question"].tolist(), lang)
    subset["context_en"] = translate_to_en(subset["context"].tolist(), lang)
    
    subset["pred"] = [predict_answerable(q, c) for q, c in tqdm(zip(subset["question_en"], subset["context_en"]), total=len(subset), desc=f"Predicting {lang}")]
    
    acc = accuracy_score(subset["answerable"], subset["pred"])
    prec, rec, f1, _ = precision_recall_fscore_support(subset["answerable"], subset["pred"], average="binary")
    
    results[lang] = {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}

print("Performance by language:")
for lang, metrics in results.items():
    print(lang, metrics)

Translating ar: 100%|██████████| 52/52 [01:12<00:00,  1.39s/it]
Translating ar: 100%|██████████| 52/52 [45:24<00:00, 52.39s/it]
Predicting ar: 100%|██████████| 415/415 [00:00<00:00, 29342.17it/s]
Translating ko: 100%|██████████| 45/45 [00:57<00:00,  1.28s/it]
Translating ko: 100%|██████████| 45/45 [37:46<00:00, 50.37s/it]
Predicting ko: 100%|██████████| 356/356 [00:00<00:00, 29267.56it/s]
Translating te: 100%|██████████| 48/48 [01:04<00:00,  1.35s/it]
Translating te: 100%|██████████| 48/48 [33:19<00:00, 41.65s/it]
Predicting te: 100%|██████████| 384/384 [00:00<00:00, 32483.82it/s]

Performance by language:
ar {'accuracy': 0.7975903614457831, 'precision': 0.8780487804878049, 'recall': 0.8925619834710744, 'f1': 0.8852459016393442}
ko {'accuracy': 0.8258426966292135, 'precision': 0.9508196721311475, 'recall': 0.8605341246290801, 'f1': 0.9034267912772586}
te {'accuracy': 0.7864583333333334, 'precision': 0.8028985507246377, 'recall': 0.9518900343642611, 'f1': 0.8710691823899371}



