In [1]:
import fasttext
import numpy as np
from huggingface_hub import hf_hub_download
import pandas as pd
import json

In [7]:
#Load OpenLID Model
# open_model_path = hf_hub_download(repo_id="/Users/yvon/dat/LangIds/OpenLID/", filename="lid201_model.bin")
open_model_path = "/Users/yvon/dat/LangId/OpenLID/lid201-model.ftz"
open_model = fasttext.load_model(open_model_path)



In [8]:
#Checking Labeling in OpenLID
print(f"Number of Languages in OpenLID: {len(open_model.get_labels())}")
the_labels = open_model.get_labels()
[label for label in the_labels if "eu" in label or "es" in label or "spa" in label or 'eng' in label or 'tur' in label]

Number of Languages in OpenLID: 201


['__label__eng_Latn',
 '__label__est_Latn',
 '__label__pes_Arab',
 '__label__tur_Latn',
 '__label__spa_Latn',
 '__label__deu_Latn',
 '__label__eus_Latn',
 '__label__ben_Beng',
 '__label__ces_Latn',
 '__label__asm_Beng',
 '__label__mni_Beng']

In [10]:
#Load GlotLID Model
# glot_model_path = hf_hub_download(repo_id="/Users/yvon/dat/LangIds/GlotLID/", filename="model_v3.bin", cache_dir=None)
glot_model_path = "/Users/yvon/dat/LangId/GlotLID/model_v3.bin"
glot_model = fasttext.load_model(glot_model_path)



In [11]:
#Checking Labeling in GlotLID
print(f"Number of Languages in GlotLID: {len(glot_model.get_labels())}")
the_labels = glot_model.get_labels()
[label for label in the_labels if "eu" in label or "es" in label or "spa" in label]

Number of Languages in GlotLID: 2102


['__label__spa_Latn',
 '__label__deu_Latn',
 '__label__ces_Latn',
 '__label__eus_Latn',
 '__label__meu_Latn',
 '__label__wes_Latn',
 '__label__ese_Latn',
 '__label__ses_Latn',
 '__label__leu_Latn',
 '__label__des_Latn',
 '__label__esk_Latn',
 '__label__aeu_Latn',
 '__label__esu_Latn',
 '__label__esi_Latn',
 '__label__ess_Latn']

In [12]:
#Load MaskLID
import sys
sys.path.append("/Users/yvon/CNRS-DRIVE/Code/MaskLID.LMU")

from masklid import MaskLID
# mask_model_path = './model_v3.bin' #GlotLID basis(GLOTLid Latest version so it has 2000+ languages)
mask_base_glot_model = glot_model
mask_model_path = glot_model_path

masklid_model = MaskLID(mask_model_path , languages = -1)
print(len(mask_base_glot_model.get_labels()))

2102




TURKISH ENGLISH DATASET

In [14]:
# Test data path : 
csdata_path = "/Users/yvon/dat/Projects/Code-Switching/"

with open(csdata_path + "tur-eng/TR-EN CS Corpus - with language tags.txt", encoding="utf-8") as f:
    blocks = f.read().strip().split("\n\n")

# print(blocks)
print(f"number of all sentences :{len(blocks)}")

df_rows = []
cs_blocks= [] #saving the tokens with labels (t/e) for masking

non_cs_blocks = []

tr_only_sentences =[]

for block in blocks:
    lines = block.strip().split("\n")
    tokens = [line.split()[0] for line in lines if line.strip()]
    langs = set(("__label__tur_Latn" if line.split()[1] =="t" else "__label__eng_Latn") for line in lines if line.strip())
    sentence = " ".join(tokens)
   
    if langs =={"__label__tur_Latn","__label__eng_Latn"}:
        df_rows.append({"Text": sentence, "True Labels": langs})
        cs_blocks.append(lines)
    else:
        non_cs_blocks.append({"Text":sentence, "True Labels": langs})
        
    df_tr_eng = pd.DataFrame(df_rows)
    df_mon = pd.DataFrame(non_cs_blocks)


for lines in cs_blocks:
    tr_tokens = [line.split()[0] for line in lines if line.strip() and line.split()[1] =="t"]
    sentence = " ".join(tr_tokens)
    tr_only_sentences.append(sentence)

    df_tr_only = pd.DataFrame({"Text": tr_only_sentences, "True Labels": [{"__label__tur_Latn"}] * len(tr_only_sentences)})


#Set whether want to see the full df or not
# pd.reset_option("display.max_colwidth", None)


#see a full row 
# print(df_tr_eng.loc[2, "Text"])

# print(df_tr_eng.head(5))
print(f"Number of cs sentences :{len(df_tr_eng)}")


# print(df_mon)
print(f"Number of monolingual sentences : {len(non_cs_blocks)}")


# print(df_tr_only.head(5))
print(f"Number of turkish only sentences : {len(df_tr_only)}")

number of all sentences :377
Number of cs sentences :372
Number of monolingual sentences : 5
Number of turkish only sentences : 372


BASQ-SPANISH DATASET

In [15]:
# Load file JSON
file_path = csdata_path + "eus-spa/valid_utterances.json"
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Save data into a separate list
cs_rows = []
basque_rows = []
spanish_rows = []

for entry in data:
    # Take the code-switching sentence
    for cs_example in entry.get("code-switching", []):
        text = cs_example["text"]
        # lang_label = cs_example["lang_proportion"]
        cs_rows.append((text, {"__label__eus_Latn","__label__spa_Latn"}))

    # Get pure sentences from 'referent' based on source_lang
    if entry["source_lang"] == "eu":
        basque_rows.append((entry["referent"], {"__label__eus_Latn"}))
    elif entry["source_lang"] == "es":
        spanish_rows.append((entry["referent"], {"__label__spa_Latn"}))

# Convert to DataFrame
df_basq_spanish = pd.DataFrame(cs_rows, columns=["Text", "True Labels"])
df_eu = pd.DataFrame(basque_rows, columns=["Text", "True Labels"])
df_es = pd.DataFrame(spanish_rows, columns=["Text", "True Labels"])

# Combine everything
df_all = pd.concat([df_basq_spanish, df_eu, df_es], ignore_index=True)

# Check the results
print("Number of sentences code-switching:", len(df_basq_spanish))
print("Number of sentencesv Basque:", len(df_eu))
print("Number of sentences Spanish:", len(df_es))
# print(df_all.sample(5))

df_es.head(5)

Number of sentences code-switching: 1377
Number of sentencesv Basque: 449
Number of sentences Spanish: 478


Unnamed: 0,Text,True Labels
0,es la web una de las formas para pedir las cla...,{__label__spa_Latn}
1,"hola, quiero subvenciones dentro de plazo",{__label__spa_Latn}
2,dame el listado de ayudas,{__label__spa_Latn}
3,va a seguir nublado?,{__label__spa_Latn}
4,quiero saber si le vais a enviar las claves a ...,{__label__spa_Latn}


HINDI-ENGLISH DATASET

In [19]:
import pandas as pd

def parse_hineng(path):
    with open(path, encoding="utf-8") as f:
        content = f.read().strip().split("# sent_enum = ")
        blocks = [b for b in content if b.strip()]

    cs_rows, en_rows, hi_rows = [], [], []

    for block in blocks:
        tokens = []
        langs = set()

        lines = [line for line in block.splitlines() if line.strip()]

        for line in lines[1:]:
            parts = line.strip().split()
            if len(parts) != 2:
                continue

            token, lang = parts

            if lang == "lang1":
                tokens.append(token)
                langs.add("__label__eng_Latn")
            elif lang == "lang2":
                tokens.append(token)
                langs.add("__label__hin_Deva")

        if not tokens:
            continue

        sentence = " ".join(tokens)

        if langs == {"__label__eng_Latn", "__label__hin_Deva"}:
            cs_rows.append((sentence, langs))
        elif langs == {"__label__eng_Latn"}:
            en_rows.append((sentence, langs))
        elif langs == {"__label__hin_Deva"}:
            hi_rows.append((sentence, langs))

    return cs_rows, en_rows, hi_rows


# file_path = csdata_path + "hin-eng/lince_lid_en-hi_train.conll"
file_path = csdata_path + "hin-eng/lince_lid_en-hi_dev.conll"
cs_rows, en_rows, hi_rows = parse_hineng(file_path)

df_he = pd.DataFrame(cs_rows, columns=["Text", "True Labels"])
df_en1 = pd.DataFrame(en_rows, columns=["Text", "True Labels"])
df_h = pd.DataFrame(hi_rows, columns=["Text", "True Labels"])

print(f"Hindi-English (code-switched) : {len(df_he)}")
print(f"English only                  : {len(df_en1)}")
print(f"Hindi only                    : {len(df_h)}")

Hindi-English (code-switched) : 322
English only                  : 386
Hindi only                    : 31


NEPALI-ENGLISH DATASET

In [18]:
import pandas as pd

def parse(path):
    with open(path, encoding="utf-8") as f:
        content = f.read().strip().split("# sent_enum = ")
        blocks = [b for b in content if b.strip()]

    cs_rows, en_rows, np_rows = [], [], []

    for block in blocks:
        tokens = []
        langs = set()

        lines = [line for line in block.splitlines() if line.strip()]

        for line in lines[1:]:
            parts = line.strip().split()
            if len(parts) != 2:
                continue

            token, lang = parts

            if lang == "lang1":
                tokens.append(token)
                langs.add("__label__eng_Latn")
            elif lang == "lang2":
                tokens.append(token)
                langs.add("__label__npi_Deva")

        if not tokens:
            continue

        sentence = " ".join(tokens)

        if langs == {"__label__eng_Latn", "__label__npi_Deva"}:
            cs_rows.append((sentence, langs))
        elif langs == {"__label__eng_Latn"}:
            en_rows.append((sentence, langs))
        elif langs == {"__label__npi_Deva"}:
            np_rows.append((sentence, langs))

    return cs_rows, en_rows, np_rows

# Uji coba
# file_path = csdata_path + "nep-eng/lince_lid_en-ne_train.conll"
file_path = csdata_path + "nep-eng/lince_lid_en-ne_dev.conll"
cs_rows, en_rows, np_rows = parse(file_path)

df_ne = pd.DataFrame(cs_rows, columns=["Text", "True Labels"])
df_en2 = pd.DataFrame(en_rows, columns=["Text", "True Labels"])
df_n = pd.DataFrame(np_rows, columns=["Text", "True Labels"])

print(f"Code-switched : {len(df_ne)}")
print(f"English only  : {len(df_en2)}")
print(f"Nepali only   : {len(df_n)}")

print("\nContoh baris dari Nepali only:")
print(df_ne.head(3))

Code-switched : 943
English only  : 172
Nepali only   : 217

Contoh baris dari Nepali only:
                                                Text  \
0  la hamlai bhetna paayeko will be your greatest...   
1  areyy thahrachha j hos I told ya I am a good o...   
2                          Nindra is on date I guess   

                              True Labels  
0  {__label__npi_Deva, __label__eng_Latn}  
1  {__label__npi_Deva, __label__eng_Latn}  
2  {__label__npi_Deva, __label__eng_Latn}  


ENGLISH ONLY

In [20]:
df_e = pd.concat([df_en1, df_en2], ignore_index=True)

print(f"Final English Only : {len(df_e)}")

Final English Only : 558


PREPROCESSING

In [12]:
# df_te = df_tr_eng.copy()
# df_t = df_tr_only.copy()
# df_bs = df_basq_spanish.copy()
# df_b = df_eu.copy()
# df_s = df_es.copy()

# df_he = df_he.copy()
# df_h = df_h.copy()
# df_ne = df_ne.copy()
# df_n = df_n.copy()
# df_e = df_e.copy()

# import re
# import unicodedata
# from sacremoses import MosesPunctNormalizer
# import emoji

# # Patterns mirip di paper MaskLID
# URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
# HASHTAG_PATTERN = re.compile(r'#\w+')
# MENTION_PATTERN = re.compile(r'@\w+')
# RESERVED_WORDS_PATTERN = re.compile(r'\b(RT|FAV|VIA)\b', re.IGNORECASE)
# NUMBERS_PATTERN = re.compile(r'\b\d+\b')

# mpn = MosesPunctNormalizer(lang='en')

# def remove_non_printing(text):
#     return ''.join(c if c.isprintable() else ' ' for c in text)

# def clean_1(text):
#     text = mpn.normalize(text)
#     text = remove_non_printing(text)
#     text = unicodedata.normalize("NFKC", text)
#     text = emoji.replace_emoji(text, "")
#     text = text.lower()  # optional, tergantung config asli
#     text = URL_PATTERN.sub('', text)
#     text = HASHTAG_PATTERN.sub('', text)
#     text = MENTION_PATTERN.sub('', text)
#     text = RESERVED_WORDS_PATTERN.sub('', text)
#     text = NUMBERS_PATTERN.sub('', text)
#     text = ' '.join(text.strip().split())  # normalisasi spasi
#     return text

# def byte_len(text):
#     return len(text.encode("utf-8"))

# #Apply Cleaning
# df_te["Text"] = df_te["Text"].apply(clean_1)
# df_t["Text"] = df_t["Text"].apply(clean_1)
# df_bs["Text"] = df_bs["Text"].apply(clean_1)
# df_b["Text"] = df_b["Text"].apply(clean_1)
# df_s["Text"] = df_s["Text"].apply(clean_1)

# df_ne["Text"] = df_ne["Text"].apply(clean_1)
# df_n["Text"] = df_n["Text"].apply(clean_1)
# df_he["Text"] = df_he["Text"].apply(clean_1)
# df_h["Text"] = df_h["Text"].apply(clean_1)
# df_e["Text"] = df_e["Text"].apply(clean_1)

# # Filter based on byte-length
# df_te = df_te[df_te["Text"].apply(byte_len) > 40]
# df_t = df_t[df_t["Text"].apply(byte_len) > 20]
# df_bs = df_bs[df_bs["Text"].apply(byte_len) > 40]
# df_b = df_b[df_b["Text"].apply(byte_len) > 20]
# df_s = df_s[df_s["Text"].apply(byte_len) > 20]

# df_he = df_he[df_he["Text"].apply(byte_len)>40]
# df_n = df_n[df_n["Text"].apply(byte_len)>20]
# df_ne = df_ne[df_ne["Text"].apply(byte_len)>40]
# df_n = df_n[df_n["Text"].apply(byte_len)>20]
# df_e = df_e[df_e["Text"].apply(byte_len)>20]


# print(f"Turkish-English : {len(df_te)}")
# print(f"Basque-Spanish : {len(df_bs)}")

# print(f"Hindi-English: {len(df_he)}")
# print(f"Nepali-English: {len(df_ne)}")

# print(f"Basque Only : {len(df_b)}")
# print(f"Spanish Only : {len(df_s)}")
# print(f"Turkish Only : {len(df_t)}")

# print(f"Hindi Only: {len(df_h)}")
# print(f"Nepali Only: {len(df_n)}")
# print(f"English Only: {len(df_e)}")

In [24]:
#data-preprocess
df_te = df_tr_eng.copy()
df_t = df_tr_only.copy()
df_bs = df_basq_spanish.copy()
df_b = df_eu.copy()
df_s = df_es.copy()

df_he = df_he.copy()
df_h = df_h.copy()
df_ne = df_ne.copy()
df_n = df_n.copy()
df_e = df_e.copy()


import re
import unicodedata
# from sacremoses import MosesPunctNormalizer
# import emoji

# Patterns mirip di paper MaskLID
URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
HASHTAG_PATTERN = re.compile(r'#\w+')
MENTION_PATTERN = re.compile(r'@\w+')
RESERVED_WORDS_PATTERN = re.compile(r'\b(RT|FAV|VIA)\b', re.IGNORECASE)
NUMBERS_PATTERN = re.compile(r'\b\d+\b')

# mpn = MosesPunctNormalizer(lang='en')

def remove_non_printing(text):
    return ''.join(c if c.isprintable() else ' ' for c in text)

def clean_1(text):
    text = text.lower()  
    text = ' '.join(text.strip().split())  
    return text

def byte_len(text):
    return len(text.encode("utf-8"))

#Apply Cleaning
df_te["Text"] = df_te["Text"].apply(clean_1)
df_t["Text"] = df_t["Text"].apply(clean_1)
df_bs["Text"] = df_bs["Text"].apply(clean_1)
df_b["Text"] = df_b["Text"].apply(clean_1)
df_s["Text"] = df_s["Text"].apply(clean_1)

df_ne["Text"] = df_ne["Text"].apply(clean_1)
df_n["Text"] = df_n["Text"].apply(clean_1)
df_he["Text"] = df_he["Text"].apply(clean_1)
df_h["Text"] = df_h["Text"].apply(clean_1)
df_e["Text"] = df_e["Text"].apply(clean_1)

# Filter based on byte-length
df_te = df_te[df_te["Text"].apply(byte_len) > 40]
df_t = df_t[df_t["Text"].apply(byte_len) > 20]
df_bs = df_bs[df_bs["Text"].apply(byte_len) > 40]
df_b = df_b[df_b["Text"].apply(byte_len) > 20]
df_s = df_s[df_s["Text"].apply(byte_len) > 20]

df_he = df_he[df_he["Text"].apply(byte_len)>40]
df_n = df_n[df_n["Text"].apply(byte_len)>20]
df_ne = df_ne[df_ne["Text"].apply(byte_len)>40]
df_n = df_n[df_n["Text"].apply(byte_len)>20]
df_e = df_e[df_e["Text"].apply(byte_len)>20]


print(f"Turkish-English : {len(df_te)}")
print(f"Basque-Spanish : {len(df_bs)}")

print(f"Hindi-English: {len(df_he)}")
print(f"Nepali-English: {len(df_ne)}")

print(f"Basque Only : {len(df_b)}")
print(f"Spanish Only : {len(df_s)}")
print(f"Turkish Only : {len(df_t)}")

print(f"Hindi Only: {len(df_h)}")
print(f"Nepali Only: {len(df_n)}")
print(f"English Only: {len(df_e)}")

Turkish-English : 339
Basque-Spanish : 446
Hindi-English: 235
Nepali-English: 623
Basque Only : 357
Spanish Only : 356
Turkish Only : 341
Hindi Only: 31
Nepali Only: 177
English Only: 476


In [26]:
df_te.to_csv("df_te.csv", index=False)
df_t.to_csv("df_turk.csv", index=False)
# df_bs.to_csv("df_bs.csv", index=False)
# df_basq.to_csv("df_basq.csv", index=False)
# df_span.to_csv("df_span.csv", index=False)
# df_he.to_csv("df_he.csv", index=False)
# df_n.to_csv("df_n.csv", index=False)
# df_ne.to_csv("df_ne.csv", index=False)
# df_n.to_csv("df_n.csv", index=False)
# df_e.to_csv("df_e.csv", index=False)

MATRIX PERFORMANCE

MONO

In [27]:
def em_pm_mono(y_true, y_pred):
    em = 0
    pm = 0

    for true_set, pred_set in zip(y_true, y_pred):
        if pred_set == true_set:
            em += 1
            pm += 1  # EM is always also PM

        elif pred_set & true_set:
            # Overlap exists, but not exact match
            pm += 1

    return em, pm

CS

In [28]:
def em_pm_cs(y_true, y_pred):
    em = 0
    pm = 0

    for gold, pred in zip(y_true, y_pred):
        if pred == gold:
            em += 1
            pm += 1
        elif len(pred) == 1 and next(iter(pred)) in gold:
            # predicted exactly one of the two languages
            pm += 1

    return em, pm

In [29]:
def count_fp(y_true, y_pred):
    fp = 0
    for true_set, pred_set in zip(y_true, y_pred):
        if len(pred_set) > 1:  # CS only
            if len(true_set - pred_set) == 1 :
                fp += 1
    return fp

In [30]:
df_eval = pd.concat([
    df_te.assign(Type='cs'),
    df_bs.assign(Type='cs'),
    df_he.assign(Type='cs'),
    df_ne.assign(Type='cs'),
    
    df_t.assign(Type='mono'),
    df_b.assign(Type='mono'),
    df_s.assign(Type='mono'),
    df_h.assign(Type='mono'),
    df_n.assign(Type='mono'),
    df_e.assign(Type='mono')
])

df_eval_o = df_eval.copy()
df_eval_g = df_eval.copy()
df_eval_om = df_eval.copy()
df_eval_gm = df_eval.copy()

1. OpenLID ONLY

In [31]:
# pd.set_option("display.max_colwidth", None)
def open_lid(text):
    labels, probs = open_model.predict(text, k=2)
    probs = np.asarray(probs) #convert probs to safe format
    return {label for label,prob in zip(labels,probs) if prob>= 0.3}
    
def apply_open_lid(df):
    df['Pred'] = df['Text'].apply(open_lid)
        
    return df

In [32]:
# Apply model to all data
df_eval_o = apply_open_lid(df_eval_o)  # results are stored in 'Predicted Labels'

# Normalize the ground truth label column to set
df_eval_o["True Labels"] = df_eval_o["True Labels"].apply(lambda x: set(x) if isinstance(x, list) or isinstance(x, set) else set(x.split()))

MONO-ENGLISH

In [33]:
subset_e_o  = df_eval_o[df_eval_o["True Labels"].apply(lambda x: x in df_e["True Labels"].values)]    # Mono English

em_e_o, pm_e_o = em_pm_mono(subset_e_o["True Labels"], subset_e_o["Pred"])

# Output hasil
print(f"#EM: {em_e_o}")
print(f"#PM: {pm_e_o}")

#EM: 427
#PM: 436


TURKISH-ENGLISH EVAL

In [34]:
subset_te_o = df_eval_o[df_eval_o["True Labels"].apply(lambda x: x in df_te["True Labels"].values)]
subset_t_o  = df_eval_o[df_eval_o["True Labels"].apply(lambda x: x in df_t["True Labels"].values)]

# CS Turkish–English: EM / PM
em_te_o, pm_te_o = em_pm_cs(subset_te_o["True Labels"], subset_te_o["Pred"])

# Calculate FP (for CS only)
fp_te_o = count_fp(subset_te_o["True Labels"], subset_te_o["Pred"])

# Mono Turkish: EM / PM
em_t_o, pm_t_o = em_pm_mono(subset_t_o["True Labels"], subset_t_o["Pred"])

# Output hasil
print(f"#EM: {em_te_o}")
print(f"#PM: {pm_te_o}")
print(f"#FP: {fp_te_o}")

#EM: 0
#PM: 318
#FP: 3


BASQ-SPANISH EVAL

In [22]:
#Making Subset to Filter Only ["True Labels"] and its ["Pred"] from df_eval
subset_bs_o = df_eval_o[df_eval_o["True Labels"].apply(lambda x: x in df_bs["True Labels"].values)]  # CS Basque–Spanish
subset_b_o  = df_eval_o[df_eval_o["True Labels"].apply(lambda x: x in df_b["True Labels"].values)]   # Mono Basque
subset_s_o  = df_eval_o[df_eval_o["True Labels"].apply(lambda x: x in df_s["True Labels"].values)]   # Mono Spanish

#CS Basque-Spanish EM/PM
em_bs_o, pm_bs_o = em_pm_cs(subset_bs_o["True Labels"], subset_bs_o["Pred"])

# Calculate FP (for CS only)
fp_bs_o = count_fp(subset_bs_o["True Labels"], subset_bs_o["Pred"])

#Mono Basque EM/PM
em_b_o, pm_b_o = em_pm_mono(subset_b_o["True Labels"], subset_b_o["Pred"])

#Mono Spanish EM/PM
em_s_o, pm_s_o = em_pm_mono(subset_s_o["True Labels"], subset_s_o["Pred"])

print(f"#EM: {em_bs_o}")
print(f"#PM: {pm_bs_o}")
print(f"#FP: {fp_bs_o}")

#EM: 9
#PM: 423
#FP: 6


HINDI-ENGLISH EVAL

In [23]:
subset_he_o = df_eval_o[df_eval_o["True Labels"].apply(lambda x: x in df_he["True Labels"].values)]  
subset_h_o  = df_eval_o[df_eval_o["True Labels"].apply(lambda x: x in df_h["True Labels"].values)]   

# EM / PM for CS Hindi–English
em_he_o, pm_he_o = em_pm_cs(subset_he_o["True Labels"], subset_he_o["Pred"])

# Calculate FP (for CS only)
fp_he_o = count_fp(subset_he_o["True Labels"], subset_he_o["Pred"])

# EM / PM for mono Hindi
em_h_o, pm_h_o = em_pm_mono(subset_h_o["True Labels"], subset_h_o["Pred"])

print(f"#EM: {em_he_o}")
print(f"#PM: {pm_he_o}")
# print(f"#FP: {fp_he}")

#EM: 0
#PM: 80


NEPALI-ENGLISH EVAL

In [24]:
subset_ne_o = df_eval_o[df_eval_o["True Labels"].apply(lambda x: x in df_ne["True Labels"].values)]   # CS Nepali–English
subset_n_o  = df_eval_o[df_eval_o["True Labels"].apply(lambda x: x in df_n["True Labels"].values)]    # Mono Nepali

# EM / PM for CS Nepali–English
em_ne_o, pm_ne_o = em_pm_cs(subset_ne_o["True Labels"], subset_ne_o["Pred"])

# FP from mono Nepali/English predicted to be CS pair
fp_ne_o = count_fp(subset_ne_o["True Labels"], subset_ne_o["Pred"])

# EM/PM for Nepali mono
em_n_o, pm_n_o = em_pm_mono(subset_n_o["True Labels"], subset_n_o["Pred"])

print(f"#EM: {em_he_o}")
print(f"#PM: {pm_he_o}")
# print(f"#FP: {fp_he}")

#EM: 0
#PM: 80


2. GlotLID ONLY

In [25]:
#Cutom Model Class for Customizing GlotLID
import fasttext
import numpy as np
class CustomLID:
    def __init__(self, model_path, languages = -1, mode='before'):
        self.model = fasttext.load_model(model_path)
        self.output_matrix = self.model.get_output_matrix()
        self.labels = self.model.get_labels()
        
        # compute language_indices
        if languages !=-1 and isinstance(languages, list):
            self.language_indices = [self.labels.index(l) for l in list(set(languages)) if l in self.labels]

        else:
            self.language_indices = list(range(len(self.labels)))

        # limit labels to language_indices
        self.labels = list(np.array(self.labels)[self.language_indices])
        
        # predict
        self.predict = self.predict_limit_after_softmax if mode=='after' else self.predict_limit_before_softmax

    
    def predict_limit_before_softmax(self, text, k=1):
        
        # sentence vector
        sentence_vector = self.model.get_sentence_vector(text)
        
        # dot
        result_vector = np.dot(self.output_matrix[self.language_indices, :], sentence_vector)

        # softmax
        softmax_result = np.exp(result_vector - np.max(result_vector)) / np.sum(np.exp(result_vector - np.max(result_vector)))

        # top k predictions
        top_k_indices = np.argsort(softmax_result)[-k:][::-1]
        top_k_labels = [self.labels[i] for i in top_k_indices]
        top_k_probs = softmax_result[top_k_indices]

        return tuple(top_k_labels), top_k_probs


    def predict_limit_after_softmax(self, text, k=1):
        
        # sentence vector
        sentence_vector = self.model.get_sentence_vector(text)
        
        # dot
        result_vector = np.dot(self.output_matrix, sentence_vector)

        # softmax
        softmax_result = np.exp(result_vector - np.max(result_vector)) / np.sum(np.exp(result_vector - np.max(result_vector)))

        # limit softmax to language_indices
        softmax_result = softmax_result[self.language_indices]

        # top k predictions
        top_k_indices = np.argsort(softmax_result)[-k:][::-1]
        top_k_labels = [self.labels[i] for i in top_k_indices]
        top_k_probs = softmax_result[top_k_indices]

        return tuple(top_k_labels), top_k_probs

In [26]:
#list 200 restricted languages for GlotLID
flores_glotlid = ['__label__eng_Latn', '__label__arb_Arab', '__label__rus_Cyrl', '__label__por_Latn', '__label__pol_Latn', '__label__ekk_Latn', '__label__ell_Grek', '__label__slk_Latn', '__label__slv_Latn', '__label__nld_Latn', '__label__lvs_Latn', '__label__hun_Latn', '__label__dan_Latn', '__label__swe_Latn', '__label__lit_Latn', '__label__fin_Latn', '__label__mlt_Latn', '__label__cmn_Hani', '__label__nob_Latn', '__label__kor_Hang', '__label__ind_Latn', '__label__uzn_Latn', '__label__fil_Latn', '__label__ukr_Cyrl', '__label__hin_Deva', '__label__hin_Latn', '__label__afr_Latn', '__label__mar_Deva', '__label__ceb_Latn', '__label__ilo_Latn', '__label__zul_Latn', '__label__heb_Hebr', '__label__xho_Latn', '__label__vie_Latn', '__label__jpn_Jpan', '__label__guj_Gujr', '__label__hrv_Latn', '__label__tur_Latn', '__label__nya_Latn', '__label__tsn_Latn', '__label__sna_Latn', '__label__tso_Latn', '__label__tha_Thai', '__label__spa_Latn', '__label__deu_Latn', '__label__eus_Latn', '__label__bul_Cyrl', '__label__amh_Ethi', '__label__fra_Latn', '__label__ewe_Latn', '__label__mkd_Cyrl', '__label__nso_Latn', '__label__tam_Taml', '__label__lin_Latn', '__label__twi_Latn', '__label__yor_Latn', '__label__als_Latn', '__label__ibo_Latn', '__label__ben_Beng', '__label__ita_Latn', '__label__tpi_Latn', '__label__azj_Latn', '__label__run_Latn', '__label__mya_Mymr', '__label__kin_Latn', '__label__ron_Latn', '__label__ces_Latn', '__label__kat_Geor', '__label__urd_Arab', '__label__zsm_Latn', '__label__pap_Latn', '__label__bem_Latn', '__label__mal_Mlym', '__label__kir_Cyrl', '__label__hye_Armn', '__label__smo_Latn', '__label__sin_Sinh', '__label__fij_Latn', '__label__kan_Knda', '__label__pan_Guru', '__label__hau_Latn', '__label__epo_Latn', '__label__gaz_Latn', '__label__tir_Ethi', '__label__bos_Latn', '__label__srp_Cyrl', '__label__hat_Latn', '__label__pag_Latn', '__label__lua_Latn', '__label__war_Latn', '__label__tel_Telu', '__label__tat_Cyrl', '__label__sag_Latn', '__label__lug_Latn', '__label__tum_Latn', '__label__swh_Latn', '__label__umb_Latn', '__label__som_Latn', '__label__gle_Latn', '__label__kng_Latn', '__label__mos_Latn', '__label__lus_Latn', '__label__khk_Cyrl', '__label__asm_Beng', '__label__tuk_Latn', '__label__quy_Latn', '__label__ayr_Latn', '__label__luo_Latn', '__label__tgk_Cyrl', '__label__cat_Latn', '__label__ssw_Latn', '__label__nno_Latn', '__label__cym_Latn', '__label__kik_Latn', '__label__kmb_Latn', '__label__ory_Orya', '__label__bel_Cyrl', '__label__bho_Deva', '__label__apc_Arab', '__label__bak_Cyrl', '__label__jav_Latn', '__label__yue_Hani', '__label__pbt_Arab', '__label__khm_Khmr', '__label__npi_Deva', '__label__npi_Latn', '__label__gug_Latn', '__label__uig_Arab', '__label__fur_Latn', '__label__kbp_Latn', '__label__hne_Deva', '__label__kam_Latn', '__label__gla_Latn', '__label__kab_Latn', '__label__arz_Arab', '__label__kaz_Cyrl', '__label__mri_Latn', '__label__lim_Latn', '__label__srd_Latn', '__label__sun_Latn', '__label__plt_Latn', '__label__mni_Beng', '__label__isl_Latn', '__label__vec_Latn', '__label__glg_Latn', '__label__scn_Latn', '__label__fao_Latn', '__label__san_Deva', '__label__ltz_Latn', '__label__cjk_Latn', '__label__ast_Latn', '__label__lmo_Latn', '__label__szl_Latn', '__label__oci_Latn', '__label__fon_Latn', '__label__min_Latn', '__label__wol_Latn', '__label__lij_Latn', '__label__ajp_Arab', '__label__snd_Arab', '__label__dik_Latn', '__label__ary_Arab', '__label__lao_Laoo', '__label__ars_Arab', '__label__bjn_Latn', '__label__shn_Mymr', '__label__crh_Latn', '__label__aeb_Arab', '__label__ace_Latn', '__label__ckb_Arab', '__label__dyu_Latn', '__label__ltg_Latn', '__label__kmr_Latn', '__label__ban_Latn', '__label__mai_Deva', '__label__fuv_Latn', '__label__kac_Latn', '__label__taq_Latn', '__label__bam_Latn', '__label__sat_Olck', '__label__tzm_Tfng', '__label__bug_Latn', '__label__dzo_Tibt', '__label__kas_Deva', '__label__fas_Arab', '__label__nus_Latn', '__label__knc_Latn', '__label__mag_Deva', '__label__taq_Tfng', '__label__kas_Arab', '__label__knc_Arab', '__label__bjn_Arab', '__label__ace_Arab', '__label__kea_Latn', '__label__awa_Deva', '__label__acm_Arab', '__label__bod_Tibt', '__label__sot_Latn', '__label__ydd_Hebr', '__label__azb_Arab']

#custom model from limited languages
glot_model_lim = CustomLID(glot_model_path, languages = flores_glotlid, mode='before')

def glot_lid(text):
    labels, probs = glot_model_lim.predict(text, k=2)
    
    return {label for label,prob in zip(labels,probs) if prob>= 0.3}
    
def apply_glot_lid(df):
            
    df['Pred'] = df['Text'].apply(glot_lid)
        
    return df

In [27]:
# Apply model to all data
df_eval_g = apply_glot_lid(df_eval_g)  # results are stored in 'Predicted Labels'

# Normalize the ground truth label column to set
df_eval_g["True Labels"] = df_eval_g["True Labels"].apply(lambda x: set(x) if isinstance(x, list) or isinstance(x, set) else set(x.split()))

MONO-ENGLISH

In [28]:
subset_e_g  = df_eval_g[df_eval_g["True Labels"].apply(lambda x: x in df_e["True Labels"].values)]    # Mono English

em_e_g, pm_e_g = em_pm_mono(subset_e_g["True Labels"], subset_e_g["Pred"])

# Output hasil
print(f"#EM: {em_e_g}")
print(f"#PM: {pm_e_g}")

#EM: 460
#PM: 463


TURKISH-ENGLISH EVAL

In [29]:
subset_te_g = df_eval_g[df_eval_g["True Labels"].apply(lambda x: x in df_te["True Labels"].values)]
subset_t_g  = df_eval_g[df_eval_g["True Labels"].apply(lambda x: x in df_t["True Labels"].values)]

# CS Turkish–English: EM / PM
em_te_g, pm_te_g = em_pm_cs(subset_te_g["True Labels"], subset_te_g["Pred"])

# FP Turkish–English
fp_te_g = count_fp(subset_te_g["True Labels"], subset_te_g["Pred"])

# Mono Turkish: EM / PM
em_t_g, pm_t_g = em_pm_mono(subset_t_g["True Labels"], subset_t_g["Pred"])

# Output hasil
print(f"#EM: {em_te_g}")
print(f"#PM: {pm_te_g}")
print(f"#FP: {fp_te_g}")

#EM: 4
#PM: 330
#FP: 3


BASQUE-SPANISH EVAL

In [30]:
subset_bs_g = df_eval_g[df_eval_g["True Labels"].apply(lambda x: x in df_bs["True Labels"].values)]
subset_b_g  = df_eval_g[df_eval_g["True Labels"].apply(lambda x: x in df_b["True Labels"].values)]
subset_s_g  = df_eval_g[df_eval_g["True Labels"].apply(lambda x: x in df_s["True Labels"].values)]

# CS Basque–Spanish: EM / PM
em_bs_g, pm_bs_g = em_pm_cs(subset_bs_g["True Labels"], subset_bs_g["Pred"])

#FP
fp_bs_g = count_fp(subset_bs_g["True Labels"], subset_bs_g["Pred"])

# Mono Basque: EM / PM
em_b_g, pm_b_g = em_pm_mono(subset_b_g["True Labels"], subset_b_g["Pred"])

# Mono Spanish: EM / PM
em_s_g, pm_s_g = em_pm_mono(subset_s_g["True Labels"], subset_s_g["Pred"])

# Output hasil
print(f"#EM: {em_bs_g}")
print(f"#PM: {pm_bs_g}")
print(f"#FP: {fp_bs_g}")

#EM: 9
#PM: 430
#FP: 1


HINDI_ENGLISH EVAL

In [31]:
subset_he_g = df_eval_g[df_eval_g["True Labels"].apply(lambda x: x in df_he["True Labels"].values)]
subset_h_g  = df_eval_g[df_eval_g["True Labels"].apply(lambda x: x in df_h["True Labels"].values)]
subset_e_g  = df_eval_g[df_eval_g["True Labels"].apply(lambda x: x in df_e["True Labels"].values)]

# EM / PM untuk CS Hindi–English
em_he_g, pm_he_g = em_pm_cs(subset_he_g["True Labels"], subset_he_g["Pred"])

# FP 
fp_he_g = count_fp(subset_he_g["True Labels"], subset_he_g["Pred"])

# Mono Hindi: EM / PM
em_h_g, pm_h_g = em_pm_mono(subset_h_g["True Labels"], subset_h_g["Pred"])

# Output hasil
print(f"#EM: {em_he_g}")
print(f"#PM: {pm_he_g}")
print(f"#FP: {fp_he_g}")

#EM: 0
#PM: 84
#FP: 4


NEPALI-ENGLISH EVAL

In [32]:
subset_ne_g = df_eval_g[df_eval_g["True Labels"].apply(lambda x: x in df_ne["True Labels"].values)]
subset_n_g  = df_eval_g[df_eval_g["True Labels"].apply(lambda x: x in df_n["True Labels"].values)]
subset_e_g  = df_eval_g[df_eval_g["True Labels"].apply(lambda x: x in df_e["True Labels"].values)]

# EM / PM untuk CS Nepali–English
em_ne_g, pm_ne_g = em_pm_cs(subset_ne_g["True Labels"], subset_ne_g["Pred"])

# FP 
fp_ne_g = count_fp(subset_ne_g["True Labels"], subset_ne_g["Pred"])
# Mono Nepali: EM / PM

em_n_g, pm_n_g = em_pm_mono(subset_n_g["True Labels"], subset_n_g["Pred"])

# Output hasil
print(f"#EM: {em_ne_g}")
print(f"#PM: {pm_ne_g}")
print(f"#FP: {fp_ne_g}")

#EM: 0
#PM: 118
#FP: 7


3. OpenLID with MaskLID

In [36]:
# masklid is the name of masklid.py
from masklid import MaskLID


om_model = MaskLID(glot_model_path, languages = -1)

def om_lid(text):
    prediction = om_model.predict_codeswitch(text, beta = 15 , alpha = 3, max_lambda = 2, min_length = 20, min_prob = 0.90, max_retry=3, alpha_step_increase = 3, beta_step_increase = 5)
    
    return set(prediction.keys())
    
def apply_om_lid(df):
    df['Pred'] = df['Text'].apply(om_lid)
        
    return df



In [37]:
# Apply model to all data
df_eval_om = apply_om_lid(df_eval_om)  # results are stored in 'Predicted Labels'

# Normalize the ground truth label column to set
df_eval_om["True Labels"] = df_eval_om["True Labels"].apply(lambda x: set(x) if isinstance(x, list) or isinstance(x, set) else set(x.split()))

text= yarın bir status yapıp işlerin üstünden geçelim label= __label__tur_Latn (new text), dict data= yarın status üstünden geçelim dict_mask= yarın bir yapıp işlerin üstünden geçelim 0 0
text= yarın status üstünden geçelim label= __label__tur_Latn (new text), dict data= yarın status üstünden geçelim dict_mask= yarın üstünden geçelim 1 0
text= ben dataları sort ettim peki nedir bu işin process'i schedule ettik biz sizi briefing room'da bekliyor olacağım onu yapıyor olman lazım zaten label= __label__tur_Latn (new text), dict data= ben dataları sort peki bu process'i schedule biz sizi briefing room'da onu olman lazım zaten dict_mask= dataları ettim nedir işin ettik room'da bekliyor olacağım yapıyor olman lazım zaten 0 0
text= ben dataları sort peki bu process'i schedule biz sizi briefing room'da onu olman lazım zaten label= __label__tur_Latn (new text), dict data= ben dataları sort peki bu process'i schedule biz sizi briefing room'da onu olman lazım zaten dict_mask= dataları room'da olma

MONO-ENGLISH EVAL

In [38]:
subset_e_om  = df_eval_om[df_eval_om["True Labels"].apply(lambda x: x in df_e["True Labels"].values)]    # Mono English

em_e_om, pm_e_om = em_pm_mono(subset_e_om["True Labels"], subset_e_om["Pred"])

# Output hasil
print(f"#EM: {em_e_om}")
print(f"#PM: {pm_e_om}")

#EM: 447
#PM: 449


TURKISH-ENGLISH EVAL

In [36]:
subset_te_om = df_eval_om[df_eval_om["True Labels"].apply(lambda x: x in df_te["True Labels"].values)]
subset_t_om  = df_eval_om[df_eval_om["True Labels"].apply(lambda x: x in df_t["True Labels"].values)]

# CS Turkish–English: EM / PM
em_te_om, pm_te_om = em_pm_cs(subset_te_om["True Labels"], subset_te_om["Pred"])

# FP 
fp_te_om = count_fp(subset_te_om["True Labels"], subset_te_om["Pred"])

# Mono Turkish: EM / PM
em_t_om, pm_t_om = em_pm_mono(subset_t_om["True Labels"], subset_t_om["Pred"])

# Output result
print(f"#EM: {em_te_om}")
print(f"#PM: {pm_te_om}")
print(f"#FP: {fp_te_om}")

#EM: 58
#PM: 301
#FP: 32


BASQUE-SPANISH EVAL

In [37]:
subset_bs_om = df_eval_om[df_eval_om["True Labels"].apply(lambda x: x in df_bs["True Labels"].values)]
subset_b_om  = df_eval_om[df_eval_om["True Labels"].apply(lambda x: x in df_b["True Labels"].values)]
subset_s_om  = df_eval_om[df_eval_om["True Labels"].apply(lambda x: x in df_s["True Labels"].values)]

# CS Basque–Spanish: EM / PM
em_bs_om, pm_bs_om = em_pm_cs(subset_bs_om["True Labels"], subset_bs_om["Pred"])

# FP 
fp_bs_om = count_fp(subset_bs_om["True Labels"], subset_bs_om["Pred"])

# Mono Basque: EM / PM
em_b_om, pm_b_om = em_pm_mono(subset_b_om["True Labels"], subset_b_om["Pred"])

# Mono Spanish: EM / PM
em_s_om, pm_s_om = em_pm_mono(subset_s_om["True Labels"], subset_s_om["Pred"])

print(f"#EM: {em_bs_om}")
print(f"#PM: {pm_bs_om}")
print(f"#FP: {fp_bs_om}")

#EM: 55
#PM: 413
#FP: 18


HINDI-ENGLISH EVAL

In [38]:
subset_he_om = df_eval_om[df_eval_om["True Labels"].apply(lambda x: x in df_he["True Labels"].values)]
subset_h_om  = df_eval_om[df_eval_om["True Labels"].apply(lambda x: x in df_h["True Labels"].values)]
subset_e_om  = df_eval_om[df_eval_om["True Labels"].apply(lambda x: x in df_e["True Labels"].values)]

# EM / PM untuk CS Hindi–English
em_he_om, pm_he_om = em_pm_cs(subset_he_om["True Labels"], subset_he_om["Pred"])

# FP 
fp_he_om = count_fp(subset_he_om["True Labels"], subset_he_om["Pred"])

# Mono Hindi: EM / PM
em_h_om, pm_h_om = em_pm_mono(subset_h_om["True Labels"], subset_h_om["Pred"])

print(f"#EM: {em_he_om}")
print(f"#PM: {pm_he_om}")
print(f"#FP: {fp_he_om}")

#EM: 0
#PM: 81
#FP: 14


NEPALI-ENGLISH EVAL

In [39]:
subset_ne_om = df_eval_om[df_eval_om["True Labels"].apply(lambda x: x in df_ne["True Labels"].values)]
subset_n_om  = df_eval_om[df_eval_om["True Labels"].apply(lambda x: x in df_n["True Labels"].values)]
subset_e_om  = df_eval_om[df_eval_om["True Labels"].apply(lambda x: x in df_e["True Labels"].values)]

# EM / PM untuk CS Nepali–English
em_ne_om, pm_ne_om = em_pm_cs(subset_ne_om["True Labels"], subset_ne_om["Pred"])

# FP 
fp_ne_om = count_fp(subset_ne_om["True Labels"], subset_ne_om["Pred"])

# Mono Nepali: EM / PM
em_n_om, pm_n_om = em_pm_mono(subset_n_om["True Labels"], subset_n_om["Pred"])

print(f"#EM: {em_ne_om}")
print(f"#PM: {pm_ne_om}")
print(f"#FP: {fp_ne_om}")

#EM: 0
#PM: 83
#FP: 37


4.GlotLID with MaskLID

In [40]:
# masklid is the name of masklid.py
from masklid import MaskLID


gm_model = MaskLID(glot_model_path, languages = flores_glotlid)

def gm_lid(text):
    
    prediction = gm_model.predict_codeswitch(text, beta = 15 , alpha = 3, max_lambda = 2, min_length = 20, min_prob = 0.90, max_retry=3, alpha_step_increase = 3, beta_step_increase = 5)
    
    return set(prediction.keys())
    
def apply_gm_lid(df):
            
    df['Pred'] = df['Text'].apply(gm_lid)
        
    return df

In [41]:
# Apply to all data
df_eval_gm = apply_gm_lid(df_eval_gm)  

# Normalize the ground truth label column to set
df_eval_gm["True Labels"] = df_eval_gm["True Labels"].apply(lambda x: set(x) if isinstance(x, list) or isinstance(x, set) else set(x.split()))

ENGLISH-ONLY EVAL

In [42]:
subset_e_gm  = df_eval_gm[df_eval_gm["True Labels"].apply(lambda x: x in df_e["True Labels"].values)]
em_e_gm, pm_e_gm = em_pm_mono(subset_e_gm["True Labels"], subset_e_gm["Pred"])

print(f"#EM: {em_e_gm}")
print(f"#PM: {pm_e_gm}")

#EM: 450
#PM: 464


TURKISH-ENGLISH EVAL

In [43]:
subset_te_gm = df_eval_gm[df_eval_gm["True Labels"].apply(lambda x: x in df_te["True Labels"].values)]
subset_t_gm  = df_eval_gm[df_eval_gm["True Labels"].apply(lambda x: x in df_t["True Labels"].values)]

em_te_gm, pm_te_gm = em_pm_cs(subset_te_gm["True Labels"], subset_te_gm["Pred"])

fp_te_gm = count_fp(subset_te_gm["True Labels"], subset_te_gm["Pred"])

em_t_gm, pm_t_gm = em_pm_mono(subset_t_gm["True Labels"], subset_t_gm["Pred"])

print(f"#EM: {em_te_gm}")
print(f"#PM: {pm_te_gm}")
print(f"#FP: {fp_te_gm}")

#EM: 74
#PM: 315
#FP: 19


BASQ-SPANISH EVAL

In [44]:
subset_bs_gm = df_eval_gm[df_eval_gm["True Labels"].apply(lambda x: x in df_bs["True Labels"].values)]
subset_b_gm  = df_eval_gm[df_eval_gm["True Labels"].apply(lambda x: x in df_b["True Labels"].values)]
subset_s_gm  = df_eval_gm[df_eval_gm["True Labels"].apply(lambda x: x in df_s["True Labels"].values)]

em_bs_gm, pm_bs_gm = em_pm_cs(subset_bs_gm["True Labels"], subset_bs_gm["Pred"])

fp_bs_gm = count_fp(subset_bs_gm["True Labels"], subset_bs_gm["Pred"])

em_b_gm, pm_b_gm = em_pm_mono(subset_b_gm["True Labels"], subset_b_gm["Pred"])

em_s_gm, pm_s_gm = em_pm_mono(subset_s_gm["True Labels"], subset_s_gm["Pred"])

print(f"#EM: {em_bs_gm}")
print(f"#PM: {pm_bs_gm}")
print(f"#FP: {fp_bs_gm}")

#EM: 47
#PM: 423
#FP: 10


HINDI-ENGLISH EVAL

In [45]:
subset_he_gm = df_eval_gm[df_eval_gm["True Labels"].apply(lambda x: x in df_he["True Labels"].values)]
subset_h_gm  = df_eval_gm[df_eval_gm["True Labels"].apply(lambda x: x in df_h["True Labels"].values)]
subset_e_gm  = df_eval_gm[df_eval_gm["True Labels"].apply(lambda x: x in df_e["True Labels"].values)]

em_he_gm, pm_he_gm = em_pm_cs(subset_he_gm["True Labels"], subset_he_gm["Pred"])

fp_he_gm = count_fp(subset_he_gm["True Labels"], subset_he_gm["Pred"])

em_h_gm, pm_h_gm = em_pm_mono(subset_h_gm["True Labels"], subset_h_gm["Pred"])

print(f"#EM: {em_he_gm}")
print(f"#PM: {pm_he_gm}")
print(f"#FP: {fp_he_gm}")

#EM: 0
#PM: 76
#FP: 23


NEPALI-ENGLISH EVAL

In [46]:
subset_ne_gm = df_eval_gm[df_eval_gm["True Labels"].apply(lambda x: x in df_ne["True Labels"].values)]
subset_n_gm  = df_eval_gm[df_eval_gm["True Labels"].apply(lambda x: x in df_n["True Labels"].values)]
subset_e_gm  = df_eval_gm[df_eval_gm["True Labels"].apply(lambda x: x in df_e["True Labels"].values)]

em_ne_gm, pm_ne_gm = em_pm_cs(subset_ne_gm["True Labels"], subset_ne_gm["Pred"])

fp_ne_gm = count_fp(subset_ne_gm["True Labels"], subset_ne_gm["Pred"])

em_n_gm, pm_n_gm = em_pm_mono(subset_n_gm["True Labels"], subset_n_gm["Pred"])

print(f"#EM: {em_ne_gm}")
print(f"#PM: {pm_ne_gm}")
print(f"#FP: {fp_ne_gm}")

#EM: 0
#PM: 115
#FP: 25


FINAL EVAL

In [48]:
import pandas as pd
from IPython.display import display

# Urutan set
sets = [
    "CS Turkish-English", "CS Basq Spanish", "CS Hindi English", "CS Nepali English",
    "Single Basq", "Single Spanish", "Single Turkish", "Single Hindi", "Single Nepali", "Single English"
]
s_values = [len(df_te), len(df_bs), len(df_he), len(df_ne), len(df_b), len(df_s), len(df_t), len(df_h), len(df_n), len(df_e)]

# MultiIndex columns
columns = pd.MultiIndex.from_tuples([
    ("", "Set"),
    ("", "#S"),
    ("Baseline + MaskLID", "GlotLID EM/PM"),
    ("Baseline + MaskLID", "OpenLID EM/PM"),
    ("Baseline", "GlotLID & GlotLID #FP"),
    ("Baseline", "OpenLID & MaskLID #FP"),
    ("Baseline", "GlotLID EM/PM"),
    ("Baseline", "OpenLID EM/PM"),
    ("Baseline", "OpenLID #FP"),
    ("Baseline", "GlotLID #FP"),
])

# Baris data: 10 baris sesuai sets
rows = list(zip(
    sets,
    s_values,
    
    # GlotLID + MaskLID EM/PM
    [f"{em_te_gm}/{pm_te_gm}", f"{em_bs_gm}/{pm_bs_gm}", f"{em_he_gm}/{pm_he_gm}", f"{em_ne_gm}/{pm_ne_gm}",
     f"{em_b_gm}/{pm_b_gm}", f"{em_s_gm}/{pm_s_gm}", f"{em_t_gm}/{pm_t_gm}", f"{em_h_gm}/{pm_h_gm}", f"{em_n_gm}/{pm_n_gm}", f"{em_e_gm}/{pm_e_gm}"],

    # OpenLID + MaskLID EM/PM
    [f"{em_te_om}/{pm_te_om}", f"{em_bs_om}/{pm_bs_om}", f"{em_he_om}/{pm_he_om}", f"{em_ne_om}/{pm_ne_om}",
     f"{em_b_om}/{pm_b_om}", f"{em_s_om}/{pm_s_om}", f"{em_t_om}/{pm_t_om}", f"{em_h_om}/{pm_h_om}", f"{em_n_om}/{pm_n_om}", f"{em_e_om}/{pm_e_om}"],

    # GlotLID FP (GlotLID + MaskLID pipeline)
    [fp_te_gm, fp_bs_gm, fp_he_gm, fp_ne_gm, fp_bs_gm],

    # OpenLID FP (OpenLID + MaskLID pipeline)
    [fp_te_om, fp_bs_om, fp_he_om, fp_ne_om, fp_bs_om],

    # GlotLID EM/PM (tanpa MaskLID)
    [f"{em_te_g}/{pm_te_g}", f"{em_bs_g}/{pm_bs_g}", f"{em_he_g}/{pm_he_g}", f"{em_ne_g}/{pm_ne_g}",
     f"{em_b_g}/{pm_b_g}", f"{em_s_g}/{pm_s_g}", f"{em_t_g}/{pm_t_g}", f"{em_h_g}/{pm_h_g}", f"{em_n_g}/{pm_n_g}", f"{em_e_g}/{pm_e_g}"],

    # OpenLID EM/PM (tanpa MaskLID)
    [f"{em_te_o}/{pm_te_o}", f"{em_bs_o}/{pm_bs_o}", f"{em_he_o}/{pm_he_o}", f"{em_ne_o}/{pm_ne_o}",
     f"{em_b_o}/{pm_b_o}", f"{em_s_o}/{pm_s_o}", f"{em_t_o}/{pm_t_o}", f"{em_h_o}/{pm_h_o}", f"{em_n_o}/{pm_n_o}", f"{em_e_o}/{pm_e_o}"],

    # OpenLID #FP
    [fp_te_o, fp_bs_o, fp_he_o, fp_ne_o, fp_bs_o],

    # GlotLID #FP
    [fp_te_g, fp_bs_g, fp_he_g, fp_ne_g, fp_bs_g]
))

# Buat dan tampilkan DataFrame
df_paper_style = pd.DataFrame(rows, columns=columns)
pd.set_option("display.width", 200)
display(df_paper_style)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Baseline + MaskLID,Baseline + MaskLID,Baseline,Baseline,Baseline,Baseline,Baseline,Baseline
Unnamed: 0_level_1,Set,#S,GlotLID EM/PM,OpenLID EM/PM,GlotLID & GlotLID #FP,OpenLID & MaskLID #FP,GlotLID EM/PM,OpenLID EM/PM,OpenLID #FP,GlotLID #FP
0,CS Turkish-English,339,74/315,58/301,19,32,4/330,4/323,8,3
1,CS Basq Spanish,446,47/423,55/413,10,18,9/430,9/423,6,1
2,CS Hindi English,235,0/76,0/81,23,14,0/84,0/80,2,4
3,CS Nepali English,623,0/115,0/83,25,37,0/118,0/93,4,7
4,Single Basq,357,354/354,355/355,10,18,353/353,355/355,6,1
