In [1]:
import fasttext
import numpy as np
from huggingface_hub import hf_hub_download
import pandas as pd
import json
from pprint import pprint, pformat
from gurobipy import Model, GRB
from gurobipy import quicksum

In [2]:
# Fetch the GlotLID fastText checkpoint from the Hugging Face Hub and load it.
glot_model_path = hf_hub_download(
    repo_id="cis-lmu/glotlid",
    filename="model.bin",
    cache_dir=None,
)
glot_model = fasttext.load_model(glot_model_path)

In [3]:
# Fetch the OpenLID fastText checkpoint from the Hugging Face Hub and load it.
open_model_path = hf_hub_download(
    repo_id="laurievb/OpenLID",
    filename="model.bin",
)
open_model = fasttext.load_model(open_model_path)

In [4]:
import fasttext
import numpy as np
import re
import string
from copy import deepcopy

class MaskLID:
    """Word-level language scoring and code-switch detection on a fastText LID model.

    Scores are obtained by multiplying the model's output matrix with an input
    vector: the sentence vector in ``predict`` and a per-word vector in
    ``compute_v_per_word``.  ``predict_codeswitch`` repeatedly masks the words
    that support the currently dominant label and re-predicts on the rest.

    Attributes:
        model: the loaded fastText model.
        output_matrix: the model's output matrix (one row per model label).
        language_indices: rows of ``output_matrix`` (label positions) in use.
        labels: label strings aligned with ``language_indices``.
    """

    def __init__(self, model_path, languages=-1):
        """Load the model at ``model_path`` and optionally restrict its labels.

        Args:
            model_path: path of a fastText model file.
            languages: ``-1`` to keep every label, or a list of label strings
                to keep (labels unknown to the model are ignored).
        """
        self.model = fasttext.load_model(model_path)
        self.output_matrix = self.model.get_output_matrix()
        self.labels = self.model.get_labels()
        # Positions of the labels we keep, followed by the labels themselves
        # re-aligned to those positions.
        self.language_indices = self._compute_language_indices(languages)
        self.labels = [self.labels[i] for i in self.language_indices]

    def _compute_language_indices(self, languages):
        """Return the positions in ``self.labels`` of the requested labels.

        The result follows the model's own label order, so it is identical on
        every run (iterating a ``set`` of strings is not).
        """
        if languages != -1 and isinstance(languages, list):
            wanted = set(languages)
            return [i for i, known in enumerate(self.labels) if known in wanted]
        # languages == -1 (or not a list): keep everything.
        return list(range(len(self.labels)))

    def _softmax(self, x):
        """Softmax of ``x``; the maximum is subtracted first to avoid overflow."""
        exp_x = np.exp(x - np.max(x))
        return exp_x / np.sum(exp_x)

    def _normalize_text(self, text):
        """Replace newlines, digits and ``_ : • # { | }`` by spaces, then collapse whitespace."""
        replace_by = " "
        replacement_map = {ord(c): replace_by for c in '\n_:' + '•#{|}' + string.digits}
        text = text.translate(replacement_map)
        return re.sub(r'\s+', ' ', text).strip()

    def predict(self, text, k=1):
        """Return the top-``k`` labels and their probabilities for ``text``.

        The softmax is taken over *all* model labels and only then restricted to
        ``language_indices``, so the returned probabilities need not sum to 1
        when a label subset is in use.

        Returns:
            ``(labels, probs)``: a tuple of label strings and an array of
            probabilities, both ordered from most to least likely.
        """
        sentence_vector = self.model.get_sentence_vector(text)
        result_vector = np.dot(self.output_matrix, sentence_vector)  # one logit per model label
        softmax_result = self._softmax(result_vector)[self.language_indices]
        # argsort is ascending: take the last k entries and reverse them.
        top_k_indices = np.argsort(softmax_result)[-k:][::-1]
        top_k_labels = [self.labels[i] for i in top_k_indices]
        top_k_probs = softmax_result[top_k_indices]
        return tuple(top_k_labels), top_k_probs

    def compute_v(self, sentence_vector):
        """Return ``(label, logit)`` pairs for the kept labels, highest logit first."""
        result_vector = np.dot(self.output_matrix[self.language_indices, :], sentence_vector)
        return sorted(zip(self.labels, result_vector), key=lambda x: x[1], reverse=True)

    def compute_v_per_word(self, text):
        """Score every word of ``text`` against every kept label.

        Returns:
            dict mapping ``"<position>_<word>"`` to ``{'logits': [(label, logit), ...]}``
            (sorted by descending logit), in the order the words appear.
        """
        text = self._normalize_text(text)
        words = self.model.get_line(text)[0]  # first element: the tokens
        words = [w for w in words if w != '</s>']  # drop the end-of-sentence marker
        subword_ids = [self.model.get_subwords(sw)[1] for sw in words]  # second element: subword ids
        # One vector per word: the sum of the input vectors of its subwords.
        sentence_vector = [np.sum([self.model.get_input_vector(id) for id in sid], axis=0) for sid in subword_ids]

        dict_text = {}
        for i, word in enumerate(words):
            key = f"{i}_{word}"
            dict_text[key] = {'logits': self.compute_v(sentence_vector[i])}

        return dict_text

    def mask_label_top_k(self, dict_text, label, top_keep, top_remove):
        """Split the words of ``dict_text`` according to how highly they rank ``label``.

        Args:
            dict_text: output of ``compute_v_per_word``.
            label: the label being masked.
            top_keep: a word goes into the "deleted" (masked) dict when ``label``
                is among its ``top_keep`` best labels.
            top_remove: a word is dropped from the "remained" dict when ``label``
                is among its ``top_remove`` best labels.

        Returns:
            ``(dict_remained, dict_deleted)``; ``dict_text`` itself is not modified.
        """
        dict_remained = deepcopy(dict_text)
        dict_deleted = {}

        for key, value in dict_text.items():
            logits = value['logits']
            labels = [t[0] for t in logits]

            if label in labels[:top_keep]:
                dict_deleted[key] = dict_remained[key]

            if label in labels[:top_remove]:
                dict_remained.pop(key, None)

        return dict_remained, dict_deleted

    @staticmethod
    def get_sizeof(text):
        """Length of ``text`` in UTF-8 bytes."""
        return len(text.encode('utf-8'))

    @staticmethod
    def custom_sort(word):
        """Sort key for ``"<position>_<word>"`` keys: the leading position, else infinity."""
        match = re.match(r'^(\d+)_', word)
        if match:
            return int(match.group(1))
        else:
            return float('inf')  # keys without a numeric prefix sort last

    def sum_logits(self, dict_data, label):
        """Sum, over all words in ``dict_data``, the logit each word gives to ``label``."""
        total = 0

        for value in dict_data.values():
            # Only the first entry for the label counts.
            for candidate, score in value['logits']:
                if candidate == label:
                    total += score
                    break
        return total

    def predict_codeswitch(self, text, beta, alpha, min_prob, min_length, max_lambda=1, max_retry=3, alpha_step_increase=5, beta_step_increase=5, verbose=True):
        """Iteratively peel off the words belonging to the dominant language(s).

        Each round predicts the dominant label of the remaining text, collects
        the words that rank this label in their top ``beta`` (the masked text)
        and removes the words that rank it in their top ``alpha``.  A round is
        accepted when the masked text is longer than ``min_length`` bytes and is
        itself predicted as that label with probability above ``min_prob``; the
        very first round is always accepted.  A rejected round is undone
        completely (text *and* per-word scores) and retried with larger
        ``alpha``/``beta``.

        Args:
            text: input sentence.
            beta: ``top_keep`` passed to ``mask_label_top_k``.
            alpha: ``top_remove`` passed to ``mask_label_top_k``.
            min_prob: minimum probability for the masked text's top label.
            min_length: size threshold in UTF-8 bytes (see ``get_sizeof``).
            max_lambda: maximum number of accepted rounds.
            max_retry: maximum number of rejected rounds.
            alpha_step_increase: added to ``alpha`` after a rejected round.
            beta_step_increase: added to ``beta`` after a rejected round.
            verbose: print a trace of every round (default, as before).

        Returns:
            dict mapping each accepted label to its words, joined by spaces in
            sentence order.
        """
        log = print if verbose else (lambda *args, **kwargs: None)

        info = {}   # accepted rounds, keyed by round number
        index = 0   # number of accepted rounds
        retry = 0   # number of rejected rounds

        # Per-word logits, keyed "<position>_<word>".
        dict_data = self.compute_v_per_word(text)

        while index < max_lambda and retry < max_retry:
            log(f"\n--- iteration : {index + retry + 1}---")
            log(f"Current index: {index}, retry: {retry}")
            log(f"Current beta: {beta}, Current alpha: {alpha}")

            # Dominant label of the text still to be processed.
            pred = self.predict(text, k=1)
            label = pred[0][0]
            score = pred[1][0]
            log(f"Predicted label (P(c|s)): ({label}, {score:.2f}) from averaging text: {text}")
            log(f"Current dict_data (Vs) : {' '.join(x.split('_', 1)[1] for x in dict_data.keys())}")

            # Snapshot of the state so a rejected round can be undone in full.
            # mask_label_top_k works on a copy, so keeping the reference is enough.
            prev_text = text
            prev_dict_data = dict_data

            dict_data, dict_masked = self.mask_label_top_k(dict_data, label, beta, alpha)

            masked_text = ' '.join(x.split('_', 1)[1] for x in dict_masked.keys())
            log(f"masked text (words that fall within the beta threshold) : {masked_text}")
            text = ' '.join(x.split('_', 1)[1] for x in dict_data.keys())
            log(f"text (words remaining, not fall within alpha): {text}")

            masked_size = self.get_sizeof(masked_text)
            accepted = False

            if masked_size > min_length or index == 0:
                log(f"-------------------------")
                log(f"if condition when masked text > {min_length} or index = 0")
                log(f"\tcheck masked_text length")
                log(f"\tmasked_text length: {masked_size}")
                # Does the masked text on its own still carry the dominant label?
                temp_pred = self.predict(masked_text)
                log(f"\tpredicted label from masked text: {temp_pred}")

                if (temp_pred[1][0] > min_prob and temp_pred[0][0] == label) or index == 0:
                    log(f"\tif condition when : masked_text confidence label > {min_prob} AND its label=={label} OR index=0 ")
                    info[index] = {
                        'label': label,                                   # dominant label of this round
                        'text': masked_text,                              # the masked words as text
                        'text_keys': list(dict_masked.keys()),            # snapshot of the masked "<position>_<word>" keys
                        'size': masked_size,                              # UTF-8 byte length of the masked text
                        'sum_logit': self.sum_logits(dict_masked, label)  # total logit the masked words give to the label
                    }
                    log(f"\t\tlook what info has :")
                    log(f"\t\ttext_keys (saving all masked_text to info):")
                    for key in info[index]['text_keys']:
                        log(f"\t\t{key}")
                    log(f"\t\tsum logit: {info[index]['sum_logit']}")

                    index += 1
                    accepted = True
                else:
                    log(f"\telse condition, when masked text: confidence for top-1 label is not above {min_prob} OR its label differs from {label}, AND this is not the first iteration (index != 0)")
            else:
                log(f"else condition, when : masked_text not longer than {min_length} AND index != 0")

            if not accepted:
                # Undo the round: restore the text and the per-word scores, then
                # widen both thresholds for the next attempt.
                text = prev_text
                dict_data = prev_dict_data
                log(f"\tRollback to previous sentence form (without masking): {text}")
                beta += beta_step_increase
                alpha += alpha_step_increase
                log(f"\tincrease beta to {beta} and alpha to {alpha}")
                retry += 1

            log(f"Text remaining to process: {text}")

            if self.get_sizeof(text) < min_length:
                log(f"Remaining text to process is too short, less than {min_length}")
                break

            if index == max_lambda or retry == max_retry:
                log(f"Loop stopped. Final status: index = {index}, retry = {retry}, max_lambda = {max_lambda}, max_retry = {max_retry}")
            else:
                log(f"Loop continue. Final status: index = {index}, retry = {retry}, max_lambda = {max_lambda}, max_retry = {max_retry}")

        # Post-process: merge rounds that share a label and restore sentence order.
        post_info = {}
        for value in info.values():
            key = value['label']
            if key in post_info:
                post_info[key].extend(value['text_keys'])
            else:
                post_info[key] = list(value['text_keys'])

        for key in post_info:
            post_info[key] = ' '.join([x.split('_', 1)[1] for x in sorted(set(post_info[key]), key=self.custom_sort)])

        return post_info

In [5]:
# Label subset passed as `languages=` when building the GlotLID-based MaskLID
# instance (try2), so that only these labels are scored.
# NOTE(review): the name suggests this is the FLORES language set — confirm the source of the list.
flores_glotlid = ['__label__eng_Latn', '__label__arb_Arab', '__label__rus_Cyrl', '__label__por_Latn', '__label__pol_Latn', '__label__ekk_Latn', '__label__ell_Grek', '__label__slk_Latn', '__label__slv_Latn', '__label__nld_Latn', '__label__lvs_Latn', '__label__hun_Latn', '__label__dan_Latn', '__label__swe_Latn', '__label__lit_Latn', '__label__fin_Latn', '__label__mlt_Latn', '__label__cmn_Hani', '__label__nob_Latn', '__label__kor_Hang', '__label__ind_Latn', '__label__uzn_Latn', '__label__fil_Latn', '__label__ukr_Cyrl', '__label__hin_Deva', '__label__hin_Latn', '__label__afr_Latn', '__label__mar_Deva', '__label__ceb_Latn', '__label__ilo_Latn', '__label__zul_Latn', '__label__heb_Hebr', '__label__xho_Latn', '__label__vie_Latn', '__label__jpn_Jpan', '__label__guj_Gujr', '__label__hrv_Latn', '__label__tur_Latn', '__label__nya_Latn', '__label__tsn_Latn', '__label__sna_Latn', '__label__tso_Latn', '__label__tha_Thai', '__label__spa_Latn', '__label__deu_Latn', '__label__eus_Latn', '__label__bul_Cyrl', '__label__amh_Ethi', '__label__fra_Latn', '__label__ewe_Latn', '__label__mkd_Cyrl', '__label__nso_Latn', '__label__tam_Taml', '__label__lin_Latn', '__label__twi_Latn', '__label__yor_Latn', '__label__als_Latn', '__label__ibo_Latn', '__label__ben_Beng', '__label__ita_Latn', '__label__tpi_Latn', '__label__azj_Latn', '__label__run_Latn', '__label__mya_Mymr', '__label__kin_Latn', '__label__ron_Latn', '__label__ces_Latn', '__label__kat_Geor', '__label__urd_Arab', '__label__zsm_Latn', '__label__pap_Latn', '__label__bem_Latn', '__label__mal_Mlym', '__label__kir_Cyrl', '__label__hye_Armn', '__label__smo_Latn', '__label__sin_Sinh', '__label__fij_Latn', '__label__kan_Knda', '__label__pan_Guru', '__label__hau_Latn', '__label__epo_Latn', '__label__gaz_Latn', '__label__tir_Ethi', '__label__bos_Latn', '__label__srp_Cyrl', '__label__hat_Latn', '__label__pag_Latn', '__label__lua_Latn', '__label__war_Latn', '__label__tel_Telu', '__label__tat_Cyrl', '__label__sag_Latn', '__label__lug_Latn', 
'__label__tum_Latn', '__label__swh_Latn', '__label__umb_Latn', '__label__som_Latn', '__label__gle_Latn', '__label__kng_Latn', '__label__mos_Latn', '__label__lus_Latn', '__label__khk_Cyrl', '__label__asm_Beng', '__label__tuk_Latn', '__label__quy_Latn', '__label__ayr_Latn', '__label__luo_Latn', '__label__tgk_Cyrl', '__label__cat_Latn', '__label__ssw_Latn', '__label__nno_Latn', '__label__cym_Latn', '__label__kik_Latn', '__label__kmb_Latn', '__label__ory_Orya', '__label__bel_Cyrl', '__label__bho_Deva', '__label__apc_Arab', '__label__bak_Cyrl', '__label__jav_Latn', '__label__yue_Hani', '__label__pbt_Arab', '__label__khm_Khmr', '__label__npi_Deva', '__label__npi_Latn', '__label__gug_Latn', '__label__uig_Arab', '__label__fur_Latn', '__label__kbp_Latn', '__label__hne_Deva', '__label__kam_Latn', '__label__gla_Latn', '__label__kab_Latn', '__label__arz_Arab', '__label__kaz_Cyrl', '__label__mri_Latn', '__label__lim_Latn', '__label__srd_Latn', '__label__sun_Latn', '__label__plt_Latn', '__label__mni_Beng', '__label__isl_Latn', '__label__vec_Latn', '__label__glg_Latn', '__label__scn_Latn', '__label__fao_Latn', '__label__san_Deva', '__label__ltz_Latn', '__label__cjk_Latn', '__label__ast_Latn', '__label__lmo_Latn', '__label__szl_Latn', '__label__oci_Latn', '__label__fon_Latn', '__label__min_Latn', '__label__wol_Latn', '__label__lij_Latn', '__label__ajp_Arab', '__label__snd_Arab', '__label__dik_Latn', '__label__ary_Arab', '__label__lao_Laoo', '__label__ars_Arab', '__label__bjn_Latn', '__label__shn_Mymr', '__label__crh_Latn', '__label__aeb_Arab', '__label__ace_Latn', '__label__ckb_Arab', '__label__dyu_Latn', '__label__ltg_Latn', '__label__kmr_Latn', '__label__ban_Latn', '__label__mai_Deva', '__label__fuv_Latn', '__label__kac_Latn', '__label__taq_Latn', '__label__bam_Latn', '__label__sat_Olck', '__label__tzm_Tfng', '__label__bug_Latn', '__label__dzo_Tibt', '__label__kas_Deva', '__label__fas_Arab', '__label__nus_Latn', '__label__knc_Latn', '__label__mag_Deva', '__label__taq_Tfng', 
'__label__kas_Arab', '__label__knc_Arab', '__label__bjn_Arab', '__label__ace_Arab', '__label__kea_Latn', '__label__awa_Deva', '__label__acm_Arab', '__label__bod_Tibt', '__label__sot_Latn', '__label__ydd_Hebr', '__label__azb_Arab']

# Used below to build a MaskLID instance restricted to this limited set of languages.

In [6]:
# MaskLID on the OpenLID checkpoint; languages=-1 keeps every label the model provides.
try1 = MaskLID(model_path=open_model_path, languages=-1)

In [7]:
# MaskLID on the GlotLID checkpoint, restricted to the labels listed in flores_glotlid.
try2 = MaskLID(model_path=glot_model_path, languages=flores_glotlid)

In [8]:
# Evaluation data used under the "Turkish-English Dataset" headings below;
# the "Text" and "True Labels" columns are the ones read later.
df_te = pd.read_csv('df_te.csv')

In [9]:
# Evaluation data used under the "Turkish Only Dataset" headings below;
# the "Text" and "True Labels" columns are the ones read later.
df_t = pd.read_csv('df_turk.csv')

In [10]:
# Working copy for the OpenLID run: evaluate_masklid_predictions writes a
# prediction column into the frame it receives, so df_te itself stays untouched.
df_te1 = df_te.copy()

In [11]:
# Separate working copy of the same data for the GlotLID run.
df_te2 = df_te.copy()

In [12]:
# Working copy for the OpenLID run: evaluate_masklid_predictions writes a
# prediction column into the frame it receives, so df_t itself stays untouched.
df_t1 = df_t.copy()

In [13]:
# Separate working copy of the same data for the GlotLID run.
df_t2 = df_t.copy()

In [14]:
def evaluate_masklid_predictions(df, assignments, store_column="Predicted"):
    """Compare predicted label sets with the gold labels, row by row.

    Rows and ``assignments`` are matched by *position* (first row with first
    assignment, and so on), so the frame may carry any index.  The reported
    ``*_idx`` lists contain the frame's index labels.

    Args:
        df: frame with a "True Labels" column holding, per row, a set (or other
            iterable) of gold labels.  A ``store_column`` column containing the
            predicted label sets is written into ``df`` in place.
        assignments: one entry per row, each a list of ``(word, label)`` pairs.
        store_column: name of the column that receives the predicted sets.

    Returns:
        dict with the counts "EM", "PM", "FP", "Total", "Total_CS" and the
        index lists "EM_idx", "PM_idx", "FP_idx".  A row counts as
        code-switched (CS) when its gold set has more than one label.
    """
    em_indices = []
    pm_indices = []
    fp_indices = []
    predicted_sets = []
    total_cs = 0

    for position, (i, gold) in enumerate(df["True Labels"].items()):
        pred_labels = {label for (_, label) in assignments[position]}
        predicted_sets.append(pred_labels)

        if not isinstance(gold, set):
            gold = set(gold)

        if len(gold) > 1:  # CS sentence
            total_cs += 1
            if pred_labels == gold:
                em_indices.append(i)
                pm_indices.append(i)
            elif len(pred_labels) == 1 and next(iter(pred_labels)) in gold:
                pm_indices.append(i)
            elif len(pred_labels) > 1:
                # Multi-label prediction that misses exactly one gold label.
                if len(gold - pred_labels) == 1:
                    fp_indices.append(i)
        else:  # Mono sentence
            if pred_labels == gold:
                em_indices.append(i)
                pm_indices.append(i)
            elif gold & pred_labels:
                pm_indices.append(i)
            # FP not counted for mono sentences

    # Written in one go, aligned on the frame's own index.
    df[store_column] = pd.Series(predicted_sets, index=df.index, dtype=object)

    return {
        "EM": len(em_indices),
        "PM": len(pm_indices),
        "FP": len(fp_indices),
        "Total": len(df),
        "Total_CS": total_cs,
        "EM_idx": em_indices,
        "PM_idx": pm_indices,
        "FP_idx": fp_indices,
    }

OpenLID with Turkish-English Dataset

In [15]:
from gurobipy import Model, GRB, quicksum

# Configuration
min_len = 15  # minimum number of characters a language must cover to count as present
alpha = 2     # candidates taken per word (top-alpha logits) and cap on active languages

results = []
open_assignments_te = []

# One small ILP per sentence: give every word exactly one language so that the
# summed word-level logits are maximal, subject to the two limits above.
for idx, row in df_te1.iterrows():
    text = row["Text"]

    # Step 1: per-word logit scores from the fastText model wrapped by try1
    scores_dict = try1.compute_v_per_word(text)
    words = list(scores_dict.keys())

    if not words:
        # No words to label: record an empty assignment without building a model.
        assignment = []
        open_assignments_te.append(assignment)
        results.append(assignment)
        continue

    # Step 2: candidate languages = union of every word's top-alpha labels
    top_alpha_languages = set()
    for word in words:
        sorted_logits = sorted(scores_dict[word]['logits'], key=lambda pair: -pair[1])
        top_alpha_languages.update(lang for lang, _ in sorted_logits[:alpha])
    # Sorted so the model is built in the same order on every run.
    languages = sorted(top_alpha_languages)

    # Step 3: logit values b_ij (parameters, not variables) for the candidates
    b = {}
    for i in words:
        for j, score in scores_dict[i]['logits']:
            if j in top_alpha_languages:
                b[(i, j)] = score

    # Step 4: build the Gurobi model
    model = Model("MaskLID_ILP")
    model.setParam('OutputFlag', 0)

    # x_ij = 1 when word i is labelled with language j; y_j = 1 when language j is used
    x = {(i, j): model.addVar(vtype=GRB.BINARY, name=f"x_{i}_{j}") for (i, j) in b}
    y = {j: model.addVar(vtype=GRB.BINARY, name=f"y_{j}") for j in languages}
    model.update()

    # Character length of each word (keys look like "<position>_<word>")
    word_lengths = {w: len(w.split('_', 1)[1]) for w in words}

    # Constraint 1: every word gets exactly one language
    for i in words:
        model.addConstr(quicksum(x[(i, j)] for j in languages if (i, j) in x) == 1)

    # Constraint 2: a language can only receive words when it is active
    for j in languages:
        model.addConstr(quicksum(x[(i, j)] for i in words if (i, j) in x) <= len(words) * y[j])

    # Constraint 3: an active language must cover at least min_len characters
    for j in languages:
        model.addConstr(
            quicksum(x[(i, j)] * word_lengths[i] for i in words if (i, j) in x) >= min_len * y[j],
            name=f"length_constraint_{j}"
        )

    # Constraint 4: at most alpha active languages
    model.addConstr(quicksum(y[j] for j in languages) <= alpha)

    # Objective: maximize the total logit of the chosen labels
    model.setObjective(quicksum(x[(i, j)] * b[(i, j)] for (i, j) in x), GRB.MAXIMIZE)
    model.optimize()

    if model.SolCount > 0:
        assignment = [(i.split('_', 1)[1], j) for (i, j) in x if x[(i, j)].X > 0.5]
    elif languages:
        # No feasible solution (e.g. the whole sentence is shorter than min_len,
        # so constraint 3 cannot hold).  Reading .X would fail here; fall back to
        # the single candidate language with the highest total logit.
        best_language = max(languages, key=lambda j: sum(b[(i, j)] for i in words if (i, j) in b))
        assignment = [(i.split('_', 1)[1], best_language) for i in words]
    else:
        assignment = []

    open_assignments_te.append(assignment)
    results.append(assignment)

Set parameter Username
Academic license - for non-commercial use only - expires 2025-11-12


In [16]:
# all_assignments[:10]

In [17]:
import ast

# Parse the stringified label collections (the frame was read from CSV) into
# real Python objects.  Each value is checked on its own instead of looking at
# the first row only, so the cell is safe to re-run, works on an empty frame
# and copes with a column that is only partly parsed.
df_te1["True Labels"] = df_te1["True Labels"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [18]:
# Score the OpenLID assignments on the Turkish-English data and print the headline counts.
result_te1 = evaluate_masklid_predictions(df_te1, open_assignments_te)

summary_lines = (
    f"Exact Match (EM): {result_te1['EM']} ",
    f"Partial Match (PM): {result_te1['PM']}",
    f"False Positives (FP - CS only): {result_te1['FP']}",
)
for summary_line in summary_lines:
    print(summary_line)

Exact Match (EM): 135 
Partial Match (PM): 143
False Positives (FP - CS only): 186


GlotLID with Turkish-English Dataset

In [19]:

from gurobipy import Model, GRB, quicksum

# Configuration
min_len = 15  # minimum number of characters a language must cover to count as present
alpha = 2     # candidates taken per word (top-alpha logits) and cap on active languages

results = []
glot_assignments_te = []

# One small ILP per sentence: give every word exactly one language so that the
# summed word-level logits are maximal, subject to the two limits above.
for idx, row in df_te2.iterrows():
    text = row["Text"]

    # Step 1: per-word logit scores from the fastText model wrapped by try2
    scores_dict = try2.compute_v_per_word(text)
    words = list(scores_dict.keys())

    if not words:
        # No words to label: record an empty assignment without building a model.
        assignment = []
        glot_assignments_te.append(assignment)
        results.append(assignment)
        continue

    # Step 2: candidate languages = union of every word's top-alpha labels
    top_alpha_languages = set()
    for word in words:
        sorted_logits = sorted(scores_dict[word]['logits'], key=lambda pair: -pair[1])
        top_alpha_languages.update(lang for lang, _ in sorted_logits[:alpha])
    # Sorted so the model is built in the same order on every run.
    languages = sorted(top_alpha_languages)

    # Step 3: logit values b_ij (parameters, not variables) for the candidates
    b = {}
    for i in words:
        for j, score in scores_dict[i]['logits']:
            if j in top_alpha_languages:
                b[(i, j)] = score

    # Step 4: build the Gurobi model
    model = Model("MaskLID_ILP")
    model.setParam('OutputFlag', 0)

    # x_ij = 1 when word i is labelled with language j; y_j = 1 when language j is used
    x = {(i, j): model.addVar(vtype=GRB.BINARY, name=f"x_{i}_{j}") for (i, j) in b}
    y = {j: model.addVar(vtype=GRB.BINARY, name=f"y_{j}") for j in languages}
    model.update()

    # Character length of each word (keys look like "<position>_<word>")
    word_lengths = {w: len(w.split('_', 1)[1]) for w in words}

    # Constraint 1: every word gets exactly one language
    for i in words:
        model.addConstr(quicksum(x[(i, j)] for j in languages if (i, j) in x) == 1)

    # Constraint 2: a language can only receive words when it is active
    for j in languages:
        model.addConstr(quicksum(x[(i, j)] for i in words if (i, j) in x) <= len(words) * y[j])

    # Constraint 3: an active language must cover at least min_len characters
    for j in languages:
        model.addConstr(
            quicksum(x[(i, j)] * word_lengths[i] for i in words if (i, j) in x) >= min_len * y[j],
            name=f"length_constraint_{j}"
        )

    # Constraint 4: at most alpha active languages
    model.addConstr(quicksum(y[j] for j in languages) <= alpha)

    # Objective: maximize the total logit of the chosen labels
    model.setObjective(quicksum(x[(i, j)] * b[(i, j)] for (i, j) in x), GRB.MAXIMIZE)
    model.optimize()

    if model.SolCount > 0:
        assignment = [(i.split('_', 1)[1], j) for (i, j) in x if x[(i, j)].X > 0.5]
    elif languages:
        # No feasible solution (e.g. the whole sentence is shorter than min_len,
        # so constraint 3 cannot hold).  Reading .X would fail here; fall back to
        # the single candidate language with the highest total logit.
        best_language = max(languages, key=lambda j: sum(b[(i, j)] for i in words if (i, j) in b))
        assignment = [(i.split('_', 1)[1], best_language) for i in words]
    else:
        assignment = []

    glot_assignments_te.append(assignment)
    results.append(assignment)

In [20]:
import ast


# Parse the stringified label collections (the frame was read from CSV) into
# real Python objects.  Each value is checked on its own instead of looking at
# the first row only, so the cell is safe to re-run, works on an empty frame
# and copes with a column that is only partly parsed.
df_te2["True Labels"] = df_te2["True Labels"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [21]:
# Score the GlotLID assignments against df_te2, the frame they were computed
# from and whose "True Labels" were parsed in the previous cell.  Passing
# df_te1 here would overwrite the OpenLID "Predicted" column stored in that
# frame and leave df_te2 without predictions.
result_te2 = evaluate_masklid_predictions(df_te2, glot_assignments_te)

print(f"Exact Match (EM): {result_te2['EM']} ")
print(f"Partial Match (PM): {result_te2['PM']}")
print(f"False Positives (FP - CS only): {result_te2['FP']}")

Exact Match (EM): 127 
Partial Match (PM): 133
False Positives (FP - CS only): 199


OpenLID with Turkish Only Dataset

In [22]:
#Final
from gurobipy import Model, GRB, quicksum

# Configuration
min_len = 15  # minimum number of characters a language must cover to count as present
alpha = 2     # candidates taken per word (top-alpha logits) and cap on active languages

results = []
open_assignments_t = []

# One small ILP per sentence: give every word exactly one language so that the
# summed word-level logits are maximal, subject to the two limits above.
for idx, row in df_t1.iterrows():
    text = row["Text"]

    # Step 1: per-word logit scores from the fastText model wrapped by try1
    scores_dict = try1.compute_v_per_word(text)
    words = list(scores_dict.keys())

    if not words:
        # No words to label: record an empty assignment without building a model.
        assignment = []
        open_assignments_t.append(assignment)
        results.append(assignment)
        continue

    # Step 2: candidate languages = union of every word's top-alpha labels
    top_alpha_languages = set()
    for word in words:
        sorted_logits = sorted(scores_dict[word]['logits'], key=lambda pair: -pair[1])
        top_alpha_languages.update(lang for lang, _ in sorted_logits[:alpha])
    # Sorted so the model is built in the same order on every run.
    languages = sorted(top_alpha_languages)

    # Step 3: logit values b_ij (parameters, not variables) for the candidates
    b = {}
    for i in words:
        for j, score in scores_dict[i]['logits']:
            if j in top_alpha_languages:
                b[(i, j)] = score

    # Step 4: build the Gurobi model
    model = Model("MaskLID_ILP")
    model.setParam('OutputFlag', 0)

    # x_ij = 1 when word i is labelled with language j; y_j = 1 when language j is used
    x = {(i, j): model.addVar(vtype=GRB.BINARY, name=f"x_{i}_{j}") for (i, j) in b}
    y = {j: model.addVar(vtype=GRB.BINARY, name=f"y_{j}") for j in languages}
    model.update()

    # Character length of each word (keys look like "<position>_<word>")
    word_lengths = {w: len(w.split('_', 1)[1]) for w in words}

    # Constraint 1: every word gets exactly one language
    for i in words:
        model.addConstr(quicksum(x[(i, j)] for j in languages if (i, j) in x) == 1)

    # Constraint 2: a language can only receive words when it is active
    for j in languages:
        model.addConstr(quicksum(x[(i, j)] for i in words if (i, j) in x) <= len(words) * y[j])

    # Constraint 3: an active language must cover at least min_len characters
    for j in languages:
        model.addConstr(
            quicksum(x[(i, j)] * word_lengths[i] for i in words if (i, j) in x) >= min_len * y[j],
            name=f"length_constraint_{j}"
        )

    # Constraint 4: at most alpha active languages
    model.addConstr(quicksum(y[j] for j in languages) <= alpha)

    # Objective: maximize the total logit of the chosen labels
    model.setObjective(quicksum(x[(i, j)] * b[(i, j)] for (i, j) in x), GRB.MAXIMIZE)
    model.optimize()

    if model.SolCount > 0:
        assignment = [(i.split('_', 1)[1], j) for (i, j) in x if x[(i, j)].X > 0.5]
    elif languages:
        # No feasible solution (e.g. the whole sentence is shorter than min_len,
        # so constraint 3 cannot hold).  Reading .X would fail here; fall back to
        # the single candidate language with the highest total logit.
        best_language = max(languages, key=lambda j: sum(b[(i, j)] for i in words if (i, j) in b))
        assignment = [(i.split('_', 1)[1], best_language) for i in words]
    else:
        assignment = []

    open_assignments_t.append(assignment)
    results.append(assignment)

In [23]:
import ast


# Parse the stringified label collections (the frame was read from CSV) into
# real Python objects.  Each value is checked on its own instead of looking at
# the first row only, so the cell is safe to re-run, works on an empty frame
# and copes with a column that is only partly parsed.
df_t1["True Labels"] = df_t1["True Labels"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [24]:
# Score the OpenLID assignments on the Turkish-only data and print the headline counts.
result_t1 = evaluate_masklid_predictions(df_t1, open_assignments_t)

summary_lines = (
    f"Exact Match (EM): {result_t1['EM']} ",
    f"Partial Match (PM): {result_t1['PM']}",
    f"False Positives (FP - CS only): {result_t1['FP']}",
)
for summary_line in summary_lines:
    print(summary_line)

Exact Match (EM): 107 
Partial Match (PM): 332
False Positives (FP - CS only): 0


GlotLID with Turkish Only Dataset

In [25]:
#Final
from gurobipy import Model, GRB, quicksum

# Configuration
min_len = 15  # minimum number of characters a language must cover to count as present
alpha = 2     # candidates taken per word (top-alpha logits) and cap on active languages

results = []
glot_assignments_t = []

# One small ILP per sentence: give every word exactly one language so that the
# summed word-level logits are maximal, subject to the two limits above.
for idx, row in df_t2.iterrows():
    text = row["Text"]

    # Step 1: per-word logit scores from the fastText model wrapped by try2
    scores_dict = try2.compute_v_per_word(text)
    words = list(scores_dict.keys())

    if not words:
        # No words to label: record an empty assignment without building a model.
        assignment = []
        glot_assignments_t.append(assignment)
        results.append(assignment)
        continue

    # Step 2: candidate languages = union of every word's top-alpha labels
    top_alpha_languages = set()
    for word in words:
        sorted_logits = sorted(scores_dict[word]['logits'], key=lambda pair: -pair[1])
        top_alpha_languages.update(lang for lang, _ in sorted_logits[:alpha])
    # Sorted so the model is built in the same order on every run.
    languages = sorted(top_alpha_languages)

    # Step 3: logit values b_ij (parameters, not variables) for the candidates
    b = {}
    for i in words:
        for j, score in scores_dict[i]['logits']:
            if j in top_alpha_languages:
                b[(i, j)] = score

    # Step 4: build the Gurobi model
    model = Model("MaskLID_ILP")
    model.setParam('OutputFlag', 0)

    # x_ij = 1 when word i is labelled with language j; y_j = 1 when language j is used
    x = {(i, j): model.addVar(vtype=GRB.BINARY, name=f"x_{i}_{j}") for (i, j) in b}
    y = {j: model.addVar(vtype=GRB.BINARY, name=f"y_{j}") for j in languages}
    model.update()

    # Character length of each word (keys look like "<position>_<word>")
    word_lengths = {w: len(w.split('_', 1)[1]) for w in words}

    # Constraint 1: every word gets exactly one language
    for i in words:
        model.addConstr(quicksum(x[(i, j)] for j in languages if (i, j) in x) == 1)

    # Constraint 2: a language can only receive words when it is active
    for j in languages:
        model.addConstr(quicksum(x[(i, j)] for i in words if (i, j) in x) <= len(words) * y[j])

    # Constraint 3: an active language must cover at least min_len characters
    for j in languages:
        model.addConstr(
            quicksum(x[(i, j)] * word_lengths[i] for i in words if (i, j) in x) >= min_len * y[j],
            name=f"length_constraint_{j}"
        )

    # Constraint 4: at most alpha active languages
    model.addConstr(quicksum(y[j] for j in languages) <= alpha)

    # Objective: maximize the total logit of the chosen labels
    model.setObjective(quicksum(x[(i, j)] * b[(i, j)] for (i, j) in x), GRB.MAXIMIZE)
    model.optimize()

    if model.SolCount > 0:
        assignment = [(i.split('_', 1)[1], j) for (i, j) in x if x[(i, j)].X > 0.5]
    elif languages:
        # No feasible solution (e.g. the whole sentence is shorter than min_len,
        # so constraint 3 cannot hold).  Reading .X would fail here; fall back to
        # the single candidate language with the highest total logit.
        best_language = max(languages, key=lambda j: sum(b[(i, j)] for i in words if (i, j) in b))
        assignment = [(i.split('_', 1)[1], best_language) for i in words]
    else:
        assignment = []

    glot_assignments_t.append(assignment)
    results.append(assignment)

In [26]:
import ast

# Parse the stringified label collections (the frame was read from CSV) into
# real Python objects.  Each value is checked on its own instead of looking at
# the first row only, so the cell is safe to re-run, works on an empty frame
# and copes with a column that is only partly parsed.
df_t2["True Labels"] = df_t2["True Labels"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [27]:
# Score the GlotLID assignments on the Turkish-only data and print the headline counts.
result_t2 = evaluate_masklid_predictions(df_t2, glot_assignments_t)

summary_lines = (
    f"Exact Match (EM): {result_t2['EM']} ",
    f"Partial Match (PM): {result_t2['PM']}",
    f"False Positives (FP - CS only): {result_t2['FP']}",
)
for summary_line in summary_lines:
    print(summary_line)

Exact Match (EM): 109 
Partial Match (PM): 332
False Positives (FP - CS only): 0
