# Language classification analysis

In [11]:
import os
import csv
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [2]:
# Load the data from flores200
dev_dir = "data/flores200_dataset/dev"
dev_lines = []
dev_labels = []
for filename in os.listdir(dev_dir):
    if not filename.endswith(".dev"):
        continue
    label = filename[:-4]
    with open(os.path.join(dev_dir, filename), "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            dev_lines.append(line)
            dev_labels.append(label)
trg_langs = list(set(dev_labels))

In [3]:
# Create a vectorizer.
vectorizer = CountVectorizer(ngram_range=(1, 5), analyzer='char', min_df=2)

# Train the classifier.
print(dev_lines[:5])
print("Vectorizing data...")
vectorized_data = vectorizer.fit_transform(dev_lines)


['Tɛnɛndo, Stanford kɛnɛya sanfɛkalanso donnikɛbaw ye baana sƐgƐ sƐgƐli minan kura do dilani kumalase min ni a be se ka seliliw suguya woloma: pisi fitifitinin do min  bese ka dilan ni Ɛnpirmanti ankirima ye US wari tama ɲɔgɔnna a kelen o kelen songɔ ye .', 'Ɲinninikɛla jɔnjɔnw  ko ko o be se ka to boɔ bana, Sɔgɔsɔgɔnijɛ, SIDA bana ani Sumaya sƐgƐsƐgƐli joona ka se ka kƐ banabatɔw dɛsɛbato jamana la, yɔrɔ minw na ni boɔbanatɔ inafɔ sin na boɔ balota hakƐ bese ka dɔgɔya ni setigi jamanaw ta tila.', 'Pankunru Gripen JAS 39C binna pankunru jigi kɛnɛ dɔ kan sɔkɔma nɛkɛ kan ɲɛ 9:30 (0230 UTC) ni ka mɛnɛ, min naara kɛra sabu ye ka pankunru jiginkɛnɛ datugu jakokɛ pankunru ɲɛ. pankuluw ɲɛ.', 'Pankulu boli ba lakodɔnna ka kɛ Esekadɔron Leader Dilokrit Pattavee ye.', "Yen kunafoni dilaw y' a lase ko pankunruw sow tasuma fagamobili dɔ binna u tasuma fagatɔ."]
Vectorizing data...


In [4]:
# Create a classifier.
classifier = LogisticRegression(C=0.1, penalty='l2', solver='saga', multi_class='multinomial', verbose=1)
# classifier = LinearSVC(C=1.0, penalty='l2', multi_class='crammer_singer', verbose=1)

print("Training classifier...")
classifier.fit(vectorized_data, dev_labels)

Training classifier...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Epoch 1, change: 1.00000000
Epoch 2, change: 0.29965513
Epoch 3, change: 0.21586620
Epoch 4, change: 0.16529818
Epoch 5, change: 0.12484934
Epoch 6, change: 0.09648277
Epoch 7, change: 0.07956629
Epoch 8, change: 0.06747003
Epoch 9, change: 0.05816456
Epoch 10, change: 0.05100162
Epoch 11, change: 0.04515654
Epoch 12, change: 0.04071531
Epoch 13, change: 0.03676552
Epoch 14, change: 0.03355247
Epoch 15, change: 0.03073323
Epoch 16, change: 0.02845569
Epoch 17, change: 0.02633987
Epoch 18, change: 0.02472991
Epoch 19, change: 0.02306868
Epoch 20, change: 0.02171849
Epoch 21, change: 0.02043641
Epoch 22, change: 0.01940903
Epoch 23, change: 0.01839339
Epoch 24, change: 0.01728161
Epoch 25, change: 0.01644636
Epoch 26, change: 0.01558724
Epoch 27, change: 0.01482529
Epoch 28, change: 0.01411338
Epoch 29, change: 0.01348594
Epoch 30, change: 0.01284558
Epoch 31, change: 0.01229666
Epoch 32, change: 0.01181610
Epoch 33, change: 0.01132035
Epoch 34, change: 0.01088529
Epoch 35, change: 0.010

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 129.8min finished


In [7]:
def load_hyp_from_tsv(filename: str) -> list[str]:
    with open(filename, "r") as f:
        # Load the third column from the csv reader after the header
        reader = csv.reader(f, delimiter="\t")
        next(reader)
        return [x[2] for x in reader]

# get all the files in the system_outputs folder
all_results = {}
for subdir in os.listdir("system_outputs"):
    # The sub_directory must be a directory
    subdir_path = os.path.join("system_outputs", subdir)
    if not os.path.isdir(subdir_path):
        raise ValueError(f"{subdir_path=} is not a directory")
    # get all the files in the sub_directory
    all_files = os.listdir(subdir_path)
    for trg_lang in trg_langs:
        # find the tsv file
        trg_file = [f for f in all_files if f.endswith(f"{trg_lang}.tsv")]
        if len(trg_file) == 1:
            all_results[trg_lang, subdir] = load_hyp_from_tsv(os.path.join(subdir_path, trg_file[0]))
            continue
        # find the hyp file
        trg_file = [f for f in all_files if f.endswith(f"{trg_lang}-devtest.hyp")]
        if len(trg_file) == 1:
            with open(os.path.join(subdir_path, trg_file[0]), "r") as f:
                all_results[trg_lang, subdir] = [x.strip() for x in f.readlines()]
            continue

In [10]:
all_langs = {}
for trg_lang, model in sorted(all_results.keys()):
    # Do classification
    data = all_results[(trg_lang, model)]
    encoded_data = vectorizer.transform(data)
    predicted_langs = classifier.predict(encoded_data)
    all_langs[(trg_lang, model)] = predicted_langs
    counts = Counter(predicted_langs)
    # Print top 5 counts
    top_5 = ", ".join(f"{lang}:{count}" for lang, count in counts.most_common(5))
    print(f"{trg_lang}, {model}: acc={counts[trg_lang]/len(predicted_langs):.2f}, top 5: {top_5}")

ace_Arab, nllb_moe: acc=0.11, top 5: min_Arab:489, bjn_Arab:340, ace_Arab:108, knc_Arab:15, ind_Latn:15
ace_Arab, tt-five: acc=0.58, top 5: ace_Arab:585, bjn_Arab:218, ind_Latn:61, min_Arab:45, ace_Latn:26
ace_Arab, tt-zero: acc=0.10, top 5: bjn_Arab:289, arb_Arab:118, ace_Arab:102, ace_Latn:81, ars_Arab:67
ace_Latn, nllb_moe: acc=0.68, top 5: ace_Latn:686, ind_Latn:210, zsm_Latn:58, sun_Latn:27, ban_Latn:10
ace_Latn, tt-five: acc=0.82, top 5: ace_Latn:826, ind_Latn:126, zsm_Latn:25, sun_Latn:10, luo_Latn:9
ace_Latn, tt-zero: acc=0.62, top 5: ace_Latn:627, ind_Latn:239, zsm_Latn:54, sun_Latn:38, min_Latn:16
acm_Arab, gpt4_tt-five: acc=0.19, top 5: arb_Arab:266, acm_Arab:190, ars_Arab:156, acq_Arab:116, aeb_Arab:76
acm_Arab, nllb_moe: acc=0.12, top 5: knc_Arab:296, arb_Arab:173, ars_Arab:122, acm_Arab:120, acq_Arab:100
acm_Arab, tt-five: acc=0.07, top 5: arb_Arab:339, ars_Arab:254, acq_Arab:106, aeb_Arab:74, acm_Arab:71
acm_Arab, tt-zero: acc=0.08, top 5: arb_Arab:314, ars_Arab:215, aeb

In [13]:
all_models = sorted(list(set([x for _, x in all_results.keys()])))
lang_id_accuracies = {x: [] for x in all_models}
for (lang, model), langid_results in all_langs.items():
    lang_id_accuracies[model].extend(1.0 if x == lang else 0 for x in langid_results)
for lang, accuracies in lang_id_accuracies.items():
    lang_id_accuracies[lang] = sum(accuracies) / len(accuracies)
    print(f"{lang} lang_id_accuracy: {lang_id_accuracies[lang]}")

gpt4_tt-five lang_id_accuracy: 0.8966897233201581
nllb_moe lang_id_accuracy: 0.909764419011661
tt-five lang_id_accuracy: 0.8335929437878463
tt-zero lang_id_accuracy: 0.7240600478981288
