In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.ticker
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm

# import mplcairo
# matplotlib.use("module://mplcairo.macosx")
# print(matplotlib.get_backend())

In [2]:
# !coman-python src/results.py udpos -q -mt ft --digest b30906d4 --sort_first

In [3]:
# with open("research/udpos/langs-ud.txt") as f:
#     ud_langs = {line.rstrip().split(" ", maxsplit=1)[0]: line.rstrip().split(" ", maxsplit=1)[1] for line in f}

ud_df = pd.read_csv("exports/udpos-langs.csv")
ud_langs = {row["language_id"]: row["language"] for _, row in ud_df.iterrows()}
ud_langs_scripts = {row["language"]: row["script"] for _, row in ud_df.iterrows()}

wiki_df = pd.read_csv("exports/wiki-langs.csv")
wiki_langs = {row["language_id"]: row["language"] for _, row in wiki_df.iterrows()}
# wiki_langs_scripts = {row["language"]: row["script"] for _, row in wiki_df.iterrows()}

with open("research/udpos/fams.txt") as f:
    lang_branches = {line.rstrip().split(" ", maxsplit=1)[0].replace("_", " "): line.rstrip().split(" ", maxsplit=1)[1] for line in f}

lang_fams = {lang: "Indo-European" if branch.startswith("IE") else branch for lang, branch in lang_branches.items()}

# with open("research/udpos/langs-xlmr.txt") as f:
#     model_langs = {line.rstrip().split(" ", maxsplit=1)[0]: line.rstrip().split(" ", maxsplit=1)[1] for line in f}

# with open("research/udpos/langs-xlmr-warn.txt") as f:
#     model_langs_warn = {line.rstrip().split(" ", maxsplit=1)[0]: line.rstrip().split(" ", maxsplit=1)[1] for line in f}

ldnd_df = pd.read_csv("research/udpos/ldnd.csv", index_col=0)
order_df = pd.read_csv("research/udpos/word-order.csv", index_col=1)

model_langs_warn = {}

ud_lang_names = set(ud_df.language.tolist())
model_lang_names = set(wiki_df.language.tolist())
model_lang_names_warn = set()

In [4]:
print(len(ud_lang_names), len(model_lang_names), len(ud_lang_names & model_lang_names), len(ud_lang_names - model_lang_names))

105 94 58 47


In [18]:
n_samples = 10_000
if n_samples < 0:
    df = pd.read_csv(f"exports/udpos-ft-{-n_samples}epoch.csv")
else:
    df = pd.read_csv(f"exports/udpos-ft-{n_samples:0>5}.csv")

size_df = pd.read_csv("exports/udpos-sizes-train.csv", usecols=["language","size"], index_col="language")
if n_samples < 0:
    df["full_train_size"] = df.lang_train.map(lambda x: size_df.loc[x, "size"])
    df["train_size"] = df["full_train_size"]
else:
    df["full_train_size"] = df.lang_train.map(lambda x: size_df.loc[x, "size"])
    df["train_size"] = df.full_train_size.map(lambda x: min(x, n_samples))  # type: ignore

df["train_size_tiny"] = df.train_size < 200
    
df["num_samples"] = n_samples
df["sampling"] = "none" if n_samples < 0 else df.train_size.map(lambda x: "undersampled" if x >= n_samples else "oversampled")
df["sampling"] = df.sampling.astype("category")

df["lang_train"] = df.lang_train.map(ud_langs.get)
df["lang_pred"] = df.lang_pred.map(ud_langs.get)
df["lang_train_pred"] = df["lang_train"] + df["lang_pred"]
df["lang_same"] = df["lang_train"] == df["lang_pred"]

df["lang_train_family"] = df.lang_train.map(lang_fams.get)
df["lang_pred_family"] = df.lang_pred.map(lang_fams.get)
df["family_same"] = df["lang_train_family"] == df["lang_pred_family"]

df["lang_train_branch"] = df.lang_train.map(lang_branches.get)
df["lang_pred_branch"] = df.lang_pred.map(lang_branches.get)
df["branch_same"] = df["lang_train_branch"] == df["lang_pred_branch"]

df["lang_train_ie"] = df.lang_train.map(lambda x: lang_fams.get(x).startswith("IE"))
df["lang_pred_ie"] = df.lang_pred.map(lambda x: lang_fams.get(x).startswith("IE"))
df["ie_same"] = df["lang_train_ie"] == df["lang_pred_ie"]

df["lang_train_pretrained"] = df.lang_train.map(lambda x: x in model_lang_names)
df["lang_pred_pretrained"] = df.lang_pred.map(lambda x: x in model_lang_names)
df["pretrained_same"] = df["lang_train_pretrained"] == df["lang_pred_pretrained"]

df["related_train_pretrained"] = df.lang_train.map(lambda x: x in model_lang_names or x in ["Western Armenian", "Faroese", "Old East Slavic"])
df["related_pred_pretrained"] = df.lang_pred.map(lambda x: x in model_lang_names or x in ["Western Armenian", "Faroese", "Old East Slavic"])

df["lang_train_script"] = df.lang_train.map(ud_langs_scripts.get)
df["lang_pred_script"] = df.lang_pred.map(ud_langs_scripts.get)
df["script_same"] = df["lang_train_script"] == df["lang_pred_script"]


def get_ldnd(x):
    try:
        y = ldnd_df.loc[x.lang_train, x.lang_pred]
    except KeyError:
        y = 100
    if pd.isnull(y):
        return 0
    return y
df["ldnd"] = df.apply(get_ldnd, axis=1)

def get_sov(x):
    try:
        return order_df.loc[x.lang_train, x.lang_pred].SOV
    except KeyError:
        return np.nan

# ["language", "SOV", "SV", "OV", "OOV", "AdpNP", "GN", "AN", "DN", "NN", "RN", "QP"]
df["sov_order_train"] = df.lang_train.map(lambda x: order_df.SOV.get(x, "Unknown"))
df["sov_order_pred"] = df.lang_pred.map(lambda x: order_df.SOV.get(x, "Unknown"))
df["sov_order_same"] = df["sov_order_train"] == df["sov_order_pred"]

script_types = {
    "latin": "alphabetic",
    "cyrillic": "alphabetic",
    "armenian": "alphabetic",
    "hangul": "logosyllabic",
    "chinese": "logosyllabic",
    "greek": "alphabetic",
    "devanagari": "abugida",
    "tamil": "abugida",
    "arabic": "abjad",
    "kana": "logosyllabic",
    "telugu": "abugida",
    "hebrew": "abjad",
    "syriac": "abjad",
    "thai": "abjad",
    "old turkic": "alphabetic",
}

df["lang_train_script_type"] = df.lang_train_script.map(lambda x: script_types[x])
df["lang_pred_script_type"] = df.lang_pred_script.map(lambda x: script_types[x])
df["script_type_same"] = df["lang_train_script_type"] == df["lang_pred_script_type"]

df = df.loc[~df.lang_same]
df

Unnamed: 0.1,Unnamed: 0,task_type,task_name,digest,lang_train,lang_pred,model_id,model_type,score,full_train_size,...,lang_train_script,lang_pred_script,script_same,ldnd,sov_order_train,sov_order_pred,sov_order_same,lang_train_script_type,lang_pred_script_type,script_type_same
1,1,token-classification,udpos28,1d6ca3e8,English,Dutch,xlm-roberta-base,ft,90.391066,19912,...,latin,latin,True,61.13,SVO,No dominant order,False,alphabetic,alphabetic,True
2,2,token-classification,udpos28,1d6ca3e8,English,German,xlm-roberta-base,ft,88.613954,19912,...,latin,latin,True,67.33,SVO,No dominant order,False,alphabetic,alphabetic,True
3,3,token-classification,udpos28,1d6ca3e8,English,Italian,xlm-roberta-base,ft,87.774192,19912,...,latin,latin,True,89.88,SVO,SVO,True,alphabetic,alphabetic,True
4,4,token-classification,udpos28,1d6ca3e8,English,French,xlm-roberta-base,ft,87.440783,19912,...,latin,latin,True,91.35,SVO,SVO,True,alphabetic,alphabetic,True
5,5,token-classification,udpos28,1d6ca3e8,English,Spanish,xlm-roberta-base,ft,90.279240,19912,...,latin,latin,True,94.14,SVO,SVO,True,alphabetic,alphabetic,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6819,6819,token-classification,udpos28,1d6ca3e8,Uyghur,Western Armenian,xlm-roberta-base,ft,71.766482,1543,...,arabic,armenian,False,99.46,SOV,SOV,True,abjad,alphabetic,False
6820,6820,token-classification,udpos28,1d6ca3e8,Uyghur,Scottish Gaelic,xlm-roberta-base,ft,48.034398,1543,...,arabic,latin,False,101.36,SOV,VSO,False,abjad,alphabetic,False
6821,6821,token-classification,udpos28,1d6ca3e8,Uyghur,Khunsari,xlm-roberta-base,ft,52.702703,1543,...,arabic,arabic,True,100.00,SOV,Unknown,False,abjad,abjad,True
6822,6822,token-classification,udpos28,1d6ca3e8,Uyghur,Hebrew,xlm-roberta-base,ft,77.083333,1543,...,arabic,hebrew,False,100.13,SOV,SVO,False,abjad,abjad,True


In [19]:
df.reset_index().to_csv("results.csv")

In [16]:
df.loc[df.lang_train_pretrained != df.related_train_pretrained].lang_train.unique()

array(['Old East Slavic', 'Faroese', 'Western Armenian'], dtype=object)

In [7]:
m0 = smf.mixedlm(f"score ~ family_same + lang_pred_pretrained + pretrained_same + lang_pred_ie + ie_same + sov_order_pred + sov_order_same + ldnd", groups=df["lang_pred"], data=df).fit(method=["lbfgs"])
m1 = smf.mixedlm(f"score ~ lang_pred_pretrained + pretrained_same + lang_pred_ie + ie_same + sov_order_pred + sov_order_same + ldnd", groups=df["lang_pred"], data=df).fit(method=["lbfgs"])

# anova_lm(m0, m1)
m0.summary()

LinAlgError: Singular matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
sns.lmplot()

In [11]:
df.loc[:, ["lang_pred_script", "lang_pred_script_type"]].drop_duplicates()

Unnamed: 0,lang_pred_script,lang_pred_script_type
1,latin,alphabetic
6,cyrillic,alphabetic
12,armenian,alphabetic
30,hangul,logosyllabic
35,syriac,abjad
39,chinese,logosyllabic
45,greek,alphabetic
49,devanagari,abugida
50,thai,abjad
57,tamil,abugida
