# Note
This is all work in progress, the main point of these plots is to show that
family / genus is not a great proxy for 'typologically diverse' sampling.
Ideas currently:
- Closest language based on typ distance is not necessarily in the same fam (what does this mean)
- Overlap of features varies considerably (what does this mean)
- ...

In [45]:
import pandas as pd
from pathlib import Path
import numpy as np
import itertools
import altair as alt

DATA = Path().cwd().parent / "data"

gb = pd.read_csv(DATA / "gb_lang_feat_vals.csv", index_col="Lang_ID")
gb = gb.drop(["Unnamed: 0"], axis=1)
df = pd.read_csv("../grambank/cldf/languages.csv")
df_f = pd.read_csv("../grambank/cldf/families.csv")
df_w = pd.read_csv(DATA / "wals_dedup.csv")
dists = pd.read_csv(DATA / "gb_vec_sim_0.csv", index_col=0)

In [25]:
df[df['Family_name'].isna()]

Unnamed: 0,ID,Name,Macroarea,Latitude,Longitude,Glottocode,ISO639P3code,provenance,Family_name,Family_level_ID,Language_level_ID,level,lineage
9,abun1252,Abun,Papunesia,-0.57073,132.4160,abun1252,,JLA_abun1252.tsv,,,abun1252,language,
37,aika1237,Aikanã,South America,-12.66950,-60.5353,aika1237,,MM_aika1237.tsv,,,aika1237,language,
62,alse1251,Alsea-Yaquina,North America,44.40800,-123.9400,alse1251,,JLA_alse1251.tsv,,,alse1251,language,
80,ando1256,Andoque,South America,-0.53751,-72.0869,ando1256,,CB-PE-AS_ando1256.tsv,,,ando1256,language,
82,anem1249,Anem,Papunesia,-5.54836,148.9930,anem1249,,MD-GR-RSI_anem1249.tsv,,,anem1249,language,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2388,yana1271,Yana,North America,39.27690,-121.6080,yana1271,,JLA_yana1271.tsv,,,yana1271,language,
2412,yele1255,Yele,Papunesia,-11.37960,154.1270,yele1255,,MD-GR-RSI_yele1255.tsv,,,yele1255,language,
2434,yuch1247,Yuchi,North America,35.75000,-86.7500,yuch1247,,GB_yuch1247.tsv,,,yuch1247,language,
2442,yura1255,Yuracaré,South America,-16.74790,-65.1224,yura1255,,JLA_yura1255.tsv,,,yura1255,language,


In [26]:
in_family_count, out_family_count = 0, 0

df_with_fam = df[df["Family_name"].notnull()]

for lang_a, row in dists.iterrows():
    # filter out same language and NaN errors
    langs = row[row == row[row > 0].min()].index
    # go through all closest languages (min can be the same value for multiple)
    for lang_b in langs:
        fam_a = df_with_fam[df_with_fam["ID"] == lang_a]["Family_name"].tolist()
        fam_b = df_with_fam[df_with_fam["ID"] == lang_b]["Family_name"].tolist()
        if set(fam_a) == set(fam_b):
            in_family_count += 1
        else:
            out_family_count += 1

total_comparisons = in_family_count + out_family_count
print(f"{in_family_count=}, {out_family_count=}, {total_comparisons=}")

in_family_count=1625, out_family_count=3386, total_comparisons=5011


In [27]:
def get_gb_feature_cov(gb, treat_as_missing):
    max_coverage_per_feature = dict()
    for feature_name, series in gb.items():
        if not feature_name.startswith("GB"):
            continue
        max_coverage_per_feature[feature_name] = {
            i for i in series.unique() if i not in treat_as_missing
        }
    return max_coverage_per_feature

In [28]:
GB_FEATURE_MISSING = {"no_cov", "?"}
# get all possible values per feature (excluding missing)
gb_max_coverage = get_gb_feature_cov(gb, GB_FEATURE_MISSING)

In [29]:
gb_by_lang = {i: np.array(row) for i, row in gb.iterrows()}

In [30]:
def calc_feature_val_overlap(langs):
    if len(langs) < 2:
        # print(f"ISOLATE ALERT: {langs}")
        return 0

    # get all pairs (no repetition, order does not matter)
    comb = itertools.combinations(langs, 2)
    all_overlap = []
    for lang_a, lang_b in comb:
        # this is an element-wise boolean array, true is 1
        # TODO: what do missing / no cov values mean here?
        all_overlap.append((gb_by_lang[lang_a] == gb_by_lang[lang_b]).mean())
        # for a, b in zip(gb_by_lang[lang_a], gb_by_lang[lang_b]):
        # if (a in GB_FEATURE_MISSING )or (b in GB_FEATURE_MISSING):
        # continue
        # all_overlap.append(int(a == b))

    return sum(all_overlap) / len(all_overlap)

In [56]:
results = pd.DataFrame()

for group, members in df.groupby("Family_name")["Glottocode"]:
    g_codes = members.tolist()
    # print(group)
    # sns.heatmap(
    #     gb[gb.index.isin(g_codes)].transpose(),
    #     ax=ax,
    #     # cmap="coolwarm",
    #     cbar_kws={"shrink": 0.3, "location": "top", "orientation": "horizontal"},
    # )
    # break

    avg_overlap = calc_feature_val_overlap(g_codes)
    results.loc[group, "avg_overlap"] = avg_overlap
    results.loc[group, "members"] = ",".join(g_codes)
    results.loc[group, "n_members"] = len(g_codes)
    results.loc[group, "method"] = "family"

for group, members in df_w.groupby("Genus")["Glottocode"]:
    # filter out languages that are not in grambank
    g_codes = [m for m in members.tolist() if m in gb_by_lang]

    avg_overlap = calc_feature_val_overlap(g_codes)
    results.loc[group, "avg_overlap"] = avg_overlap
    results.loc[group, "members"] = ",".join(g_codes)
    results.loc[group, "n_members"] = len(g_codes)
    results.loc[group, "method"] = "genus"

In [57]:
results

Unnamed: 0,avg_overlap,members,n_members,method
Abkhaz-Adyge,0.632479,"abkh1244,kaba1278,ubyk1235",3.0,family
Afro-Asiatic,0.455744,"afad1236,afar1241,akka1240,alab1254,alag1248,a...",120.0,family
Ainu,0.000000,ainu1240,1.0,genus
Algic,0.614444,"arap1274,chey1247,chip1241,gros1243,loup1243,m...",16.0,family
Amto-Musan,0.000000,siaw1243,1.0,family
...,...,...,...,...
Yuruna,0.000000,,0.0,genus
Yámana,0.000000,yama1264,1.0,genus
Zapotecan,0.636610,"tata1258,sant1451,nopa1235,zoog1238,sanj1284,y...",9.0,genus
Zemeic,0.000000,zeme1240,1.0,genus


In [58]:
# Average GB feature value overlap within families or genera
alt.Chart(results[results['avg_overlap'] > 0]).mark_bar().encode(
    alt.X("avg_overlap:Q", bin=True),
    y='count()',
    color='method',
)

In [75]:
# Average GB feature value overlap within families or genera
ch = alt.Chart(
    results[(results["avg_overlap"] > 0) & (results["method"] == "family")].reset_index()
)

point = ch.mark_point(color='#DA7A77').encode(
    alt.X("index").sort("-y"),
    y="avg_overlap",
)
bar = ch.mark_bar(color='#134C86', opacity=0.5).encode(
    x=alt.X("index"),
    y="n_members",
)

alt.layer(point, bar).resolve_scale(y='independent')