# ATDS vs. SpeechBrain vs. lang2vec

In [1]:
import torch
import pandas as pd

from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


## Compute similarities according to different measures

Load in relative word error rates (first column in paper table)

In [2]:
# Relative word error rates for each continued-pre-training (CPT) run.
indic_cpt_werrs = pd.read_csv("/workspace/data/artefacts/ATDS/indic_werr.csv")

# Lower-case the target language so it matches the lower-case donor names
# derived below.
indic_cpt_werrs["target_lang"] = indic_cpt_werrs["target_lang"].str.lower()

# Recover the donor language from the run name, e.g.
# "punjabi-10h_bengali-60h-seed-1" -> "bengali".
# The prefix is removed as a literal; the seed suffix needs a regex, written as
# a raw string so that "\d" is not treated as an (invalid) string escape.
indic_cpt_werrs["donor_lang"] = (
    indic_cpt_werrs["CPT_data"]
    .str.replace("punjabi-10h_", "", regex=False)
    .str.replace(r"-60h-seed-\d", "", regex=True)
)

indic_cpt_werrs = indic_cpt_werrs[["target_lang", "donor_lang", "CPT_data", "percent_werr"]]

# Collapse the per-run rows to one median value per (target, donor) pair.
indic_cpt_werrs_medians = (
    indic_cpt_werrs
    .groupby(["target_lang", "donor_lang"])
    .agg({"percent_werr": "median"})
    .reset_index()
)

indic_cpt_werrs_medians

Unnamed: 0,target_lang,donor_lang,percent_werr
0,punjabi,bengali,-0.8
1,punjabi,gujarati,2.4
2,punjabi,hindi,6.0
3,punjabi,malayalam,-0.4
4,punjabi,marathi,1.6
5,punjabi,odia,0.0
6,punjabi,tamil,-0.4
7,punjabi,urdu,2.4


Add ATDS similarities and sort by ATDS

In [3]:
# ATDS scores, stored alongside the WERR artefacts.
indic_atds_sims = pd.read_csv("/workspace/data/artefacts/ATDS/indic_atds.csv")

# Left join keeps every row of the medians table; the join keys are whatever
# columns the two frames share (pandas' default). Highest ATDS comes first.
indic_cpt_werrs_medians = pd.merge(
    indic_cpt_werrs_medians,
    indic_atds_sims,
    how='left',
).sort_values(by='atds', ascending=False)

indic_cpt_werrs_medians

Unnamed: 0,target_lang,donor_lang,percent_werr,atds
2,punjabi,hindi,6.0,0.96
1,punjabi,gujarati,2.4,0.93
7,punjabi,urdu,2.4,0.93
4,punjabi,marathi,1.6,0.92
0,punjabi,bengali,-0.8,0.9
3,punjabi,malayalam,-0.4,0.89
5,punjabi,odia,0.0,0.87
6,punjabi,tamil,-0.4,0.86


Add SpeechBrain similarities

In [4]:
def compute_speechbrain_similarities(
    embeddings_dir='/workspace/data/artefacts/ATDS/embeddings_speechbrain-lang-id',
    output_csv='/workspace/data/artefacts/ATDS/indic_speechbrain-sims.csv',
):
    """Compute pairwise cosine similarities between per-language embeddings.

    Every ``*.pt`` file in ``embeddings_dir`` is loaded and averaged over its
    first dimension to give one vector per file. The language name is the part
    of the file stem before the first underscore. The vectors are L2-normalised,
    so their dot products are cosine similarities.

    Args:
        embeddings_dir: Directory containing the ``*.pt`` embedding files.
        output_csv: Where to write the resulting table, or ``None`` to skip
            writing.

    Returns:
        A DataFrame with columns ``target_lang``, ``donor_lang`` and ``SB_sim``
        (rounded to 2 decimals), one row per ordered pair of files, including
        each file paired with itself.

    Raises:
        FileNotFoundError: If ``embeddings_dir`` contains no ``*.pt`` files.
    """
    # Sort so that row order does not depend on filesystem listing order.
    paths = sorted(Path(embeddings_dir).glob('*.pt'))
    if not paths:
        raise FileNotFoundError(f"No '*.pt' embedding files found in {embeddings_dir}")

    langs = [path.stem.split('_')[0] for path in paths]
    # map_location='cpu' lets tensors saved on a GPU load on a CPU-only machine.
    # NOTE(review): assumes each file holds a 2-D tensor (items x dim) - confirm.
    embs = torch.stack([torch.load(path, map_location='cpu').mean(0) for path in paths])
    embs = torch.nn.functional.normalize(embs, dim=1)
    sims = (embs @ embs.T).tolist()

    records = [
        (target, donor, round(sim, 2))
        for target, row in zip(langs, sims)
        for donor, sim in zip(langs, row)
    ]
    df = pd.DataFrame(records, columns=['target_lang', 'donor_lang', 'SB_sim'])

    if output_csv is not None:
        df.to_csv(output_csv, index=False)
    return df

# Pairwise SpeechBrain similarities for all language pairs; the call also
# writes the table to CSV as a side effect.
sb_sims = compute_speechbrain_similarities()

In [5]:
# Attach the SpeechBrain similarity to each (target, donor) row. The left join
# keeps every row already in the table and matches on the shared columns.
indic_cpt_werrs_medians = pd.merge(indic_cpt_werrs_medians, sb_sims, how='left')

indic_cpt_werrs_medians

Unnamed: 0,target_lang,donor_lang,percent_werr,atds,SB_sim
0,punjabi,hindi,6.0,0.96,0.96
1,punjabi,gujarati,2.4,0.93,0.82
2,punjabi,urdu,2.4,0.93,0.88
3,punjabi,marathi,1.6,0.92,0.89
4,punjabi,bengali,-0.8,0.9,0.81
5,punjabi,malayalam,-0.4,0.89,0.83
6,punjabi,odia,0.0,0.87,0.71
7,punjabi,tamil,-0.4,0.86,0.76


Add lang2vec similarities

In [6]:
def add_lang2vec_sims(df):
    """Add one lang2vec-based similarity column per distance measure to ``df``.

    For every row, each lang2vec distance between the row's ``target_lang`` and
    ``donor_lang`` is converted to a similarity as ``1 - distance`` and rounded
    to 2 decimals. Each new column is named after the lang2vec function that
    produced it (e.g. ``syntactic_distance``).

    ``df`` is modified in place and also returned. Language names missing from
    the code table below raise ``KeyError``.
    """
    import lang2vec.lang2vec as l2v

    # Language name (as used in the data) -> code passed to lang2vec.
    iso_codes = dict(
        tamil='tam',
        malayalam='mal',
        urdu='urd',
        gujarati='guj',
        odia='ori',
        marathi='mar',
        bengali='ben',
        punjabi='pan',
        hindi='hin',
    )

    distance_measures = (
        l2v.syntactic_distance,
        l2v.geographic_distance,
        l2v.featural_distance,
        l2v.inventory_distance,
        l2v.genetic_distance,
        l2v.phonological_distance,
    )

    for measure in distance_measures:
        column = measure.__name__

        def similarity(row, measure=measure):
            # Similarity is the complement of the lang2vec distance.
            return 1 - measure(iso_codes[row.target_lang], iso_codes[row.donor_lang])

        df[column] = df.apply(similarity, axis=1)
        df[column] = df[column].apply(lambda value: round(value, 2))

    return df

# Add the lang2vec columns, then rename them from "*_distance" to "*_sim".
indic_cpt_werrs_medians = add_lang2vec_sims(indic_cpt_werrs_medians)
indic_cpt_werrs_medians.columns = indic_cpt_werrs_medians.columns.str.replace(
    '_distance', '_sim', regex=False
)

indic_cpt_werrs_medians

Unnamed: 0,target_lang,donor_lang,percent_werr,atds,SB_sim,syntactic_sim,geographic_sim,featural_sim,inventory_sim,genetic_sim,phonological_sim
0,punjabi,hindi,6.0,0.96,0.96,0.67,1.0,0.6,0.67,0.38,0.41
1,punjabi,gujarati,2.4,0.93,0.82,0.46,1.0,0.6,0.72,0.43,1.0
2,punjabi,urdu,2.4,0.93,0.88,0.51,0.9,0.6,0.67,0.43,1.0
3,punjabi,marathi,1.6,0.92,0.89,0.47,0.9,0.6,0.65,0.43,1.0
4,punjabi,bengali,-0.8,0.9,0.81,0.47,0.9,0.5,0.66,0.38,0.38
5,punjabi,malayalam,-0.4,0.89,0.83,0.32,0.9,0.5,0.64,0.0,1.0
6,punjabi,odia,0.0,0.87,0.71,0.32,0.9,0.5,0.65,0.43,1.0
7,punjabi,tamil,-0.4,0.86,0.76,0.47,0.9,0.5,0.59,0.0,1.0


## Calculate correlations

In [7]:
from scipy import stats

# Attach the similarity columns to every row of the per-run table.
# The median percent_werr is dropped first so that the per-run percent_werr is
# kept and is not used as a join key.
corr_df = pd.merge(
    indic_cpt_werrs,
    indic_cpt_werrs_medians.drop(columns=['percent_werr']),
    how='left',
)

corr_df

Unnamed: 0,target_lang,donor_lang,CPT_data,percent_werr,atds,SB_sim,syntactic_sim,geographic_sim,featural_sim,inventory_sim,genetic_sim,phonological_sim
0,punjabi,bengali,punjabi-10h_bengali-60h-seed-1,-0.4,0.9,0.81,0.47,0.9,0.5,0.66,0.38,0.38
1,punjabi,bengali,punjabi-10h_bengali-60h-seed-2,-0.8,0.9,0.81,0.47,0.9,0.5,0.66,0.38,0.38
2,punjabi,bengali,punjabi-10h_bengali-60h-seed-3,-0.8,0.9,0.81,0.47,0.9,0.5,0.66,0.38,0.38
3,punjabi,gujarati,punjabi-10h_gujarati-60h-seed-1,2.4,0.93,0.82,0.46,1.0,0.6,0.72,0.43,1.0
4,punjabi,gujarati,punjabi-10h_gujarati-60h-seed-2,2.4,0.93,0.82,0.46,1.0,0.6,0.72,0.43,1.0
5,punjabi,gujarati,punjabi-10h_gujarati-60h-seed-3,3.2,0.93,0.82,0.46,1.0,0.6,0.72,0.43,1.0
6,punjabi,hindi,punjabi-10h_hindi-60h-seed-1,6.4,0.96,0.96,0.67,1.0,0.6,0.67,0.38,0.41
7,punjabi,hindi,punjabi-10h_hindi-60h-seed-2,4.8,0.96,0.96,0.67,1.0,0.6,0.67,0.38,0.41
8,punjabi,hindi,punjabi-10h_hindi-60h-seed-3,6.0,0.96,0.96,0.67,1.0,0.6,0.67,0.38,0.41
9,punjabi,malayalam,punjabi-10h_malayalam-60h-seed-1,-0.4,0.89,0.83,0.32,0.9,0.5,0.64,0.0,1.0


In [8]:
# Pearson correlation (rounded to 2 decimals) between each similarity column
# and the relative WER. Similarity columns are 'atds' plus any column whose
# name ends in 'similarity' or 'sim'.
records = [
    (column, round(stats.pearsonr(x=corr_df[column], y=corr_df.percent_werr)[0], 2))
    for column in corr_df.columns
    if column == 'atds' or column.endswith(('similarity', 'sim'))
]

corr_results_df = pd.DataFrame.from_records(
    records, columns=['column', 'r']
).sort_values(by='r', ascending=False)

corr_results_df

Unnamed: 0,column,r
0,atds,0.89
4,featural_sim,0.83
2,syntactic_sim,0.79
1,SB_sim,0.78
3,geographic_sim,0.77
5,inventory_sim,0.55
6,genetic_sim,0.48
7,phonological_sim,-0.31


Add the correlations as a bottom row to match the table reported in the paper

In [18]:
# DataFrame.append was removed in pandas 2.0, so the correlation row is built
# as a one-row frame and concatenated instead. Columns that have no
# correlation value are left as NaN in the new bottom row.
pd.concat(
    [
        indic_cpt_werrs_medians,
        pd.DataFrame([dict(zip(corr_results_df['column'], corr_results_df['r']))]),
    ],
    ignore_index=True,
)

Unnamed: 0,target_lang,donor_lang,percent_werr,atds,SB_sim,syntactic_sim,geographic_sim,featural_sim,inventory_sim,genetic_sim,phonological_sim
0,punjabi,hindi,6.0,0.96,0.96,0.67,1.0,0.6,0.67,0.38,0.41
1,punjabi,gujarati,2.4,0.93,0.82,0.46,1.0,0.6,0.72,0.43,1.0
2,punjabi,urdu,2.4,0.93,0.88,0.51,0.9,0.6,0.67,0.43,1.0
3,punjabi,marathi,1.6,0.92,0.89,0.47,0.9,0.6,0.65,0.43,1.0
4,punjabi,bengali,-0.8,0.9,0.81,0.47,0.9,0.5,0.66,0.38,0.38
5,punjabi,malayalam,-0.4,0.89,0.83,0.32,0.9,0.5,0.64,0.0,1.0
6,punjabi,odia,0.0,0.87,0.71,0.32,0.9,0.5,0.65,0.43,1.0
7,punjabi,tamil,-0.4,0.86,0.76,0.47,0.9,0.5,0.59,0.0,1.0
8,,,,0.89,0.78,0.79,0.77,0.83,0.55,0.48,-0.31
