In [13]:
import pandas as pd

# Load the IMTVault CLDF example table and discard the columns that are not
# used anywhere below.
examples = pd.read_csv("./imtvault/cldf/examples.csv")
examples = examples.drop(columns=['Primary_Text', 'Corpus_Reference', 'Abbreviations', 'Source', 'Contribution_ID', 'Comment', "Language_Name"])


def _replace_literals(column, replacements):
    """Apply (old, new) substring replacements to a string column, in order.

    Every pattern is matched literally (``regex=False``): several of them
    ("?", ".", "[") are regex metacharacters, and the pandas default for
    ``regex`` differs between major versions, so it is spelled out here.

    Args:
        column: a pandas Series of strings (missing values stay missing).
        replacements: iterable of ``(old, new)`` pairs, applied first to last.

    Returns:
        A new Series with all replacements applied.
    """
    for old, new in replacements:
        column = column.str.replace(old, new, regex=False)
    return column


# Transcription line: tabs become spaces, sentence punctuation is split off
# with a leading space, square brackets are blanked, and a hyphen that is
# followed by a space is dropped. The order is significant (punctuation is
# spaced out before the "- " rule runs), so keep it as is.
examples['Analyzed_Word'] = _replace_literals(
    examples['Analyzed_Word'],
    [
        ('\t', ' '),
        ("?", " ?"),
        (".", " ."),
        ("!", " !"),
        (",", " ,"),
        ("[", " "),
        ("]", " "),
        ("- ", " "),
    ],
)

# Gloss line: same treatment minus the punctuation spacing.
examples['Gloss'] = _replace_literals(
    examples['Gloss'],
    [
        ("\t", " "),
        ("[", " "),
        ("]", " "),
        ("- ", " "),
    ],
)

examples = examples.rename(columns={"Language_ID": "glottocode",
                                    "Analyzed_Word": "transcription",
                                    "Gloss": "glosses",
                                    "Translated_Text": "translation",
                                    "Meta_Language_ID": "metalang_glottocode"})

# Alternative criterion, not used:
# examples["is_segmented"] = examples["LGR_Conformance_Level"] == "LGRConformance.MORPHEME_ALIGNED"

# A row is flagged "yes" when its transcription contains a hyphen and "" otherwise.
# na=False makes rows with a missing transcription come out as "" rather than
# leaving NaN in a column that otherwise only holds "" / "yes".
has_hyphen = examples["transcription"].str.contains("-", regex=False, na=False)
examples["is_segmented"] = has_hyphen.map({True: "yes", False: ""})

examples

Unnamed: 0,ID,glottocode,transcription,glosses,translation,metalang_glottocode,LGR_Conformance_Level,is_segmented
0,glossa4813-1,dogr1252,chekoa edı,child warm/feverish,*Chekoa edı.,stan1293,LGRConformance.MORPHEME_ALIGNED,
1,glossa4813-2,dogr1252,ı̨xę̀ę edı,yesterday warm/feverish,Yesterday was warm.’2 (MLBW 2009),stan1293,LGRConformance.MORPHEME_ALIGNED,
2,glossa4813-3,dogr1252,ı̨xę̀ę edı Ø-lı̨,yesterday warm/feverish IPFV.3.SBJ-COP.IPFV,(Intended: ‘Yesterday was warm.’) (MLBW 2009)3,stan1293,LGRConformance.MORPHEME_ALIGNED,yes
3,glossa4813-4,dogr1252,naxı sınì ya ts’e e Ø h tı,2PL.OBJ THM THM 1PL.SBJ CJ IPFV CLF speak.IPFV,We judge you.’ (MLBW 2012),stan1293,LGRConformance.WORD_ALIGNED,
4,glossa4813-5,dogr1252,naxı sınìya ts’eeh tı,2PL.OBJ THM IPFV1PL.SBJ judge.IPFV,We are judging you.’ (MLBW 2012),stan1293,LGRConformance.WORD_ALIGNED,
...,...,...,...,...,...,...,...,...
76440,langsci369-b18554990e,undefined,"Agua que no has de beber NP , déjala correr...",Water that no you-must to drink NP let-it f...,Don’t be the dog in the manger,stan1293,LGRConformance.WORD_ALIGNED,
76441,langsci369-e70871cdea,undefined,Dime VP cuanto tienes AdvP y Conj te di...,Tell-me VP how-much you-have AdvP and Con...,What you own determines your worth,stan1293,LGRConformance.WORD_ALIGNED,
76442,langsci369-3ca248f417,undefined,Como la gata flora: si se la ponen gritan y si...,Like the cat flora: if to-her it they-give the...,There’s no pleasing,stan1293,LGRConformance.WORD_ALIGNED,
76443,langsci369-fd90d1e862,undefined,"Como la gata flora , que si se la metes chilla...",Like the cat flora: that if to-her it you-put-...,There’s no pleasing,stan1293,LGRConformance.WORD_ALIGNED,


In [14]:
import langid
from tqdm.notebook import tqdm

tqdm.pandas()

# Maps the language codes returned by langid to the Glottocode expected in
# 'metalang_glottocode'. Only the metalanguages listed here can be confirmed;
# any other prediction is treated as a mismatch.
iso_to_glotto = {
    "en": "stan1293",
    "fr": "stan1290",
    "de": "stan1295",
    "pt": "port1283",
    "zh": "mand1415",
    "es": "stan1288"
}

# Snapshots of every row whose translation was rejected, as it looked BEFORE
# its fields were cleared.
bad_transl = []

def compare_pred_lang(row):
    """Blank out a row's translation if it does not look like its metalanguage.

    A row is rejected when its 'translation' is not a string (e.g. NaN), when
    the language predicted by ``langid.classify`` has no entry in
    ``iso_to_glotto``, or when the mapped Glottocode differs from the row's
    'metalang_glottocode'.

    Args:
        row: a pandas Series with 'translation' and 'metalang_glottocode' entries.

    Returns:
        The same ``row`` object. For rejected rows, 'translation' and
        'metalang_glottocode' are set to '' and a copy of the original row is
        appended to the module-level ``bad_transl`` list.
    """
    translation = row['translation']

    rejected = not isinstance(translation, str)
    if not rejected:
        pred_lang = langid.classify(translation)[0]
        expected_glottocode = iso_to_glotto.get(pred_lang)
        rejected = (
            expected_glottocode is None
            or expected_glottocode != row['metalang_glottocode']
        )

    if rejected:
        # Store a copy: the row itself is cleared right below, and appending
        # the same object would leave bad_transl holding only blanked rows.
        bad_transl.append(row.copy())
        row['metalang_glottocode'] = ''
        row['translation'] = ''
    return row
    

# Run compare_pred_lang on every row (axis=1) and keep the returned rows.
# Rows it rejects come back with 'translation' and 'metalang_glottocode' set
# to '' and are also recorded in bad_transl.
# progress_apply is presumably the progress-bar variant of DataFrame.apply
# registered by the tqdm.pandas() call earlier in this cell.
examples = examples.progress_apply(compare_pred_lang, axis=1)

print(f"Found {len(bad_transl)} broken translation instances")
examples

  0%|          | 0/76445 [00:00<?, ?it/s]

Found 6347 broken translation instances


Unnamed: 0,ID,glottocode,transcription,glosses,translation,metalang_glottocode,LGR_Conformance_Level,is_segmented
0,glossa4813-1,dogr1252,chekoa edı,child warm/feverish,,,LGRConformance.MORPHEME_ALIGNED,
1,glossa4813-2,dogr1252,ı̨xę̀ę edı,yesterday warm/feverish,Yesterday was warm.’2 (MLBW 2009),stan1293,LGRConformance.MORPHEME_ALIGNED,
2,glossa4813-3,dogr1252,ı̨xę̀ę edı Ø-lı̨,yesterday warm/feverish IPFV.3.SBJ-COP.IPFV,(Intended: ‘Yesterday was warm.’) (MLBW 2009)3,stan1293,LGRConformance.MORPHEME_ALIGNED,yes
3,glossa4813-4,dogr1252,naxı sınì ya ts’e e Ø h tı,2PL.OBJ THM THM 1PL.SBJ CJ IPFV CLF speak.IPFV,,,LGRConformance.WORD_ALIGNED,
4,glossa4813-5,dogr1252,naxı sınìya ts’eeh tı,2PL.OBJ THM IPFV1PL.SBJ judge.IPFV,We are judging you.’ (MLBW 2012),stan1293,LGRConformance.WORD_ALIGNED,
...,...,...,...,...,...,...,...,...
76440,langsci369-b18554990e,undefined,"Agua que no has de beber NP , déjala correr...",Water that no you-must to drink NP let-it f...,Don’t be the dog in the manger,stan1293,LGRConformance.WORD_ALIGNED,
76441,langsci369-e70871cdea,undefined,Dime VP cuanto tienes AdvP y Conj te di...,Tell-me VP how-much you-have AdvP and Con...,What you own determines your worth,stan1293,LGRConformance.WORD_ALIGNED,
76442,langsci369-3ca248f417,undefined,Como la gata flora: si se la ponen gritan y si...,Like the cat flora: if to-her it they-give the...,There’s no pleasing,stan1293,LGRConformance.WORD_ALIGNED,
76443,langsci369-fd90d1e862,undefined,"Como la gata flora , que si se la metes chilla...",Like the cat flora: that if to-her it you-put-...,There’s no pleasing,stan1293,LGRConformance.WORD_ALIGNED,


In [15]:
# For every row flagged as segmented, derive a second, unsegmented variant:
# same row, hyphens stripped from the transcription, is_segmented set to "no".
segmented_mask = examples['is_segmented'] == "yes"
unsegmented_examples = examples.loc[segmented_mask].copy()
unsegmented_examples = unsegmented_examples.assign(
    is_segmented="no",
    transcription=unsegmented_examples["transcription"].str.replace("-", ""),
)
unsegmented_examples

Unnamed: 0,ID,glottocode,transcription,glosses,translation,metalang_glottocode,LGR_Conformance_Level,is_segmented
2,glossa4813-3,dogr1252,ı̨xę̀ę edı Ølı̨,yesterday warm/feverish IPFV.3.SBJ-COP.IPFV,(Intended: ‘Yesterday was warm.’) (MLBW 2009)3,stan1293,LGRConformance.MORPHEME_ALIGNED,no
5,glossa4813-6,dogr1252,ı̨xę̀ę eya ı̨lè,yesterday sick/painful,,,LGRConformance.WORD_ALIGNED,no
7,glossa4813-8,dogr1252,neyatıì ehkw’ı ha hǫt’e,2SG-word-PNS correct FUT FOC,Your words must be correct.’ (MLBW 2012)13,stan1293,LGRConformance.MORPHEME_ALIGNED,no
8,glossa4813-9,dogr1252,etedeht’ı̨,THM--be.poor/pitiful.IPFV,I am poor.’ (MLBW 2009),stan1293,LGRConformance.WORD_ALIGNED,no
9,glossa4813-10,dogr1252,tł’àɂeh ełèaek’à,pants RECP-THM--wrinkle.,The pants are wrinkled.’ (TCSA 2007),stan1293,LGRConformance.WORD_ALIGNED,no
...,...,...,...,...,...,...,...,...
76394,langsci369-a0628860cd,undefined,Nu şia ajutat pe nimeni dintre ai săi .,NEG CL.3SG.DAT-has helped LOC=DOM nobody from ...,Intended: `He hasn't helped anybody of his.' *...,stan1293,LGRConformance.WORD_ALIGNED,no
76395,langsci369-d018caeb9c,undefined,Şii/*mii(l) ajută pe prieteni .,CL.3SG.REFL.DAT/1SG.DAT-CL.3M.SG.ACC help.3SG ...,Intended: `He is helping his own/my friend.,stan1293,LGRConformance.WORD_ALIGNED,no
76396,langsci369-d9f31a29fa,undefined,"Pe prietenii , Ion şiii ajută .","LOC=DOM friends, Ion CL.DAT.3SG.REFL-CL.3M.PL....","His own friends, Ion helps them.",stan1293,LGRConformance.WORD_ALIGNED,no
76397,langsci369-0687fce7d6,undefined,Nu şiia trimis pe nimeni*i în ajutori .,NEG CL.3SG.REFL.DAT-has sent LOC=DOM nobody in...,Lit. `He hasn't sent anybody to/as his own aid.,stan1293,LGRConformance.WORD_ALIGNED,no


In [16]:
# Stack the original rows and their unsegmented variants, tag the origin, and
# drop the conformance column.
all_examples = (
    pd.concat([examples, unsegmented_examples])
    .assign(source="imtvault")
    .drop(columns=["LGR_Conformance_Level"])
)

# 'undefined' is the placeholder for an unknown language; store it as an empty string.
undefined_mask = all_examples['glottocode'] == 'undefined'
all_examples.loc[undefined_mask, 'glottocode'] = ''
all_examples

Unnamed: 0,ID,glottocode,transcription,glosses,translation,metalang_glottocode,is_segmented,source
0,glossa4813-1,dogr1252,chekoa edı,child warm/feverish,,,,imtvault
1,glossa4813-2,dogr1252,ı̨xę̀ę edı,yesterday warm/feverish,Yesterday was warm.’2 (MLBW 2009),stan1293,,imtvault
2,glossa4813-3,dogr1252,ı̨xę̀ę edı Ø-lı̨,yesterday warm/feverish IPFV.3.SBJ-COP.IPFV,(Intended: ‘Yesterday was warm.’) (MLBW 2009)3,stan1293,yes,imtvault
3,glossa4813-4,dogr1252,naxı sınì ya ts’e e Ø h tı,2PL.OBJ THM THM 1PL.SBJ CJ IPFV CLF speak.IPFV,,,,imtvault
4,glossa4813-5,dogr1252,naxı sınìya ts’eeh tı,2PL.OBJ THM IPFV1PL.SBJ judge.IPFV,We are judging you.’ (MLBW 2012),stan1293,,imtvault
...,...,...,...,...,...,...,...,...
76394,langsci369-a0628860cd,,Nu şia ajutat pe nimeni dintre ai săi .,NEG CL.3SG.DAT-has helped LOC=DOM nobody from ...,Intended: `He hasn't helped anybody of his.' *...,stan1293,no,imtvault
76395,langsci369-d018caeb9c,,Şii/*mii(l) ajută pe prieteni .,CL.3SG.REFL.DAT/1SG.DAT-CL.3M.SG.ACC help.3SG ...,Intended: `He is helping his own/my friend.,stan1293,no,imtvault
76396,langsci369-d9f31a29fa,,"Pe prietenii , Ion şiii ajută .","LOC=DOM friends, Ion CL.DAT.3SG.REFL-CL.3M.PL....","His own friends, Ion helps them.",stan1293,no,imtvault
76397,langsci369-0687fce7d6,,Nu şiia trimis pe nimeni*i în ajutori .,NEG CL.3SG.REFL.DAT-has sent LOC=DOM nobody in...,Lit. `He hasn't sent anybody to/as his own aid.,stan1293,no,imtvault


In [5]:
import datasets

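# Disabled alternative: build `ds` from `all_examples` alone instead of merging with the data already on the hub.
# ds = datasets.Dataset.from_pandas(all_examples).remove_columns(["__index_level_0__"])
# ds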

In [22]:
# NOTE(review): `ds` is not assigned anywhere above this cell (the only earlier
# assignment is commented out); it is assigned in the later "replace the data
# on HF" cell, so this push only works after that cell has been run.
ds.push_to_hub("lecslab/glosslm")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/633 [00:00<?, ?B/s]

Updating downloaded metadata with the new split.


In [18]:
# run this cell to replace the data on HF
# Fetch the published train split, discard its existing IMTVault rows, append
# the freshly built ones, and push the result back to the same repository.
hub_train = datasets.load_dataset("lecslab/glosslm")['train']
old_data = hub_train.to_pandas()
old_data = old_data.loc[old_data['source'] != "imtvault"]
a = pd.concat([old_data, all_examples])
ds = datasets.Dataset.from_pandas(a)
# The concatenated frame's index is carried over as an extra column; remove it.
ds = ds.remove_columns(["__index_level_0__"])
ds.push_to_hub("lecslab/glosslm")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/452 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]