In [1]:
import datasets

data = datasets.load_dataset("lecslab/glosslm", download_mode="force_redownload", verification_mode="no_checks")
data

Downloading readme:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/31.0M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/313900 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['transcription', 'glosses', 'translation', 'glottocode', 'id', 'source', 'metalang_glottocode', 'is_segmented'],
        num_rows: 451108
    })
})

In [2]:
# Map glottocodes to languages

from tqdm import tqdm
from pyglottolog import Glottolog
glottolog = Glottolog('../../glottolog')

all_glottocodes = set(data['train'].unique('glottocode'))
glottocode_mapping = dict()
for code in tqdm(all_glottocodes):
    languoid = glottolog.languoid(code) if code != '' and code is not None else None
    glottocode_mapping[code] = languoid.name if languoid else 'Unknown language'

glottocode_mapping

100%|██████████| 1786/1786 [11:03<00:00,  2.69it/s]


{'': 'Unknown language',
 'uraa1244': 'Ura',
 'midd1316': 'Middle French',
 'ghan1244': 'Ghanaian Pidgin English',
 'tsat1238': 'Tsat',
 'tedi1235': 'Tedim Chin',
 'bena1262': 'Bena (Tanzania)',
 'yawu1244': 'Yawuru',
 'pila1245': 'Pilagá',
 'bafu1246': 'Bafut',
 'turk1303': 'Khalaj Turkic',
 'gras1239': 'Grassland Mari',
 'otta1242': 'Ottawa',
 'east2346': 'Eastern Gorkha Tamang',
 'siwi1239': 'Siwi',
 'sain1246': 'Saint Lucian Creole French',
 'kitu1246': 'Kituba (Democratic Republic of Congo)',
 'oldp1257': 'Old Portuguese',
 'panj1256': 'Eastern Panjabi',
 'torr1259': 'Aro',
 'nsen1242': 'Nsenga',
 'beem1239': 'Beembe',
 'muru1266': 'Muruwari',
 'rovi1238': 'Roviana',
 'shix1238': 'Shixing',
 'abkh1244': 'Abkhaz',
 'shiw1234': 'Shiwa',
 'pany1241': 'Panytyima',
 'mala1544': 'Malabar-Sri Lanka Portuguese',
 'guru1261': 'Gurung',
 'pulu1242': 'Puluwatese',
 'guil1236': 'Güilá Zapotec',
 'jehh1245': 'Jeh',
 'nezp1238': 'Nez Perce',
 'khan1278': 'Khana',
 'yine1238': 'Yine',
 'nucl1302

In [3]:
def map_glottocodes(row):
    row['language'] = glottocode_mapping[row['glottocode']]
    row['metalang'] = glottocode_mapping[row['metalang_glottocode']]
    if row['metalang'] == 'Unknown language':
        row['metalang'] = ''
    return row

data = data.map(map_glottocodes)

Map:   0%|          | 0/451108 [00:00<?, ? examples/s]

In [4]:
data.push_to_hub('lecslab/glosslm')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/452 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/558 [00:00<?, ?B/s]

In [5]:
df = data['train'].to_pandas()

in_dist = ['arap1274', 'uspa1245', 'dido1241']
oo_dist = ['nyan1302', 'natu1246', 'lezg1247', 'gitx1241']

mask_in_dist_train = (df['source'] == 'sigmorphon_st') & df['glottocode'].isin(in_dist) & (df['id'].str.contains('train'))
mask_in_dist_eval = (df['source'] == 'sigmorphon_st') & df['glottocode'].isin(in_dist) & (df['id'].str.contains('dev'))
mask_in_dist_test = (df['source'] == 'sigmorphon_st') & df['glottocode'].isin(in_dist) & (df['id'].str.contains('test'))

mask_ood_dist_train = (df['glottocode'].isin(oo_dist) & ((df['id'].str.contains('train')) | (df['source'] != 'sigmorphon_st')))
mask_ood_dist_eval = df['glottocode'].isin(oo_dist) & (df['id'].str.contains('dev'))
mask_ood_dist_test = df['glottocode'].isin(oo_dist) & (df['id'].str.contains('test'))

mask_other_train = ~(mask_in_dist_train | mask_in_dist_eval | mask_in_dist_test | mask_ood_dist_train | mask_ood_dist_eval | mask_ood_dist_test)

split_dataset = datasets.DatasetDict()

split_dataset['train']     = datasets.Dataset.from_pandas(df[mask_in_dist_train | mask_other_train])
split_dataset['train_ID']  = datasets.Dataset.from_pandas(df[mask_in_dist_train])
split_dataset['eval_ID']   = datasets.Dataset.from_pandas(df[mask_in_dist_eval])
split_dataset['test_ID']   = datasets.Dataset.from_pandas(df[mask_in_dist_test])
split_dataset['train_OOD'] = datasets.Dataset.from_pandas(df[mask_ood_dist_train])
split_dataset['eval_OOD']  = datasets.Dataset.from_pandas(df[mask_ood_dist_eval])
split_dataset['test_OOD']  = datasets.Dataset.from_pandas(df[mask_ood_dist_test])

split_dataset = split_dataset.remove_columns(['__index_level_0__'])
split_dataset

DatasetDict({
    train: Dataset({
        features: ['transcription', 'glosses', 'translation', 'glottocode', 'id', 'source', 'metalang_glottocode', 'is_segmented', 'language', 'metalang'],
        num_rows: 418718
    })
    train_ID: Dataset({
        features: ['transcription', 'glosses', 'translation', 'glottocode', 'id', 'source', 'metalang_glottocode', 'is_segmented', 'language', 'metalang'],
        num_rows: 104928
    })
    eval_ID: Dataset({
        features: ['transcription', 'glosses', 'translation', 'glottocode', 'id', 'source', 'metalang_glottocode', 'is_segmented', 'language', 'metalang'],
        num_rows: 11138
    })
    test_ID: Dataset({
        features: ['transcription', 'glosses', 'translation', 'glottocode', 'id', 'source', 'metalang_glottocode', 'is_segmented', 'language', 'metalang'],
        num_rows: 11940
    })
    train_OOD: Dataset({
        features: ['transcription', 'glosses', 'translation', 'glottocode', 'id', 'source', 'metalang_glottocode', 'is_s

In [6]:
split_dataset.push_to_hub('lecslab/glosslm-split', commit_message='Create new splits')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/419 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

In [7]:
# Sanity check

split_dataset_total_rows = sum(len(dataset) for dataset in split_dataset.values()) - len(split_dataset['train_ID'])

if (split_dataset_total_rows != len(df)):
    print(f"Mismatch! {split_dataset_total_rows} in split and {len(df)} total")
else:
    print("Looks good :)")

Looks good :)


In [4]:
import datasets
data = datasets.load_dataset("lecslab/glosslm-split")

data['train_OOD'][200]

{'ID': 'st_train_lezg1247_95',
 'glottocode': 'lezg1247',
 'transcription': 'са эрменидин кьилел хер хьанава .',
 'glosses': 'one armenian-ERG-GEN head-INESS-SPSS wound happened-PERF .',
 'translation': 'And an armenian guy was wounded in his head.',
 'metalang_glottocode': 'stan1293',
 'is_segmented': 'no',
 'source': 'sigmorphon_st',
 'language': 'Lezgian',
 'metalang': 'English'}

In [7]:
data['train_OOD'][2000]

{'ID': 'st_train_natu1246_294',
 'glottocode': 'natu1246',
 'transcription': 'Mz nibrde sc tzwrkxtrpeng rlilrdr .',
 'glosses': 'PREP back.part-3MINII PFV RL.3AUG-baptised-COS-3AUGIS MID-two-APPL-3AUGII .',
 'translation': 'Afterward the two of them were baptised.',
 'metalang_glottocode': 'stan1293',
 'is_segmented': 'no',
 'source': 'sigmorphon_st',
 'language': 'Natügu',
 'metalang': 'English'}