In [22]:
from huggingface_hub import login
# Authenticate against the Hugging Face Hub (prompts for a token).
# Presumably required for the push_to_hub call later in this notebook.
login()

Token is valid.
Your token has been saved in your configured git credential helpers (osxkeychain).
Your token has been saved to /Users/lindiatjuatja/.cache/huggingface/token
Login successful


In [25]:
import os
from st_data.data import IGTLine

def load_data_file(path: str):
    r"""Loads a file containing IGT data into a list of entries.

    If ``path`` is a directory, every ``.txt`` file directly inside it is
    loaded and the results are concatenated.

    Otherwise the file is read line by line and dispatched on the first two
    characters of each line:

    * ``\t`` - transcription
    * ``\m`` - segmentation
    * ``\g`` - glosses (an empty gloss line is ignored)
    * ``\l`` - translation; this closes the entry and emits an ``IGTLine``
    * ``\p`` - POS line, skipped

    A blank line (or the end of the file) closes a partially filled entry,
    which is emitted with ``translation=None``. Any other non-blank line,
    including a repeated field within one entry, is skipped.

    Args:
        path: Path to a single IGT file or to a directory of ``.txt`` files.

    Returns:
        A list of ``IGTLine`` objects, in file order.
    """
    all_data = []

    # If we have a directory, load every .txt file in it and concat together
    if os.path.isdir(path):
        for file in os.listdir(path):
            if file.endswith(".txt"):
                all_data.extend(load_data_file(os.path.join(path, file)))
        return all_data

    # If we have one file, read in line by line.
    # The encoding is explicit so parsing does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as file:
        current_entry = [None, None, None, None]  # transc, segm, gloss, transl

        # Kept for debugging: lines that did not fit the expected structure
        skipped_lines = []

        for line in file:
            # Determine the type of line
            # If we see a type that has already been filled for the current entry, something is wrong
            line_prefix = line[:2]
            content = line[3:].strip()
            if line_prefix == '\\t' and current_entry[0] is None:
                current_entry[0] = content
            elif line_prefix == '\\m' and current_entry[1] is None:
                current_entry[1] = content
            elif line_prefix == '\\g' and current_entry[2] is None:
                # An empty gloss line leaves the slot open for a later one
                if content:
                    current_entry[2] = content
            elif line_prefix == '\\l' and current_entry[3] is None:
                current_entry[3] = content
                # Once we have the translation, we've reached the end and can save this entry
                all_data.append(IGTLine(transcription=current_entry[0],
                                        segmentation=current_entry[1],
                                        glosses=current_entry[2],
                                        translation=current_entry[3]))
                current_entry = [None, None, None, None]
            elif line_prefix == "\\p":
                # Skip POS lines
                continue
            elif line.strip() != "":
                # Something went wrong
                skipped_lines.append(line)
            elif any(field is not None for field in current_entry):
                # Blank line: close an entry that never got a translation
                all_data.append(IGTLine(transcription=current_entry[0],
                                        segmentation=current_entry[1],
                                        glosses=current_entry[2],
                                        translation=None))
                current_entry = [None, None, None, None]

        # The file may end without a blank line, leaving one unfinished entry.
        # Only an IGTLine is appended, so every element of the result has the same type.
        if any(field is not None for field in current_entry):
            all_data.append(IGTLine(transcription=current_entry[0],
                                    segmentation=current_entry[1],
                                    glosses=current_entry[2],
                                    translation=None))
        # print(f"Skipped {len(skipped_lines)} lines")
    return all_data

In [48]:
import json
from iso639 import Lang
# import langid
from langid.langid import LanguageIdentifier, model

# Module-level language identifier used by get_trans_lang_code.
# norm_probs=True is requested so the score returned by classify() can be
# compared against the 0.95 threshold there (presumably a 0-1 probability; confirm in langid docs).
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

def get_code_mapping(path: str):
    """Builds a lookup from identifier codes to resource ids.

    The JSON file at ``path`` must contain a top-level ``'resources'`` list.
    Each resource has an ``'id'`` and a list of ``'identifiers'``, each of
    which carries an ``'identifier'`` string. Every identifier string is
    mapped to the ``'id'`` of its resource. If the same identifier occurs
    more than once, the last occurrence wins.

    Args:
        path: Path to the JSON resource map.

    Returns:
        A dict mapping identifier string to resource id.
    """
    # The context manager closes the file even if parsing fails.
    with open(path, encoding='utf-8') as f:
        data = json.load(f)

    mapping = {}
    for resource in data['resources']:
        # Loop variable is deliberately not called `identifier`, which is the
        # module-level language identifier.
        for id_entry in resource['identifiers']:
            mapping[id_entry['identifier']] = resource['id']
    return mapping

def get_trans_lang_code(translation: str, mapping: dict[str, str]):
    """Guesses the language of a translation and looks up its code in ``mapping``.

    The module-level ``identifier`` classifies ``translation``. The guess is
    used only when its score is above 0.95. The predicted code is converted
    with ``Lang(...).pt3`` and looked up in ``mapping``.

    Args:
        translation: Translation text to classify.
        mapping: Lookup from language code to the code that is returned.

    Returns:
        The mapped code, or ``''`` when the score is too low or the
        language is not in ``mapping``.
    """
    prediction = identifier.classify(translation)
    confidence = prediction[1]

    # Low-confidence guesses are treated as "unknown language"
    if not confidence > 0.95:
        return ''

    iso_code = Lang(prediction[0]).pt3
    return mapping[iso_code] if iso_code in mapping else ''

In [52]:
import datasets
from tqdm.notebook import tqdm

# Build one row dict per ODIN example, in the column layout of the glosslm dataset.
mapping = get_code_mapping('../resourcemap.json')
print(mapping)

data = []
for file in tqdm(os.listdir('../data/odin_data_reformat')):
    filename = os.path.join('../data/odin_data_reformat', file)
    raw_data = load_data_file(filename)

    # The file name is "<language code>[-...].<ext>"; the leading code is looked up in the mapping
    odin_fileid = file.split('.')[0]
    odin_lang_code = odin_fileid.split('-')[0]
    glottocode = mapping.get(odin_lang_code, '')

    for i, line in enumerate(raw_data):
        translation = line.translation
        # load_data_file emits translation=None for entries without a translation line,
        # so both None and '' must skip language identification.
        if translation:
            metalang_code = get_trans_lang_code(translation, mapping)
        else:
            metalang_code = ''

        # Every example gets an unsegmented row built from the raw transcription
        unseg_row = {
            'glottocode': glottocode,
            'metalang_glottocode': metalang_code,
            'source': 'odin',
            'ID': f"odin_{odin_fileid}_{i}_unseg",
            'transcription': line.transcription,
            'glosses': line.glosses,
            'translation': line.translation,
            'is_segmented': 'no',
        }

        # Examples with a segmentation line also get a segmented row.
        # It is a separate dict, so later changes cannot alias the unsegmented row.
        if line.segmentation:
            seg_row = dict(unseg_row,
                           ID=f"odin_{odin_fileid}_{i}",
                           transcription=line.segmentation,
                           is_segmented='yes')
            data.append(seg_row)

        data.append(unseg_row)

{'aiz': 'aari1238', 'aiw': 'aari1239', 'aay': 'aari1240', 'aas': 'aasa1238', 'kbt': 'abad1241', 'abg': 'abag1245', 'abf': 'abai1240', 'abm': 'aban1242', 'mij': 'abar1238', 'aau': 'abau1245', 'abq': 'abaz1241', 'aba': 'abee1242', 'abp': 'aben1249', 'abi': 'abid1235', 'bsa': 'abin1243', 'axb': 'abip1241', 'ash': 'abis1238', 'abk': 'abkh1244', 'aob': 'abom1238', 'abo': 'abon1238', 'abr': 'abro1238', 'abn': 'abua1244', 'aah': 'abua1245', 'abz': 'abui1241', 'kgr': 'abun1252', 'abu': 'abur1243', 'mgj': 'abur1244', 'ado': 'abuu1241', 'tpx': 'acat1239', 'acn': 'acha1249', 'aca': 'acha1250', 'yif': 'ache1244', 'acz': 'ache1245', 'guq': 'ache1246', 'acr': 'achi1256', 'ace': 'achi1257', 'acc': 'achi1258', 'act': 'acht1238', 'acv': 'achu1247', 'acu': 'achu1248', 'ach': 'acol1236', 'acs': 'acro1239', 'adb': 'adab1235', 'xad': 'adai1235', 'ads': 'adam1238', 'fub': 'adam1253', 'ada': 'adan1247', 'adq': 'adan1248', 'adn': 'adan1251', 'adp': 'adap1234', 'tiu': 'adas1235', 'ade': 'adel1244', 'adh': 'adh

  0%|          | 0/1195 [00:00<?, ?it/s]

In [55]:
# Load the pinned revision of the existing dataset and append the ODIN rows to its train split.
# NOTE(review): `ignore_verifications` may be deprecated in newer `datasets` releases — confirm
# against the installed version before upgrading.
glosslm = datasets.load_dataset(
    "lecslab/glosslm",
    revision='6bc027f9910047159136f709aafebd835d6af8e5',
    ignore_verifications=True,
)
dataset = datasets.Dataset.from_list(data)
odin_data = {"odin": dataset}

print(glosslm)
print(odin_data)

combined = datasets.concatenate_datasets([glosslm['train'], *odin_data.values()])



Downloading and preparing dataset None/None to /Users/lindiatjuatja/.cache/huggingface/datasets/lecslab___parquet/lecslab--glosslm-2ada00dfe22d3c5c/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/116339 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /Users/lindiatjuatja/.cache/huggingface/datasets/lecslab___parquet/lecslab--glosslm-2ada00dfe22d3c5c/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'glottocode', 'transcription', 'glosses', 'translation', 'metalang_glottocode', 'is_segmented', 'source'],
        num_rows: 253547
    })
})
{'odin': Dataset({
    features: ['glottocode', 'metalang_glottocode', 'source', 'ID', 'transcription', 'glosses', 'translation', 'is_segmented'],
    num_rows: 197860
})}


In [57]:
# Spot-check: the last row of the combined dataset should be one of the appended ODIN rows.
print(combined[-1])

{'ID': 'odin_rcf_9_unseg', 'glottocode': 'reun1238', 'transcription': 'Met lapay asam zanimo.', 'glosses': 'give hey together animals', 'translation': 'Give the animals hey.', 'metalang_glottocode': 'stan1293', 'is_segmented': 'no', 'source': 'odin'}


In [58]:
# Publish the combined dataset to the same Hub repository the base data was loaded from.
combined.push_to_hub("lecslab/glosslm")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/452 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Updating downloaded metadata with the new split.


In [71]:
# Share of distinct ODIN example IDs that have a non-empty metalanguage code.
df = combined.to_pandas()
is_odin = df['source'] == 'odin'
df_odin = df[is_odin]
has_metalang = df_odin['metalang_glottocode'] != ''
df_odin_trans_lang = df_odin[has_metalang]

n_ids_with_metalang = len(df_odin_trans_lang['ID'].unique())
n_ids_total = len(df_odin['ID'].unique())
print(n_ids_with_metalang / n_ids_total)

0.8705248894594209
