In [39]:
import os
from st_data.data import IGTLine

def load_data_file(path: str):
    r"""Load IGT data from a file, or from every ``.txt`` entry of a directory.

    Each entry in a file is a group of lines identified by their first two
    characters: ``\t`` (transcription), ``\m`` (segmentation), ``\g`` (glosses)
    and ``\l`` (translation). The line content is read from the fourth
    character onward and stripped. ``\p`` lines are ignored. An entry is
    emitted as soon as its ``\l`` line is read; an entry that is cut short by a
    blank line or by the end of the file is emitted with ``translation=None``.

    Args:
        path: Path to a single data file, or to a directory whose ``.txt``
            entries are loaded in sorted name order and concatenated.

    Returns:
        A list of ``IGTLine`` objects, in file order.

    Side effects:
        Prints the number of non-blank lines that could not be assigned to an
        entry (unknown prefix, or a field that was already filled).
    """
    all_data = []

    # Directory: concatenate the contents of every ``.txt`` entry. The names
    # are sorted because ``os.listdir`` returns them in arbitrary order, and
    # callers derive IDs from the position of each entry in the result.
    if os.path.isdir(path):
        for name in sorted(os.listdir(path)):
            if name.endswith(".txt"):
                all_data.extend(load_data_file(os.path.join(path, name)))
        return all_data

    def build_line(entry):
        """Turn a [transc, segm, gloss, transl] list into an IGTLine."""
        return IGTLine(transcription=entry[0],
                       segmentation=entry[1],
                       glosses=entry[2],
                       translation=entry[3])

    def has_content(entry):
        """True once at least one field of the entry has been filled."""
        return any(field is not None for field in entry)

    # Line prefix -> index into the current entry.
    field_slots = {'\\t': 0, '\\m': 1, '\\g': 2, '\\l': 3}
    gloss_slot, translation_slot = 2, 3

    skipped_lines = []
    current_entry = [None, None, None, None]  # transc, segm, gloss, transl

    # Explicit encoding: do not depend on the platform default for this text.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            line_prefix = line[:2]
            slot = field_slots.get(line_prefix)

            if slot is not None and current_entry[slot] is None:
                content = line[3:].strip()
                # An empty gloss line leaves the gloss unset, but is not
                # counted as a skipped line.
                if slot != gloss_slot or content:
                    current_entry[slot] = content
                # The translation is the last field of an entry: save it.
                if slot == translation_slot:
                    all_data.append(build_line(current_entry))
                    current_entry = [None, None, None, None]
            elif line_prefix == '\\p':
                # Skip POS lines
                continue
            elif line.strip() != "":
                # Unknown prefix, or a field that is already filled for the
                # current entry: something is wrong with this line.
                skipped_lines.append(line)
            elif has_content(current_entry):
                # Blank line: close an entry that never got a translation.
                all_data.append(build_line(current_entry))
                current_entry = [None, None, None, None]

    # The file may end without a blank line after an untranslated entry.
    if has_content(current_entry):
        all_data.append(build_line(current_entry))

    print(f"Skipped {len(skipped_lines)} lines")
    return all_data

In [18]:
import datasets

# Load the existing "lecslab/glosslm" dataset; its 'train' split is the base
# that the shared-task rows are concatenated onto further down.
glosslm = datasets.load_dataset("lecslab/glosslm")

Downloading readme:   0%|          | 0.00/633 [00:00<?, ?B/s]

Using custom data configuration lecslab--glosslm-cc23d567cd684e4f


Downloading and preparing dataset None/None to /Users/milesper/.cache/huggingface/datasets/lecslab___parquet/lecslab--glosslm-cc23d567cd684e4f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/12.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/116339 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /Users/milesper/.cache/huggingface/datasets/lecslab___parquet/lecslab--glosslm-cc23d567cd684e4f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [46]:
def create_hf_dataset(filename, glottocode, metalang):
    """Build a dataset with two rows per IGT entry loaded from `filename`.

    Every entry returned by `load_data_file` produces a segmented row, whose
    'transcription' is the entry's segmentation, followed by an unsegmented
    row, whose 'transcription' is the entry's raw transcription and whose ID
    carries an `_unseg` suffix. Both rows share the glosses and translation.

    Args:
        filename: File or directory path handed to `load_data_file`.
        glottocode: Stored in the 'glottocode' column and embedded in the IDs.
        metalang: Stored in the 'metalang_glottocode' column.

    Returns:
        The result of `datasets.Dataset.from_list` over the generated rows.
    """
    print(f"Loading {filename}")
    entries = load_data_file(filename)

    rows = []
    for index, entry in enumerate(entries):
        # (is_segmented flag, row ID, text used as the transcription)
        variants = (
            ("yes", f"st_{glottocode}_{index}", entry.segmentation),
            ("no", f"st_{glottocode}_{index}_unseg", entry.transcription),
        )
        for segmented_flag, row_id, text in variants:
            # Key order is kept stable since it determines the column order.
            rows.append({
                'glottocode': glottocode,
                'metalang_glottocode': metalang,
                "is_segmented": segmented_flag,
                "source": "sigmorphon_st",
                'ID': row_id,
                'transcription': text,
                'glosses': entry.glosses,
                'translation': entry.translation,
            })

    return datasets.Dataset.from_list(rows)

# One dataset per language, keyed by a short language code. Each spec is
# (key, data path, language glottocode, metalanguage glottocode); the paths
# are either a single file or a directory of .txt files.
st_data = {
    key: create_hf_dataset(data_path, glottocode, metalang)
    for key, data_path, glottocode, metalang in [
        ("arp", "./st_data/Arapaho/st_data/arp-CLDFmaster.txt", "arap1274", "stan1293"),
        ("git", "./st_data/Gitksan/st_data", "gitx1241", "stan1293"),
        ("lez", "./st_data/Lezgi/st_data/lez-CLDFmaster.txt", "lezg1247", "stan1293"),
        ("nat", "./st_data/Natugu/st_data/ntu.txt", "natu1246", "stan1293"),
        ("nyb", "./st_data/Nyangbo/st_data/nyb.txt", "nyan1302", "stan1293"),
        ("ddo", "./st_data/Tsez/st_data/ddo-CLDFmaster.txt", "dido1241", "stan1293"),
        ("usp", "./st_data/Uspanteko/st_data", "uspa1245", "stan1288"),
    ]
}

Loading ./st_data/Arapaho/st_data/arp-CLDFmaster.txt
Skipped 0 lines
Loading ./st_data/Gitksan/st_data
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Loading ./st_data/Lezgi/st_data/lez-CLDFmaster.txt
Skipped 0 lines
Loading ./st_data/Natugu/st_data/ntu.txt
Skipped 0 lines
Loading ./st_data/Nyangbo/st_data/nyb.txt
Skipped 0 lines
Loading ./st_data/Tsez/st_data/ddo-CLDFmaster.txt
Skipped 0 lines
Loading ./st_data/Uspanteko/st_data
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines


In [41]:
# Display the per-language datasets to check their columns and row counts.
st_data

{'arp': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'ID', 'transcription', 'glosses', 'translation'],
     num_rows: 97832
 }),
 'git': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'ID', 'transcription', 'glosses', 'translation'],
     num_rows: 220
 }),
 'lez': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'ID', 'transcription', 'glosses', 'translation'],
     num_rows: 1752
 }),
 'nat': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'ID', 'transcription', 'glosses', 'translation'],
     num_rows: 1978
 }),
 'nyb': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'ID', 'transcription', 'glosses', 'translation'],
     num_rows: 5252
 }),
 'ddo': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'ID', 'transcription', 'glosses', 'translat

In [54]:
# Keep this cell idempotent: the result is pushed back to the same repository
# that `glosslm` was loaded from, so on a later run the base split may already
# contain the shared-task rows. Drop those (they are tagged with
# source == "sigmorphon_st" when built) before appending the fresh copies,
# otherwise every re-run would duplicate them.
glosslm_without_st = glosslm['train'].filter(lambda row: row['source'] != "sigmorphon_st")

combined = datasets.concatenate_datasets([glosslm_without_st] + list(st_data.values()))

# NOTE: this replaces the published contents of "lecslab/glosslm".
combined.push_to_hub("lecslab/glosslm")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Updating downloaded metadata with the new split.
