In [1]:
import os
from st_data.data import IGTLine

def load_data_file(path: str) -> list:
    r"""Load interlinear glossed text (IGT) entries from a file or a directory.

    If ``path`` is a directory, every ``.txt`` file directly inside it is
    loaded in sorted name order and the results are concatenated.
    Otherwise ``path`` is read as UTF-8 text, one tagged line at a time:

    * ``\t`` transcription
    * ``\m`` segmentation
    * ``\g`` glosses (a gloss line with no content is ignored)
    * ``\l`` translation; this line completes the current entry
    * ``\p`` part-of-speech line, ignored

    An entry that has no translation line is completed by the next blank
    line (or by the end of the file) and stored with ``translation=None``.
    Any other non-blank line, including a tag that is already filled for
    the current entry, is skipped; the number of skipped lines is printed.

    Args:
        path: Path to a single data file, or to a directory of ``.txt`` files.

    Returns:
        A list of ``IGTLine`` objects in file order.

    Raises:
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # Directory: load each ``.txt`` file directly inside it and concatenate.
    if os.path.isdir(path):
        all_data = []
        # os.listdir returns names in arbitrary order; sorting keeps the entry
        # order (and anything derived from it, such as row indices) reproducible.
        for name in sorted(os.listdir(path)):
            if name.endswith(".txt"):
                all_data.extend(load_data_file(os.path.join(path, name)))
        return all_data

    all_data = []
    skipped_lines = []
    current_entry = [None, None, None, None]  # transc, segm, gloss, transl

    def flush_partial_entry():
        """Store the pending entry, if any, without a translation and reset it."""
        if any(field is not None for field in current_entry):
            all_data.append(IGTLine(transcription=current_entry[0],
                                    segmentation=current_entry[1],
                                    glosses=current_entry[2],
                                    translation=None))
            current_entry[:] = [None, None, None, None]

    # Explicit UTF-8: the platform default encoding is locale-dependent and
    # IGT data routinely contains non-ASCII characters.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            line_prefix = line[:2]
            content = line[3:].strip()

            # A tag that is already filled for the current entry falls through
            # to the "skipped" branch below rather than overwriting the field.
            if line_prefix == '\\t' and current_entry[0] is None:
                current_entry[0] = content
            elif line_prefix == '\\m' and current_entry[1] is None:
                current_entry[1] = content
            elif line_prefix == '\\g' and current_entry[2] is None:
                if content:
                    current_entry[2] = content
            elif line_prefix == '\\l':
                # The translation is the last line of an entry: save and reset.
                all_data.append(IGTLine(transcription=current_entry[0],
                                        segmentation=current_entry[1],
                                        glosses=current_entry[2],
                                        translation=content))
                current_entry[:] = [None, None, None, None]
            elif line_prefix == "\\p":
                # Skip POS lines
                continue
            elif line.strip() != "":
                # Unrecognised or repeated line
                skipped_lines.append(line)
            else:
                # Blank line: closes an entry that had no translation line
                flush_partial_entry()

    # The file may end without a trailing blank line
    flush_partial_entry()
    print(f"Skipped {len(skipped_lines)} lines")
    return all_data

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import datasets

# download_mode='force_redownload' asks `datasets` to fetch the data again instead of reusing a cached copy.
# NOTE(review): presumably needed because this notebook later pushes new data to the same repo,
# which would make a cached copy stale — confirm.
glosslm = datasets.load_dataset("lecslab/glosslm", download_mode='force_redownload')

Downloading readme: 100%|██████████| 752/752 [00:00<00:00, 4.13MB/s]
Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading data:   0%|          | 0.00/28.0M [00:00<?, ?B/s][A
Downloading data:  15%|█▍        | 4.19M/28.0M [00:00<00:04, 4.92MB/s][A
Downloading data:  45%|████▍     | 12.6M/28.0M [00:01<00:01, 9.35MB/s][A
Downloading data:  75%|███████▍  | 21.0M/28.0M [00:02<00:00, 11.5MB/s][A
Downloading data: 100%|██████████| 28.0M/28.0M [00:02<00:00, 12.5MB/s][A
Downloading data files: 100%|██████████| 1/1 [00:02<00:00,  2.25s/it]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 623.78it/s]
Generating train split: 100%|██████████| 425020/425020 [00:00<00:00, 2740366.30 examples/s]


In [5]:
def create_hf_dataset(filename, isocode, glottocode, metalang):
    """Build a single Hugging Face dataset from one language's train/dev/test files.

    Args:
        filename: Directory that holds the split files (a directory path,
            despite the parameter name).
        isocode: Code used as the prefix of the split file names.
        glottocode: Stored in every row's ``glottocode`` column and in its ID.
        metalang: Stored in every row's ``metalang_glottocode`` column.

    Returns:
        A ``datasets.Dataset`` with the train rows first, then dev, then test.
        Each parsed entry contributes two rows that share one ID: a segmented
        row (``transcription`` holds the entry's segmentation) followed by an
        unsegmented row (``transcription`` holds the entry's transcription).
    """
    print(f"Loading {filename}")
    train_data = load_data_file(filename + f"/{isocode}-train-track2-uncovered")
    dev_data = load_data_file(filename + f"/{isocode}-dev-track2-uncovered")
    test_data = load_data_file(filename + f"/{isocode}-test-track2-uncovered")

    def parse_data(raw_data, id_prefix: str):
        """Expand every entry of one split into its segmented and unsegmented rows."""
        rows = []
        for i, entry in enumerate(raw_data):
            # Segmented variant first, unsegmented second; only the flag and
            # the text placed in the 'transcription' column differ.
            variants = (("yes", entry.segmentation), ("no", entry.transcription))
            for segmented_flag, text in variants:
                # Key order determines the column order of the resulting dataset.
                rows.append({
                    'glottocode': glottocode,
                    'metalang_glottocode': metalang,
                    "is_segmented": segmented_flag,
                    "source": "sigmorphon_st",
                    "type": "canonical",
                    'ID': f"st_{id_prefix}_{glottocode}_{i}",
                    'transcription': text,
                    'glosses': entry.glosses,
                    'translation': entry.translation,
                })
        return rows

    all_rows = []
    for split_entries, split_name in ((train_data, 'train'), (dev_data, 'dev'), (test_data, 'test')):
        all_rows += parse_data(split_entries, split_name)

    return datasets.Dataset.from_list(all_rows)

# One dataset per shared-task language, built in the order listed.
# Each tuple is: (dict key, directory of split files, code used in the split
# file names, glottocode, metalanguage glottocode).
# NOTE(review): the Natugu key is "nat" while its file-name code is "ntu";
# every other key matches its code — confirm this is intended.
st_data = {
    key: create_hf_dataset(split_dir, file_code, glottocode, metalang)
    for key, split_dir, file_code, glottocode, metalang in (
        ("arp", "./st_data/splits/Arapaho", "arp", "arap1274", "stan1293"),
        ("git", "./st_data/splits/Gitksan", "git", "gitx1241", "stan1293"),
        ("lez", "./st_data/splits/Lezgi", "lez", "lezg1247", "stan1293"),
        ("nat", "./st_data/splits/Natugu", "ntu", "natu1246", "stan1293"),
        ("nyb", "./st_data/splits/Nyangbo", "nyb", "nyan1302", "stan1293"),
        ("ddo", "./st_data/splits/Tsez", "ddo", "dido1241", "stan1293"),
        ("usp", "./st_data/splits/Uspanteko", "usp", "uspa1245", "stan1288"),
    )
}

Loading ./st_data/splits/Arapaho
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Loading ./st_data/splits/Gitksan
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Loading ./st_data/splits/Lezgi
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Loading ./st_data/splits/Natugu
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Loading ./st_data/splits/Nyangbo
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Loading ./st_data/splits/Tsez
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Loading ./st_data/splits/Uspanteko
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines


In [22]:
# Display the per-language datasets to inspect their columns and row counts.
st_data

{'arp': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'type', 'ID', 'transcription', 'glosses', 'translation'],
     num_rows: 97832
 }),
 'git': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'type', 'ID', 'transcription', 'glosses', 'translation'],
     num_rows: 220
 }),
 'lez': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'type', 'ID', 'transcription', 'glosses', 'translation'],
     num_rows: 1752
 }),
 'nat': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'type', 'ID', 'transcription', 'glosses', 'translation'],
     num_rows: 1978
 }),
 'nyb': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'type', 'ID', 'transcription', 'glosses', 'translation'],
     num_rows: 5252
 }),
 'ddo': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'ty

In [9]:
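# NOTE: disabled on purpose; it appears to be superseded by the "Replace data on hf" cell below.
# This version appends the shared-task rows to whatever glosslm['train'] already contains,
# without first dropping existing "sigmorphon_st" rows as the later cell does.
# combined = datasets.concatenate_datasets([glosslm['train']] + list(st_data.values()))
# combined.push_to_hub("lecslab/glosslm")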

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Updating downloaded metadata with the new split.


In [7]:
import pandas as pd

# Replace the shared-task ("sigmorphon_st") rows of the Hub dataset with the freshly parsed ones.

# All per-language shared-task datasets as one DataFrame.
st_data_rows = datasets.concatenate_datasets(list(st_data.values())).to_pandas()

# Drop the previously uploaded shared-task rows so they are replaced rather than duplicated.
old_data = glosslm['train'].to_pandas()
old_data = old_data[old_data['source'] != "sigmorphon_st"]

# ignore_index gives the combined frame a clean 0..n-1 index, and preserve_index=False
# keeps the pandas index out of the dataset, so no "__index_level_0__" column is created
# that would have to be removed by name afterwards.
a = pd.concat([old_data, st_data_rows], ignore_index=True)
ds = datasets.Dataset.from_pandas(a, preserve_index=False)

# NOTE: this overwrites the published dataset on the Hub; run this cell deliberately.
ds.push_to_hub("lecslab/glosslm", commit_message='Fix ST data')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|          | 0/426 [00:00<?, ?ba/s][A
Creating parquet from Arrow format:  23%|██▎       | 100/426 [00:00<00:00, 999.02ba/s][A
Creating parquet from Arrow format:  58%|█████▊    | 246/426 [00:00<00:00, 1268.54ba/s][A
Creating parquet from Arrow format: 100%|██████████| 426/426 [00:00<00:00, 1271.09ba/s][A
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.21s/it]
README.md: 100%|██████████| 752/752 [00:00<00:00, 2.68MB/s]
