In [2]:
import os
from data import IGTLine

def load_data_file(path: str):
    r"""Load IGT data from a file (or a directory of ``.txt`` files) into a list of entries.

    Each line of a data file is identified by its first two characters, and its
    content is taken from the fourth character onwards (stripped):

    * ``\t`` -- transcription
    * ``\m`` -- segmentation
    * ``\g`` -- glosses (an empty gloss line is ignored)
    * ``\l`` -- translation; this line closes the current entry
    * ``\p`` -- POS line, ignored

    A blank line (or the end of the file) closes a partially filled entry, which
    is then stored with ``translation=None``. Any other non-blank line -- including
    a repeated field within one entry -- is counted as skipped, and the number of
    skipped lines is printed once per file.

    Args:
        path: Path to a single data file, or to a directory. For a directory,
            every entry whose name ends in ``.txt`` is loaded (in sorted name
            order, so the result is reproducible) and the results are concatenated.

    Returns:
        A list of ``IGTLine`` objects in file order.
    """
    # Directory: load each ".txt" entry and concatenate. Sorting makes the order
    # deterministic; os.listdir() returns names in arbitrary order.
    if os.path.isdir(path):
        all_data = []
        for name in sorted(os.listdir(path)):
            if name.endswith(".txt"):
                all_data.extend(load_data_file(os.path.join(path, name)))
        return all_data

    all_data = []
    skipped_lines = []

    # Slot of each recognised prefix in `entry`: transc, segm, gloss, transl
    slots = {'\\t': 0, '\\m': 1, '\\g': 2, '\\l': 3}
    entry = [None, None, None, None]

    def flush(translation):
        """Store the current entry with the given translation and start a new one."""
        all_data.append(IGTLine(transcription=entry[0],
                                segmentation=entry[1],
                                glosses=entry[2],
                                translation=translation))
        entry[:] = [None, None, None, None]

    # Explicit encoding: the data contains non-ASCII text and must not depend on
    # the platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            line_prefix = line[:2]
            slot = slots.get(line_prefix)

            if slot is not None and entry[slot] is None:
                content = line[3:].strip()
                if line_prefix == '\\g':
                    # An empty gloss line leaves the slot unfilled
                    if len(content) > 0:
                        entry[slot] = content
                else:
                    entry[slot] = content
                    if line_prefix == '\\l':
                        # The translation is the last line of an entry
                        flush(content)
            elif line_prefix == "\\p":
                # Skip POS lines
                continue
            elif line.strip() != "":
                # Unknown prefix, or a field that is already filled for this entry
                skipped_lines.append(line)
            elif any(field is not None for field in entry):
                # Blank line: close an entry that never got a translation
                flush(None)

    # The file may end without a trailing blank line
    if any(field is not None for field in entry):
        flush(None)

    print(f"Skipped {len(skipped_lines)} lines")
    return all_data

In [3]:
import datasets

# Fetch the current hub copy of the corpus. 'force_redownload' bypasses any
# locally cached copy and "no_checks" disables the download verification step.
glosslm = datasets.load_dataset(
    "lecslab/glosslm",
    download_mode='force_redownload',
    verification_mode="no_checks",
)

Downloading readme:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/22.5M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/312294 [00:00<?, ? examples/s]

In [4]:
def create_hf_dataset(filename, isocode, glottocode, metalang):
    """Build one dataset from the train, dev and test splits of a single language.

    Args:
        filename: Directory holding the split files, which are named
            ``{isocode}-{split}-track2-uncovered``.
        isocode: Language code used in the split file names.
        glottocode: Value stored in each row's ``glottocode`` column; also part
            of the row ``id``.
        metalang: Value stored in each row's ``metalang_glottocode`` column.

    Returns:
        A ``datasets.Dataset`` with two rows per loaded entry, in train, dev,
        test order: first a row marked ``is_segmented="yes"`` whose
        ``transcription`` is the entry's segmentation, then a row marked
        ``"no"`` whose ``transcription`` is the raw transcription. Both rows
        share the same ``id``, glosses and translation.
    """
    print(f"Loading {filename}")

    # Loaded eagerly and in this order, so the per-file log lines appear train/dev/test.
    entries_by_split = {
        'train': load_data_file(filename + f"/{isocode}-train-track2-uncovered"),
        'dev': load_data_file(filename + f"/{isocode}-dev-track2-uncovered"),
        'test': load_data_file(filename + f"/{isocode}-test-track2-uncovered"),
    }

    def rows_for_split(split_name, entries):
        """Expand each entry into its segmented and unsegmented row."""
        rows = []
        for index, entry in enumerate(entries):
            example_id = f"st_{split_name}_{glottocode}_{index}"
            variants = (("yes", entry.segmentation), ("no", entry.transcription))
            for segmented_flag, text in variants:
                # Key order fixes the column order of the resulting dataset
                rows.append({
                    'glottocode': glottocode,
                    'metalang_glottocode': metalang,
                    "is_segmented": segmented_flag,
                    "source": "sigmorphon_st",
                    'id': example_id,
                    'transcription': text,
                    'glosses': entry.glosses,
                    'translation': entry.translation,
                })
        return rows

    all_rows = []
    for split_name, entries in entries_by_split.items():
        all_rows.extend(rows_for_split(split_name, entries))

    return datasets.Dataset.from_list(all_rows)

# One row per shared-task language:
# (key in st_data, split directory, code used in the split file names,
#  language glottocode, metalanguage glottocode)
# Note: the Natugu entry is keyed "nat" although its files use the code "ntu".
_ST_LANGUAGES = [
    ("arp", "../data/st_data/splits/Arapaho", "arp", "arap1274", "stan1293"),
    ("git", "../data/st_data/splits/Gitksan", "git", "gitx1241", "stan1293"),
    ("lez", "../data/st_data/splits/Lezgi", "lez", "lezg1247", "stan1293"),
    ("nat", "../data/st_data/splits/Natugu", "ntu", "natu1246", "stan1293"),
    ("nyb", "../data/st_data/splits/Nyangbo", "nyb", "nyan1302", "stan1293"),
    ("ddo", "../data/st_data/splits/Tsez", "ddo", "dido1241", "stan1293"),
    ("usp", "../data/st_data/splits/Uspanteko", "usp", "uspa1245", "stan1288"),
]

# Build the datasets in table order, one per language.
st_data = {
    key: create_hf_dataset(split_dir, isocode, glottocode, metalang)
    for key, split_dir, isocode, glottocode, metalang in _ST_LANGUAGES
}

Loading ../data/st_data/splits/Arapaho
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Loading ../data/st_data/splits/Gitksan
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Loading ../data/st_data/splits/Lezgi
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Loading ../data/st_data/splits/Natugu
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Loading ../data/st_data/splits/Nyangbo
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Loading ../data/st_data/splits/Tsez
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines
Loading ../data/st_data/splits/Uspanteko
Skipped 0 lines
Skipped 0 lines
Skipped 0 lines


In [4]:
# Display the per-language datasets (columns and row counts) as a sanity check.
st_data

{'arp': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'id', 'transcription', 'glosses', 'translation'],
     num_rows: 97832
 }),
 'git': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'id', 'transcription', 'glosses', 'translation'],
     num_rows: 220
 }),
 'lez': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'id', 'transcription', 'glosses', 'translation'],
     num_rows: 1752
 }),
 'nat': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'id', 'transcription', 'glosses', 'translation'],
     num_rows: 1978
 }),
 'nyb': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'id', 'transcription', 'glosses', 'translation'],
     num_rows: 5252
 }),
 'ddo': Dataset({
     features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'id', 'transcription', 'glosses', 'translat

In [9]:
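# NOTE: superseded by the "Replace data on hf" cells below, which remove the
# existing sigmorphon_st rows before adding the new ones. Kept for reference only.
# combined = datasets.concatenate_datasets([glosslm['train']] + list(st_data.values()))
# combined.push_to_hub("lecslab/glosslm")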

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Updating downloaded metadata with the new split.


In [5]:
import pandas as pd

# Replace data on hf

# Flatten all per-language datasets into one DataFrame so the text columns can
# be normalised with vectorised string operations.
st_data_rows = datasets.concatenate_datasets(list(st_data.values())).to_pandas()

# Transcription clean-up:
#   * tabs become single spaces (literal replacement, hence regex=False)
#   * "?", ".", "!" and "," directly after a word character are split off with a space
#   * a hyphen followed by whitespace or end-of-string is replaced by a space
# All regex patterns are raw strings so that "\-", "\s", "\w" etc. reach the
# regex engine unchanged instead of being invalid string escape sequences.
st_data_rows['transcription'] = (
    st_data_rows['transcription']
    .str.replace('\t', ' ', regex=False)
    .str.replace(r"(\w)\?", r"\1 ?", regex=True)
    .str.replace(r"(\w)\.", r"\1 .", regex=True)
    .str.replace(r"(\w)\!", r"\1 !", regex=True)
    .str.replace(r"(\w)\,", r"\1 ,", regex=True)
    .str.replace(r"\-(\s|$)", " ", regex=True)
)

# Gloss clean-up: same idea, but punctuation is only split off when it is
# followed by whitespace or end-of-string, so gloss-internal periods
# (e.g. "here.it.is") are left intact.
st_data_rows['glosses'] = (
    st_data_rows['glosses']
    .str.replace("\t", " ", regex=False)
    .str.replace(r"\-(\s|$)", " ", regex=True)
    .str.replace(r"(\w)\.(\s|$)", r"\1 . ", regex=True)
    .str.replace(r"(\w)\!(\s|$)", r"\1 ! ", regex=True)
    .str.replace(r"(\w)\?(\s|$)", r"\1 ? ", regex=True)
)
st_data_rows

Unnamed: 0,glottocode,metalang_glottocode,is_segmented,source,id,transcription,glosses,translation
0,arap1274,stan1293,yes,sigmorphon_st,st_train_arap1274_0,wootii niiyou heesi-ini hee3ohwoo-ni3 'oh hih-...,like here.it.is what-DETACH how.s.o..is.dancin...,"I guess the way he was dancing , they had neve..."
1,arap1274,stan1293,no,sigmorphon_st,st_train_arap1274_0,wootii niiyou heesiini hee3ohwooni3 'oh hih'ow...,like here.it.is what-DETACH how.s.o..is.dancin...,"I guess the way he was dancing , they had neve..."
2,arap1274,stan1293,yes,sigmorphon_st,st_train_arap1274_1,'oh siiyeih hiiwoonhehe' hoowooh-'uni,but INTENSE now no.longer-DETACH,But today that's really gone .
3,arap1274,stan1293,no,sigmorphon_st,st_train_arap1274_1,'oh siiyeih hiiwoonhehe' hoowooh'uni,but INTENSE now no.longer-DETACH,But today that's really gone .
4,arap1274,stan1293,yes,sigmorphon_st,st_train_arap1274_2,nih-tonoun-owoo biii-no' noh hoote,PAST-use-1S plume-NA.PL and sinew,I used feathers and sinew .
...,...,...,...,...,...,...,...,...
137203,uspa1245,stan1288,no,sigmorphon_st,st_test_uspa1245_630,loke tren jun kristyan re jun kristyan .,loque INC-E3S-hacer uno persona PART uno persona,Loque hace una persona a una otra persona.
137204,uspa1245,stan1288,yes,sigmorphon_st,st_test_uspa1245_631,syempr ti-j-toj na loke t-r-en re,siempre INC-E3S-pagar PART loque INC-E3S-hacer...,Siempre tiene que pagar loque uno hace.
137205,uspa1245,stan1288,no,sigmorphon_st,st_test_uspa1245_631,Syempr tijtoj na loke tren re .,siempre INC-E3S-pagar PART loque INC-E3S-hacer...,Siempre tiene que pagar loque uno hace.
137206,uspa1245,stan1288,yes,sigmorphon_st,st_test_uspa1245_632,ri' li t-an-b'ij,DEM DEM INC-E1S-decir,Eso es lo que digo.


In [6]:
# Keep every hub row except the previously uploaded shared-task rows, so the
# freshly cleaned rows replace them instead of being appended as duplicates.
old_data = glosslm['train'].to_pandas().loc[lambda frame: frame['source'] != "sigmorphon_st"]

a = pd.concat([old_data, st_data_rows])

# The pandas index comes through as an "__index_level_0__" column; drop it.
ds = datasets.Dataset.from_pandas(a)
ds = ds.remove_columns(["__index_level_0__"])
ds

Dataset({
    features: ['transcription', 'glosses', 'translation', 'glottocode', 'id', 'source', 'metalang_glottocode', 'is_segmented'],
    num_rows: 451108
})

In [7]:
# Publish the merged dataset to the hub repository it was loaded from.
ds.push_to_hub(
    "lecslab/glosslm",
    commit_message='Add ST data',
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/452 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/558 [00:00<?, ?B/s]