In [7]:
from bs4 import BeautifulSoup
import pandas as pd

def convert_html_to_igt(filename):
    r"""Convert one HTML story into a backslash-marked IGT text file.

    Every ``<table>`` in the document is paired, by position, with a
    ``<div class="gloss">`` whose first ``<p>`` carries a ``style`` attribute
    containing ``color``. For each pair, the cells of the first table row
    give the transcription, the cells of the second row give the glosses, and
    the paragraph text gives the translation.

    The result is written next to the input file, with the input's extension
    replaced by ``.txt``. Each example is emitted as ``\t``, ``\m``, ``\g``
    and ``\l`` lines.

    Args:
        filename: Path to the HTML file. It is read as UTF-8.

    Returns:
        None. The only effect is the text file written to disk.
    """
    # Local import: at this point in the notebook `os` has not been imported yet.
    import os

    # Read and parse the HTML file
    with open(filename, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    tables = soup.find_all('table')

    # Keep only the gloss divs whose first <p> has a colour style; the others
    # do not contain actual translations. The paragraph is looked up once per
    # div and reused for the translation text below.
    translation_paragraphs = []
    for div in soup.find_all('div', {'class': 'gloss'}):
        paragraph = div.find('p')
        if paragraph and 'style' in paragraph.attrs and 'color' in paragraph['style']:
            translation_paragraphs.append(paragraph)

    # NOTE(review): tables and translations are matched purely by position and
    # zip() silently stops at the shorter sequence, so a stray table or gloss
    # div would shift or drop examples. Confirm the counts match per story.
    examples = []
    for table, paragraph in zip(tables, translation_paragraphs):
        rows = table.find_all('tr')

        # Extract the transcription and gloss from the table, if they exist
        if len(rows) >= 2:
            transcription_cells = rows[0].find_all('td')

            # Delete tooltips so their text does not leak into the transcription
            for tooltip in rows[0].find_all("span", {'class': 'tooltip'}):
                tooltip.decompose()

            gloss_cells = rows[1].find_all('td')

            transcription = ' '.join(cell.text.strip() for cell in transcription_cells)
            gloss = ' '.join(cell.text.strip() for cell in gloss_cells)
        else:
            transcription = ''
            gloss = ''

        examples.append((transcription, gloss, paragraph.text.strip()))

    # Swap the extension properly instead of slicing off a fixed number of
    # characters, and write UTF-8 explicitly: the locale default encoding may
    # be unable to represent the Guarani characters read above.
    output_path = os.path.splitext(filename)[0] + '.txt'
    with open(output_path, 'w', encoding='utf-8') as f:
        for transc, gloss, transl in examples:
            f.write('\n\n\\t ' + transc)
            # The \m line is written as a copy of the (segmented) transcription.
            f.write('\n\n\\m ' + transc)
            f.write('\n\\g ' + gloss)
            f.write('\n\\l ' + transl)

# Try the converter on a single story first; the next cell runs it on all of them.
convert_html_to_igt(f'./Guarani Corpus/Story1.html')

In [23]:
# Convert every story file, Story1.html through Story15.html.
for i in range(1, 16):
    convert_html_to_igt(f'./Guarani Corpus/Story{i}.html')

In [2]:
import os

# Fix segmentation/transcription lines
#
# Each "\t" line holds a hyphen-segmented transcription. It is replaced by two
# lines: "\t" with the hyphens removed (unsegmented) and "\m" with the
# original segmented text. All other lines are copied through unchanged.
# The result is written next to the input as "<name>-fixed.txt".

corpus_dir = './Guarani Corpus'
fixed_suffix = '-fixed.txt'

for file in os.listdir(corpus_dir):
    # Skip non-text files, and skip outputs of a previous run of this cell so
    # that re-running does not produce "<name>-fixed-fixed.txt".
    if not file.endswith(".txt") or file.endswith(fixed_suffix):
        continue

    new_lines = []
    # Explicit UTF-8: the locale default may not handle the Guarani characters.
    with open(os.path.join(corpus_dir, file), 'r', encoding='utf-8') as f:
        for line in f:
            line_prefix = line[:2]
            if line_prefix == '\\t':
                # Drop the 3-character "\t " marker; the trailing newline stays
                # attached, so both emitted lines end with it.
                transr = line[3:]
                transr_unseg = transr.replace("-", "")
                new_lines.append("\\t " + transr_unseg)
                new_lines.append("\\m " + transr)
            else:
                new_lines.append(line)

    # Write fixed file
    with open(os.path.join(corpus_dir, file[:-4] + fixed_suffix), 'w', encoding='utf-8') as wf:
        wf.write("".join(new_lines))

In [10]:
from st_data.data import IGTLine, create_hf_dataset

# Build a Hugging Face dataset from the fixed IGT files.
# "para1311" and "stan1293" are presumably the glottocodes of the object
# language and of the metalanguage (cf. the `glottocode` and
# `metalang_glottocode` columns in the output) - TODO confirm against
# `create_hf_dataset`.
# NOTE(review): the fix-up cell above writes its `*-fixed.txt` files directly
# into `./Guarani Corpus`, while this call reads from the `data-fixed`
# subfolder; the files appear to have been moved there by hand.
guarani_data = create_hf_dataset(
    "./Guarani Corpus/data-fixed",
    "para1311",
    "stan1293",
    row_id="guarani",
)
guarani_data

  from .autonotebook import tqdm as notebook_tqdm


Loading ./Guarani Corpus/data-fixed
Story3-fixed.txt
Looks good
Story12-fixed.txt
Looks good
Story8-fixed.txt
Looks good
Story14-fixed.txt
Looks good
Story5-fixed.txt
Looks good
Story2-fixed.txt
Looks good
Story13-fixed.txt
Looks good
Story9-fixed.txt
Looks good
Story15-fixed.txt
Looks good
Story4-fixed.txt
Looks good
Story1-fixed.txt
Looks good
Story10-fixed.txt
Looks good
Story7-fixed.txt
Looks good
Story11-fixed.txt
Looks good
Story6-fixed.txt
Looks good


Dataset({
    features: ['glottocode', 'metalang_glottocode', 'is_segmented', 'source', 'type', 'ID', 'transcription', 'glosses', 'translation'],
    num_rows: 1606
})

In [11]:
def tokenize_punc(text):
    """Split sentence punctuation off the preceding token.

    Tabs are turned into single spaces, then a space is inserted before every
    ``?``, ``.``, ``!`` and ``,``. Nothing else is changed, and the function
    does not collapse spaces, so punctuation that already follows a space ends
    up preceded by two.

    Args:
        text: The string to tokenize.

    Returns:
        The string with tabs replaced and punctuation separated.
    """
    spaced = text.replace('\t', ' ')
    for mark in ("?", ".", "!", ","):
        spaced = spaced.replace(mark, " " + mark)
    return spaced

tokenize_punc("Oguahẽ ndaje ka’i karai jaguarete róga-pe, o-jerure haguã posáda.")

'Oguahẽ ndaje ka’i karai jaguarete róga-pe , o-jerure haguã posáda .'

In [12]:
def fix_data(row):
    """Normalise one dataset row in place and return it.

    The transcription is passed through ``tokenize_punc`` and the ``source``
    field is set to ``"guarani"``.

    Args:
        row: A mapping with at least a ``"transcription"`` entry.

    Returns:
        The same ``row`` object, after modification.
    """
    spaced_transcription = tokenize_punc(row["transcription"])
    row["transcription"] = spaced_transcription
    row["source"] = "guarani"
    return row

# NOTE(review): this rebinds `guarani_data` to its own mapped result, so running
# the cell a second time tokenizes the punctuation again (adding extra spaces).
guarani_data = guarani_data.map(fix_data)

Map: 100%|██████████| 1606/1606 [00:00<00:00, 20516.60 examples/s]


In [6]:
# Publish the Guarani dataset on its own to the Hub repo "lecslab/guarani".
guarani_data.push_to_hub("lecslab/guarani")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/744 [00:00<?, ?B/s]

In [17]:
import datasets

# run this cell to replace the data on HF
# The current hub copy is always re-downloaded so the merge starts from what is
# actually published.
old_data = datasets.load_dataset("lecslab/glosslm", download_mode='force_redownload')['train']

# Drop any rows already tagged source == "guarani" (the tag `fix_data` sets),
# so that re-running this cell replaces the Guarani rows instead of appending
# a second copy. On the first run nothing matches and the result is unchanged.
old_data_without_guarani = old_data.filter(lambda row: row["source"] != "guarani")

combined = datasets.concatenate_datasets([old_data_without_guarani, guarani_data])

combined.push_to_hub("lecslab/glosslm")

Downloading readme: 100%|██████████| 745/745 [00:00<00:00, 1.69MB/s]
Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading data:   0%|          | 0.00/27.9M [00:00<?, ?B/s][A
Downloading data:  15%|█▌        | 4.19M/27.9M [00:00<00:03, 6.21MB/s][A
Downloading data:  45%|████▌     | 12.6M/27.9M [00:01<00:01, 10.9MB/s][A
Downloading data:  75%|███████▌  | 21.0M/27.9M [00:01<00:00, 12.2MB/s][A
Downloading data: 100%|██████████| 27.9M/27.9M [00:02<00:00, 13.1MB/s][A
Downloading data files: 100%|██████████| 1/1 [00:02<00:00,  2.13s/it]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 343.29it/s]
Generating train split: 100%|██████████| 423414/423414 [00:00<00:00, 2460093.91 examples/s]
Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|          | 0/426 [00:00<?, ?ba/s][A
Creating parquet from Arrow format:  31%|███       | 133/426 [00:00<00:00, 1328.82ba/s][A
Creating parquet from Arrow format:  6

In [19]:
# Sanity check: the combined row count should equal the old rows plus the
# Guarani rows (423414 + 1606 = 425020 in the recorded outputs).
combined

Dataset({
    features: ['ID', 'glottocode', 'transcription', 'glosses', 'translation', 'metalang_glottocode', 'is_segmented', 'source', 'type'],
    num_rows: 425020
})

In [20]:
# Row count of the dataset as downloaded from the Hub, for comparison with `combined`.
old_data

Dataset({
    features: ['ID', 'glottocode', 'transcription', 'glosses', 'translation', 'metalang_glottocode', 'is_segmented', 'source', 'type'],
    num_rows: 423414
})