# Mixtec data prep

Nay San and Antón de la Fuente

This notebook is an analysis of San Martín Duraznos Mixtec recordings in preparation for ASR experiments with the fieldwork collection, compiled by Sandra Auderset and Carmen Hernández Martínez (see ELAR collection: https://www.elararchive.org/dk0629).

## Raw data

The raw data is not publicy available so project collaborators should use `gdown` to download and extract the raw corpus into `data/_raw` using the following commands (replace the `1VD***************************QOJ` with the real file id):

```bash
# Download tar into tmp, Create data/_raw/mixtec, and Extract data
gdown 1VD***************************QOJ -O tmp/ && \
mkdir -p data/_raw/mixtec && \
tar -xvzf tmp/20230703_mixtec-raw-corpus.tgz -C data/_raw/mixtec
```

## Import data for analyses

In [1]:
from pathlib import Path

corpus   = Path('data/_raw/mixtec/corpus/')
all_wavs = list(corpus.glob('**/*.wav'))

all_wavs[:10]

[PosixPath('data/_raw/mixtec/corpus/SMD-0087-Casa/SMD-0087-Casa.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0106-Cronica/SMD-0106-Cronica.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0090-Vesicula/SMD-0090-Vesicula.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0082-Charla/SMD-0082-Charla.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0057-Tierra/SMD-0057-Tierra.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0011-MiHistoria/SMD-0011-MiHistoria.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0006-Camino/SMD-0006-Camino.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0078-Rufina/SMD-0078-Rufina.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0017-Espacial/SMD-0017-Espacial.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0020-Huauzontle/SMD-0020-Huauzontle.wav')]

In [2]:
# Keep only wav files with corresponding eaf files (e.g. A.wav + A.eaf)
wavs_with_eafs = [ p for p in all_wavs if p.with_suffix('.eaf').exists() ]

wavs_with_eafs[:10]

[PosixPath('data/_raw/mixtec/corpus/SMD-0087-Casa/SMD-0087-Casa.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0106-Cronica/SMD-0106-Cronica.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0090-Vesicula/SMD-0090-Vesicula.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0082-Charla/SMD-0082-Charla.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0057-Tierra/SMD-0057-Tierra.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0011-MiHistoria/SMD-0011-MiHistoria.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0006-Camino/SMD-0006-Camino.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0078-Rufina/SMD-0078-Rufina.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0017-Espacial/SMD-0017-Espacial.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0020-Huauzontle/SMD-0020-Huauzontle.wav')]

In [3]:
# The folder 'Metadata' contains copies of various files so we
# only want to keep files NOT in that folder for analysis
non_dupe_wavs = [ p for p in wavs_with_eafs if p.parts[1] != 'Metadata' ]

non_dupe_wavs[:10]

[PosixPath('data/_raw/mixtec/corpus/SMD-0087-Casa/SMD-0087-Casa.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0106-Cronica/SMD-0106-Cronica.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0090-Vesicula/SMD-0090-Vesicula.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0082-Charla/SMD-0082-Charla.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0057-Tierra/SMD-0057-Tierra.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0011-MiHistoria/SMD-0011-MiHistoria.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0006-Camino/SMD-0006-Camino.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0078-Rufina/SMD-0078-Rufina.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0017-Espacial/SMD-0017-Espacial.wav'),
 PosixPath('data/_raw/mixtec/corpus/SMD-0020-Huauzontle/SMD-0020-Huauzontle.wav')]

In [4]:
import re
import pandas as pd
from pympi import Eaf

from helpers.io import ms_to_hms

all_eafs = [ p.with_suffix('.eaf') for p in non_dupe_wavs ]
eaf_data = [ (p, Eaf(p)) for p in all_eafs ]
transcription_regex = re.compile("(Transcription@)[A-Z\d]+")
data_list = list()

for p, annotation in eaf_data:

    for tier in filter(transcription_regex.match, annotation.get_tier_names()):

        wav_path = p.with_suffix('.wav')
        file_annotations = [(wav_path.parent, wav_path.name, f'{wav_path.stem}_{ms_to_hms(start)}.wav', tier, start, end, text) for start, end, text in annotation.get_annotation_data_for_tier(tier)]
        data_list.extend(file_annotations)

raw_data = pd.DataFrame(data_list, columns=['path', 'file', 'annotation_id', 'tier', 'start_ms', 'end_ms', 'raw_text'])

raw_data

Unnamed: 0,path,file,annotation_id,tier,start_ms,end_ms,raw_text
0,data/_raw/mixtec/corpus/SMD-0106-Cronica,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m00s020.wav,Transcription@PPM,20,4820,"chaa Sandra, chaa Carmen"
1,data/_raw/mixtec/corpus/SMD-0106-Cronica,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m04s820.wav,Transcription@PPM,4820,9730,"koña'a ña Carmen iin, xinteini-nto ikixa-nto ñ..."
2,data/_raw/mixtec/corpus/SMD-0106-Cronica,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m10s050.wav,Transcription@PPM,10050,13730,San Martin Duraznos ka'-nti ji'i-an tu'un stila
3,data/_raw/mixtec/corpus/SMD-0106-Cronica,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m13s730.wav,Transcription@PPM,13730,18230,tu'un te'ei' ka'-nti ji'i-an samatxi nche'e
4,data/_raw/mixtec/corpus/SMD-0106-Cronica,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m18s735.wav,Transcription@PPM,18735,23495,chinteini-nto kixa-nto ñuu yo'o sa'a-nto ña ki...
...,...,...,...,...,...,...,...
18431,data/_raw/mixtec/corpus/SMD-0055-Plantas,SMD-0055-Plantas.wav,SMD-0055-Plantas_00h05m34s912.wav,Transcription@CMR,334912,337509,ña'a katuan yo'o tuya'a kii to'o
18432,data/_raw/mixtec/corpus/SMD-0055-Plantas,SMD-0055-Plantas.wav,SMD-0055-Plantas_00h05m37s509.wav,Transcription@CMR,337509,340053,sivi kii ntivi va to'o
18433,data/_raw/mixtec/corpus/SMD-0055-Plantas,SMD-0055-Plantas.wav,SMD-0055-Plantas_00h05m40s064.wav,Transcription@CMR,340064,350064,ta ña yo'o ki yiki ntxikoo ka'ana ji'an siva k...
18434,data/_raw/mixtec/corpus/SMD-0055-Plantas,SMD-0055-Plantas.wav,SMD-0055-Plantas_00h05m50s064.wav,Transcription@CMR,350064,354276,ñakan va chun ña yo'o ñakan sa'a ña yo'o


### Show/explore raw character set

Use `CharSetExplorer` to iteratively/interactively explore the text data

In [19]:
from helpers.vocab import CharSetExplorer

raw_charset = CharSetExplorer(raw_data, 'raw_text', 'annotation_id')

100%|█████████████████████████████████████████████████████| 18436/18436 [00:00<00:00, 433498.65it/s]


Show full data frame with counts of each character

In [6]:
raw_charset.raw_chars_df

Unnamed: 0,id,text,c,h,a,Unnamed: 6,S,n,d,r,...,’,H,R,;,Y,Ñ,Q,\n,=,ǎ
0,SMD-0106-Cronica_00h00m00s020.wav,"chaa Sandra, chaa Carmen",2,2,7,5,1,2,1,2,...,0,0,0,0,0,0,0,0,0,0
1,SMD-0106-Cronica_00h00m04s820.wav,"koña'a ña Carmen iin, xinteini-nto ikixa-nto ñ...",1,1,8,10,0,6,0,1,...,0,0,0,0,0,0,0,0,0,0
2,SMD-0106-Cronica_00h00m10s050.wav,San Martin Duraznos ka'-nti ji'i-an tu'un stila,0,0,6,7,1,6,0,2,...,0,0,0,0,0,0,0,0,0,0
3,SMD-0106-Cronica_00h00m13s730.wav,tu'un te'ei' ka'-nti ji'i-an samatxi nche'e,1,1,4,6,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SMD-0106-Cronica_00h00m18s735.wav,chinteini-nto kixa-nto ñuu yo'o sa'a-nto ña ki...,1,1,4,9,0,6,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18431,SMD-0055-Plantas_00h05m34s912.wav,ña'a katuan yo'o tuya'a kii to'o,0,0,6,5,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
18432,SMD-0055-Plantas_00h05m37s509.wav,sivi kii ntivi va to'o,0,0,1,4,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
18433,SMD-0055-Plantas_00h05m40s064.wav,ta ña yo'o ki yiki ntxikoo ka'ana ji'an siva k...,0,0,19,23,0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
18434,SMD-0055-Plantas_00h05m50s064.wav,ñakan va chun ña yo'o ñakan sa'a ña yo'o,1,1,9,10,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0


Show character set with corresponding column number in `raw_chars_df` and total counts

In [7]:
# Without descriptions
raw_charset.print_charset()

There are 107 unique characters in the raw text (Column number of raw_chars_df: Character, Description):

(2: 'c', 7620), (3: 'h', 9791), (4: 'a', 98162), (5: ' ', 107962), (6: 'S', 150), (7: 'n', 61271), (8: 'd', 2171), (9: 'r', 7138), (10: ',', 1205), (11: 'C', 52), (12: 'm', 5388), (13: 'e', 11467), (14: 'k', 32952), (15: 'o', 26922), (16: 'ñ', 10433), (17: ''', 25341), (18: 'i', 75768), (19: 'x', 17292), (20: 't', 37076), (21: '-', 19608), (22: 'u', 23122), (23: 's', 14737), (24: 'M', 137), (25: 'D', 32), (26: 'z', 146), (27: 'j', 4589), (28: 'l', 4043), (29: 'y', 6085), (30: 'f', 283), (31: 'v', 14972), (32: 'J', 60), (33: 'p', 1220), (34: 'A', 20), (35: 'b', 461), (36: 'G', 42), (37: 'ú', 973), (38: 'T', 44), (39: '(', 110), (40: ')', 110), (41: 'P', 47), (42: 'q', 115), (43: 'g', 623), (44: 'I', 6), (45: 'E', 14), (46: 'B', 10), (47: 'Z', 6), (48: 'F', 5), (49: 'V', 23), (50: '.', 106), (51: 'í', 6286), (52: '¿', 2), (53: '?', 86), (54: 'N', 109), (55: 'ì', 5787), (56: 'ù', 1574

In [8]:
# With descriptions
raw_charset.print_charset(with_descriptions=True)

There are 107 unique characters in the raw text (Column number of raw_chars_df: Character, No. of occurrences):

(2: 'c', LATIN SMALL LETTER C), (3: 'h', LATIN SMALL LETTER H), (4: 'a', LATIN SMALL LETTER A), (5: ' ', SPACE), (6: 'S', LATIN CAPITAL LETTER S), (7: 'n', LATIN SMALL LETTER N), (8: 'd', LATIN SMALL LETTER D), (9: 'r', LATIN SMALL LETTER R), (10: ',', COMMA), (11: 'C', LATIN CAPITAL LETTER C), (12: 'm', LATIN SMALL LETTER M), (13: 'e', LATIN SMALL LETTER E), (14: 'k', LATIN SMALL LETTER K), (15: 'o', LATIN SMALL LETTER O), (16: 'ñ', LATIN SMALL LETTER N WITH TILDE), (17: ''', APOSTROPHE), (18: 'i', LATIN SMALL LETTER I), (19: 'x', LATIN SMALL LETTER X), (20: 't', LATIN SMALL LETTER T), (21: '-', HYPHEN-MINUS), (22: 'u', LATIN SMALL LETTER U), (23: 's', LATIN SMALL LETTER S), (24: 'M', LATIN CAPITAL LETTER M), (25: 'D', LATIN CAPITAL LETTER D), (26: 'z', LATIN SMALL LETTER Z), (27: 'j', LATIN SMALL LETTER J), (28: 'l', LATIN SMALL LETTER L), (29: 'y', LATIN SMALL LETTER Y), 

Show texts with certain characters (use column numbers of `raw_chars_df`):

In [9]:
raw_charset.show_texts_with_chars([63, 108])

Unnamed: 0,id,text,`,ǎ
399,SMD-0090-Vesicula_00h04m31s864.wav,koo sa opera-ò sáá opra-`tava-ná mii kava-vó saan,1,0
3312,SMD-0049-Medicinas1_00h00m12s440.wav,"lo´o cafe, lo`o cafe jì'in lo'o panela kee jì'...",1,0
3313,SMD-0049-Medicinas1_00h00m16s730.wav,kitxian taa ñakan iin vaso lo`o ñakan ko`on,2,0
5905,SMD-0021-Platica_00h06m12s945.wav,ùkivi ña kù'ùn-n`s chó'o chikáa,1,0
6797,SMD-0088-Matilde_00h22m53s182.wav,tíí ìkeva'a di-ó ta va'a ta `íí koo ìkeva'-ó s...,1,0
8216,SMD-0089-SuplenteB_00h18m21s681.wav,àján `jaán àjàn,1,0
8483,SMD-0089-SuplenteB_00h01m04s470.wav,ñà kivi ka`-ò jí'in-na ña án kíí kiì va saá-na...,1,0
9519,SMD-0077-Verduras_00h03m15s073.wav,ajaìn ña ìxà'a-`í nù-ì ña níi-ní ñà níi-ní ìki...,1,0
9909,SMD-0077-Verduras_00h20m21s046.wav,ntxè'é-`í chii xàà ìkixà-rì ñà ko'o-rí,1,0
13786,SMD-0049-Medicinas1_00h00m12s440.wav,"lo´o cafe, lo`o cafe jì'in lo'o panela kee jì'...",1,0


## Process texts for ASR

### Try to normalize to meaningful character set 

In [10]:
# Manually enter this part so you can icrementally build up the expected character set

# Space + Consonants + 5 vowels x 5 tones
expected_charset = set(' ' + "'bcdfghjklmnNpqrstvwxyz"). \
    union(set('aA@ç§' + 'eE#€¶' + 'iI®+©' + 'oO%ø>' + 'uU&¬<'))

print(sorted(list(expected_charset)))

[' ', '#', '%', '&', "'", '+', '<', '>', '@', 'A', 'E', 'I', 'N', 'O', 'U', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '§', '©', '¬', '®', '¶', 'ç', 'ø', '€']


In [11]:
from unicodedata import normalize
from functools import reduce

removals_regex = [
    r"[.,;:]",        # Remove punctuation
    r'"',             # Remove quotes
    r"[¡!]",          # Remove exlamations
    r"\t+",           # Remove tabs
    r'[$#@/\*&\^\+=]' # Remove other symbols
]

replacements_regex = {
    # Collapse char+modifier unicode sequences into a single unicode char
    'á': 'A', 'é': 'E', 'í': 'I', 'ó': 'O', 'ú': 'U',
    'à': '@', 'è': '#', 'ì': '®', 'ò': '%', 'ù': '&',
    'â': 'ç', 'ê': '€', 'î': '+', 'ô': 'ø', 'û': '¬',
    'ǎ': '§', 'ě': '¶', 'ǐ': '©', 'ǒ': '>', 'ǔ': '<',
    'ñ': 'N',
    # Map affixes and new lines into a space
    '-': ' ',
    '\n': ' '
 }

# Lower case raw_text and put in a new column 'norm_text' column
normed_data = raw_data.assign(norm_text = lambda df: df.raw_text.str.lower())
# NFKC = compose diacritics and characters
normed_data.norm_text = normed_data.norm_text.apply(lambda str: normalize('NFKC', str))

for k, v in [ (r, '') for r in removals_regex ] + list(replacements_regex.items()):
    normed_data.norm_text = normed_data.norm_text.str.replace(k, v, regex=True)

normed_data.norm_text

normed_data[['annotation_id', 'raw_text', 'norm_text']]

Unnamed: 0,annotation_id,raw_text,norm_text
0,SMD-0106-Cronica_00h00m00s020.wav,"chaa Sandra, chaa Carmen",chaa sandra chaa carmen
1,SMD-0106-Cronica_00h00m04s820.wav,"koña'a ña Carmen iin, xinteini-nto ikixa-nto ñ...",koNa'a Na carmen iin xinteini nto ikixa nto Nu...
2,SMD-0106-Cronica_00h00m10s050.wav,San Martin Duraznos ka'-nti ji'i-an tu'un stila,san martin duraznos ka' nti ji'i an tu'un stila
3,SMD-0106-Cronica_00h00m13s730.wav,tu'un te'ei' ka'-nti ji'i-an samatxi nche'e,tu'un te'ei' ka' nti ji'i an samatxi nche'e
4,SMD-0106-Cronica_00h00m18s735.wav,chinteini-nto kixa-nto ñuu yo'o sa'a-nto ña ki...,chinteini nto kixa nto Nuu yo'o sa'a nto Na ki...
...,...,...,...
18431,SMD-0055-Plantas_00h05m34s912.wav,ña'a katuan yo'o tuya'a kii to'o,Na'a katuan yo'o tuya'a kii to'o
18432,SMD-0055-Plantas_00h05m37s509.wav,sivi kii ntivi va to'o,sivi kii ntivi va to'o
18433,SMD-0055-Plantas_00h05m40s064.wav,ta ña yo'o ki yiki ntxikoo ka'ana ji'an siva k...,ta Na yo'o ki yiki ntxikoo ka'ana ji'an siva k...
18434,SMD-0055-Plantas_00h05m50s064.wav,ñakan va chun ña yo'o ñakan sa'a ña yo'o,Nakan va chun Na yo'o Nakan sa'a Na yo'o


In [12]:
normed_charset = CharSetExplorer(normed_data, 'norm_text', 'annotation_id')

100%|█████████████████████████████████████████████████████| 18436/18436 [00:00<00:00, 469825.25it/s]


In [13]:
normed_charset.print_charset(expected_charset=expected_charset)

There are 69 unique characters in the raw text (Column number of raw_chars_df: Character, Description):

(2: 'c', 7672), (3: 'h', 9799), (4: 'a', 94921), (5: ' ', 128118), (6: 's', 14887), (7: 'n', 61317), (8: 'd', 2203), (9: 'r', 7148), (10: 'm', 5525), (11: 'e', 11250), (12: 'k', 32976), (13: 'o', 25665), (14: 'N', 10497), (15: ''', 25341), (16: 'i', 72748), (17: 'x', 17299), (18: 't', 37120), (19: 'u', 22575), (20: 'z', 152), (21: 'j', 4649), (22: 'l', 4063), (23: 'y', 6088), (24: 'f', 288), (25: 'v', 14995), (26: 'p', 1267), (27: 'b', 471), (28: 'g', 665), (29: 'U', 1140), (30: '(', 110), (31: ')', 110), (32: 'q', 116), (33: 'I', 7740), (34: '¿', 2), (35: '?', 86), (36: '®', 7341), (37: '&', 1955), (38: 'E', 1029), (39: 'A', 6432), (40: '@', 9166), (41: '#', 385), (42: 'O', 2769), (43: '%', 2134), (44: '`', 14), (45: '́', 16), (46: '[', 70), (47: ']', 70), (48: '+', 13), (49: '1', 47), (50: '2', 34), (51: '3', 20), (52: '4', 25), (53: '5', 19), (54: '6', 30), (55: '7', 13), (56: '8

In [14]:
normed_charset.show_texts_with_chars([45])

Unnamed: 0,id,text,́
586,SMD-0082-Charla_00h01m39s914.wav,ta yoo vuelta inka nuu va tu'un ka'an Na kaa t...,1
3312,SMD-0049-Medicinas1_00h00m12s440.wav,lo ́o cafe lo`o cafe j®'in lo'o panela kee j®'...,1
3318,SMD-0049-Medicinas1_00h00m26s720.wav,ita peric%n Na ita li®co kachi% j®'an ita l®i...,1
3728,SMD-0046-Pollo_00h08m55s390.wav,nishi kuni mi O koo r® an txi kasun kII r ́I a...,1
7251,SMD-0088-Matilde_00h04m58s829.wav,va'a ta na ku'un va ® na ntxo'o ke ® jI'i na v...,1
8148,SMD-0022-Salsa_00h00m47s820.wav,ta saka nÓ N@,1
8550,SMD-0089-SuplenteB_00h08m16s105.wav,ko'o kotonini'ni O n&& na ntoo ch ́kAn va ka'a...,1
8715,SMD-0089-SuplenteB_00h23m57s737.wav,tA ́@ n pablo ikAn chii yoo kii ntak@@ mII @n kaa,1
9839,SMD-0077-Verduras_00h18m02s844.wav,k%O sOs% n ́nt® itxa Na v#e x®'®,1
10873,SMD-0091-Diablo_00h01m23s560.wav,tAa ®k®'vi @n ta ini mIi ́y@vi kAan tat&un Iin...,1


### Remove utterances that cannot be addressed by normalization process

In [15]:
# Fill this section in iteratively
removal_criteria = {
    # Anything with questions marks and enclosures appear to be where transcribers are unsure of the content
    'one_question': r'\?',
    'multiple_questions': r'\?{2,}',
    'has_enclosures':r'[\(\[\{\)\]\}]',
    # Digits aren't spelled out for ASR (e.g. 33 = thirty three) but there doesn't appear to be too many of them
    'has_digits': r'[\d]',
    # Some accents could not be addresed by the normalization process above
    'has_stray_accents': r'`|´|’| ̀|x̀| ́|Ó'
}

removal_candidates = dict([ (name, normed_data.norm_text.str.contains(pat=p, regex=True)) for (name, p) in removal_criteria.items() ])
total_removal_cands = 0

for removal_reason, removal_cands in removal_candidates.items():
    num_removal_cands = removal_cands.sum()
    total_removal_cands += num_removal_cands
    print(f"There are {num_removal_cands} rows satisfying removal reason: '{removal_reason}'")

print(f"\nTotal removal candidates: {total_removal_cands} = {total_removal_cands/len(normed_data) * 100:.2f}% of total data.")

There are 74 rows satisfying removal reason: 'one_question'
There are 7 rows satisfying removal reason: 'multiple_questions'
There are 197 rows satisfying removal reason: 'has_enclosures'
There are 109 rows satisfying removal reason: 'has_digits'
There are 32 rows satisfying removal reason: 'has_stray_accents'

Total removal candidates: 419 = 2.27% of total data.


In [16]:
final_data = normed_data[~pd.concat(removal_candidates.values(), axis=1).any(axis=1)]

final_data

Unnamed: 0,path,file,annotation_id,tier,start_ms,end_ms,raw_text,norm_text
0,data/_raw/mixtec/corpus/SMD-0106-Cronica,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m00s020.wav,Transcription@PPM,20,4820,"chaa Sandra, chaa Carmen",chaa sandra chaa carmen
1,data/_raw/mixtec/corpus/SMD-0106-Cronica,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m04s820.wav,Transcription@PPM,4820,9730,"koña'a ña Carmen iin, xinteini-nto ikixa-nto ñ...",koNa'a Na carmen iin xinteini nto ikixa nto Nu...
2,data/_raw/mixtec/corpus/SMD-0106-Cronica,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m10s050.wav,Transcription@PPM,10050,13730,San Martin Duraznos ka'-nti ji'i-an tu'un stila,san martin duraznos ka' nti ji'i an tu'un stila
3,data/_raw/mixtec/corpus/SMD-0106-Cronica,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m13s730.wav,Transcription@PPM,13730,18230,tu'un te'ei' ka'-nti ji'i-an samatxi nche'e,tu'un te'ei' ka' nti ji'i an samatxi nche'e
4,data/_raw/mixtec/corpus/SMD-0106-Cronica,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m18s735.wav,Transcription@PPM,18735,23495,chinteini-nto kixa-nto ñuu yo'o sa'a-nto ña ki...,chinteini nto kixa nto Nuu yo'o sa'a nto Na ki...
...,...,...,...,...,...,...,...,...
18431,data/_raw/mixtec/corpus/SMD-0055-Plantas,SMD-0055-Plantas.wav,SMD-0055-Plantas_00h05m34s912.wav,Transcription@CMR,334912,337509,ña'a katuan yo'o tuya'a kii to'o,Na'a katuan yo'o tuya'a kii to'o
18432,data/_raw/mixtec/corpus/SMD-0055-Plantas,SMD-0055-Plantas.wav,SMD-0055-Plantas_00h05m37s509.wav,Transcription@CMR,337509,340053,sivi kii ntivi va to'o,sivi kii ntivi va to'o
18433,data/_raw/mixtec/corpus/SMD-0055-Plantas,SMD-0055-Plantas.wav,SMD-0055-Plantas_00h05m40s064.wav,Transcription@CMR,340064,350064,ta ña yo'o ki yiki ntxikoo ka'ana ji'an siva k...,ta Na yo'o ki yiki ntxikoo ka'ana ji'an siva k...
18434,data/_raw/mixtec/corpus/SMD-0055-Plantas,SMD-0055-Plantas.wav,SMD-0055-Plantas_00h05m50s064.wav,Transcription@CMR,350064,354276,ñakan va chun ña yo'o ñakan sa'a ña yo'o,Nakan va chun Na yo'o Nakan sa'a Na yo'o


### Processed dataset

Expect unexpected character set to be empty!

In [17]:
final_charset = CharSetExplorer(final_data, 'norm_text', 'annotation_id')

100%|█████████████████████████████████████████████████████| 18027/18027 [00:00<00:00, 209758.25it/s]


In [18]:
final_charset.print_charset(expected_charset=expected_charset)

There are 47 unique characters in the raw text (Column number of raw_chars_df: Character, Description):

(2: 'c', 7487), (3: 'h', 9591), (4: 'a', 92917), (5: ' ', 124867), (6: 's', 14477), (7: 'n', 59913), (8: 'd', 2146), (9: 'r', 6912), (10: 'm', 5363), (11: 'e', 10894), (12: 'k', 32169), (13: 'o', 25102), (14: 'N', 10194), (15: ''', 24765), (16: 'i', 71327), (17: 'x', 16898), (18: 't', 36219), (19: 'u', 22127), (20: 'z', 146), (21: 'j', 4548), (22: 'l', 3942), (23: 'y', 5943), (24: 'f', 270), (25: 'v', 14732), (26: 'p', 1220), (27: 'b', 453), (28: 'g', 646), (29: 'U', 1063), (30: 'q', 107), (31: 'I', 7424), (32: '®', 7050), (33: '&', 1850), (34: 'E', 996), (35: 'A', 6190), (36: '@', 8755), (37: '#', 364), (38: 'O', 2628), (39: '%', 2049), (40: '+', 13), (41: '§', 14), (42: 'w', 5), (43: '©', 6), (44: 'ç', 15), (45: '<', 3), (46: '€', 2), (47: 'ø', 10), (48: '>', 3)


The following do not appear in the expected charset:

