# Mixtec data prep

Nay San and Antón de la Fuente

This notebook is an analysis of San Martín Duraznos Mixtec recordings in preparation for ASR experiments with the fieldwork collection, compiled by Sandra Auderset and Carmen Hernández Martínez (see ELAR collection: https://www.elararchive.org/dk0629).

## Raw data

The raw data is not publicy available so project collaborators should use `gdown` to download and extract the raw corpus into `data/_raw` using the following commands (replace the `1VD***************************QOJ` with the real file id):

```bash
# Download tar into tmp, Create data/_raw/mixtec, and Extract data (and remove duplicates in Metadata folder)
gdown 1VD***************************QOJ -O tmp/ && \
mkdir -p data/_raw/mixtec && \
tar -xvzf tmp/20230703_mixtec-raw-corpus.tgz -C data/_raw/mixtec && \
rm -r data/_raw/mixtec/corpus/Metadata/
```

## Import data for analyses

In [9]:
from pathlib import Path

#corpus   = Path('data/_raw/mixtec/corpus/')
corpus = Path('/users/anton/PycharmProjects/mixtec_analysis/corpus')
all_wavs = list(corpus.glob('**/*.wav'))

all_wavs[:10]

[PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0106-Cronica/SMD-0106-Cronica.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0046-Pollo/SMD-0046-Pollo.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0102-Navidad/SMD-0102-Navidad.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0101-Pomada/SMD-0101-Pomada.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0065-Verbos/SMD-0065-Verbos.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0029-Colores/SMD-0029-Colores.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0060-Cuenta/SMD-0060-Cuenta.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0087-Casa/SMD-0087-Casa.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0098-Elena/SMD-0098-Elena.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0009-Ja

In [10]:
# Keep only wav files with corresponding eaf files (e.g. A.wav + A.eaf)
wavs_with_eafs = [ p for p in all_wavs if p.with_suffix('.eaf').exists() ]

wavs_with_eafs[:10]

[PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0106-Cronica/SMD-0106-Cronica.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0046-Pollo/SMD-0046-Pollo.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0102-Navidad/SMD-0102-Navidad.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0101-Pomada/SMD-0101-Pomada.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0065-Verbos/SMD-0065-Verbos.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0029-Colores/SMD-0029-Colores.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0060-Cuenta/SMD-0060-Cuenta.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0087-Casa/SMD-0087-Casa.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0098-Elena/SMD-0098-Elena.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0009-Ja

In [11]:
# The folder 'Metadata' contains copies of various files so we
# only want to keep files NOT in that folder for analysis
non_dupe_wavs = [ p for p in wavs_with_eafs if 'Metadata' not in p.parts  ]

non_dupe_wavs[:10]

[PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0106-Cronica/SMD-0106-Cronica.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0046-Pollo/SMD-0046-Pollo.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0102-Navidad/SMD-0102-Navidad.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0101-Pomada/SMD-0101-Pomada.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0065-Verbos/SMD-0065-Verbos.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0029-Colores/SMD-0029-Colores.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0060-Cuenta/SMD-0060-Cuenta.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0087-Casa/SMD-0087-Casa.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0098-Elena/SMD-0098-Elena.wav'),
 PosixPath('/users/anton/PycharmProjects/mixtec_analysis/corpus/SMD-0009-Ja

In [12]:
import re
import pandas as pd
from pympi import Eaf

from helpers.io import ms_to_hms

all_eafs = [ p.with_suffix('.eaf') for p in non_dupe_wavs ]
eaf_data = [ (p, Eaf(p)) for p in all_eafs ]
transcription_regex = re.compile("(Transcription@)[A-Z\d]+")
data_list = list()

for p, annotation in eaf_data:

    # Handle some exceptions
    if "OldVersions" in p.parts:
        # Don't want to process duplicated files
        continue

    tiers_to_process = list(filter(transcription_regex.match, annotation.get_tier_names()))

    if p.name == "SMD-0009-Jardin.eaf":
        # Type in tier name in raw eaf data
        tiers_to_process.append("Tanscription@CHM")

    for tier in tiers_to_process:

        wav_path = p.with_suffix('.wav')
        file_annotations = [(wav_path.parent, wav_path.name, f'{wav_path.stem}_{ms_to_hms(start)}.wav', tier, start, end, text) for start, end, text in annotation.get_annotation_data_for_tier(tier)]
        data_list.extend(file_annotations)

raw_data = pd.DataFrame(data_list, columns=['path', 'file', 'annotation_id', 'tier', 'start_ms', 'end_ms', 'raw_text'])

# Remove exactly overlapping speech across tiers
raw_data = raw_data.drop_duplicates(['file', 'start_ms'])

# Remove annotations with no or little text
raw_data = raw_data[raw_data.raw_text.str.replace("[\s|\t]*", "", regex=True).str.len() > 3]

raw_data

Unnamed: 0,path,file,annotation_id,tier,start_ms,end_ms,raw_text
0,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m00s020.wav,Transcription@PPM,20,4820,"chaa Sandra, chaa Carmen"
1,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m04s820.wav,Transcription@PPM,4820,9730,"koña'a ña Carmen iin, xinteini-nto ikixa-nto ñ..."
2,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m10s050.wav,Transcription@PPM,10050,13730,San Martin Duraznos ka'-nti ji'i-an tu'un stila
3,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m13s730.wav,Transcription@PPM,13730,18230,tu'un te'ei' ka'-nti ji'i-an samatxi nche'e
4,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m18s735.wav,Transcription@PPM,18735,23495,chinteini-nto kixa-nto ñuu yo'o sa'a-nto ña ki...
...,...,...,...,...,...,...,...
12790,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0027-Tablita.wav,SMD-0027-Tablita_00h00m13s815.wav,Transcription@CHM,13815,15375,ìñù jí'in ìvì kíí ùnà
12791,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0027-Tablita.wav,SMD-0027-Tablita_00h00m15s440.wav,Transcription@CHM,15440,17240,ùnà jí'in ùnà kíí xà'ùn iin
12792,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0027-Tablita.wav,SMD-0027-Tablita_00h00m17s775.wav,Transcription@CHM,17775,20855,ntava vìtì lo'o chìì xàà ìntava va yí'ì
12793,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0027-Tablita.wav,SMD-0027-Tablita_00h00m21s280.wav,Transcription@CHM,21280,23110,ntava tiki ìnkana'a


### Show/explore raw character set

Use `CharSetExplorer` to iteratively/interactively explore the text data

In [13]:
from helpers.vocab import CharSetExplorer

raw_charset = CharSetExplorer(raw_data, 'raw_text', 'annotation_id')

  0%|                                                                     | 0/11693 [00:00<?, ?it/s]

Show full data frame with counts of each character

In [14]:
raw_charset.raw_chars_df

Unnamed: 0,id,text,c,h,a,Unnamed: 6,S,n,d,r,...,*,̂,w,\t,¡,`,Q,3,ǎ,Ñ
0,SMD-0106-Cronica_00h00m00s020.wav,"chaa Sandra, chaa Carmen",2,2,7,5,1,2,1,2,...,0,0,0,0,0,0,0,0,0,0
1,SMD-0106-Cronica_00h00m04s820.wav,"koña'a ña Carmen iin, xinteini-nto ikixa-nto ñ...",1,1,8,10,0,6,0,1,...,0,0,0,0,0,0,0,0,0,0
2,SMD-0106-Cronica_00h00m10s050.wav,San Martin Duraznos ka'-nti ji'i-an tu'un stila,0,0,6,7,1,6,0,2,...,0,0,0,0,0,0,0,0,0,0
3,SMD-0106-Cronica_00h00m13s730.wav,tu'un te'ei' ka'-nti ji'i-an samatxi nche'e,1,1,4,6,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SMD-0106-Cronica_00h00m18s735.wav,chinteini-nto kixa-nto ñuu yo'o sa'a-nto ña ki...,1,1,4,9,0,6,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11688,SMD-0027-Tablita_00h00m13s815.wav,ìñù jí'in ìvì kíí ùnà,0,0,0,4,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
11689,SMD-0027-Tablita_00h00m15s440.wav,ùnà jí'in ùnà kíí xà'ùn iin,0,0,0,5,0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
11690,SMD-0027-Tablita_00h00m17s775.wav,ntava vìtì lo'o chìì xàà ìntava va yí'ì,1,1,5,7,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
11691,SMD-0027-Tablita_00h00m21s280.wav,ntava tiki ìnkana'a,0,0,5,2,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0


Show character set with corresponding column number in `raw_chars_df` and total counts

In [15]:
# Without descriptions
raw_charset.print_charset()

There are 106 unique characters in the raw text (Column number of raw_chars_df: Character, Description):

(2: 'c', 5209), (3: 'h', 6456), (4: 'a', 64568), (5: ' ', 74463), (6: 'S', 117), (7: 'n', 41994), (8: 'd', 1509), (9: 'r', 5078), (10: ',', 687), (11: 'C', 43), (12: 'm', 3578), (13: 'e', 7793), (14: 'k', 22372), (15: 'o', 17698), (16: 'ñ', 7073), (17: ''', 17467), (18: 'i', 49366), (19: 'x', 11864), (20: 't', 25520), (21: '-', 13939), (22: 'u', 15335), (23: 's', 10095), (24: 'M', 124), (25: 'D', 30), (26: 'z', 103), (27: 'j', 3103), (28: 'l', 2724), (29: 'y', 4162), (30: 'f', 206), (31: 'v', 10293), (32: 'J', 44), (33: 'p', 879), (34: 'A', 17), (35: 'b', 305), (36: 'G', 37), (37: 'ú', 800), (38: 'T', 35), (39: '(', 94), (40: ')', 94), (41: 'P', 46), (42: 'q', 81), (43: 'g', 470), (44: 'I', 6), (45: 'E', 16), (46: 'B', 9), (47: 'Z', 6), (48: 'F', 5), (49: 'V', 18), (50: '.', 84), (51: 'í', 5596), (52: '¿', 2), (53: '?', 53), (54: 'N', 72), (55: 'ò', 1391), (56: 'ù', 1372), (57: 'ì'

In [16]:
# With descriptions
raw_charset.print_charset(with_descriptions=True)

There are 106 unique characters in the raw text (Column number of raw_chars_df: Character, No. of occurrences):

(2: 'c', LATIN SMALL LETTER C), (3: 'h', LATIN SMALL LETTER H), (4: 'a', LATIN SMALL LETTER A), (5: ' ', SPACE), (6: 'S', LATIN CAPITAL LETTER S), (7: 'n', LATIN SMALL LETTER N), (8: 'd', LATIN SMALL LETTER D), (9: 'r', LATIN SMALL LETTER R), (10: ',', COMMA), (11: 'C', LATIN CAPITAL LETTER C), (12: 'm', LATIN SMALL LETTER M), (13: 'e', LATIN SMALL LETTER E), (14: 'k', LATIN SMALL LETTER K), (15: 'o', LATIN SMALL LETTER O), (16: 'ñ', LATIN SMALL LETTER N WITH TILDE), (17: ''', APOSTROPHE), (18: 'i', LATIN SMALL LETTER I), (19: 'x', LATIN SMALL LETTER X), (20: 't', LATIN SMALL LETTER T), (21: '-', HYPHEN-MINUS), (22: 'u', LATIN SMALL LETTER U), (23: 's', LATIN SMALL LETTER S), (24: 'M', LATIN CAPITAL LETTER M), (25: 'D', LATIN CAPITAL LETTER D), (26: 'z', LATIN SMALL LETTER Z), (27: 'j', LATIN SMALL LETTER J), (28: 'l', LATIN SMALL LETTER L), (29: 'y', LATIN SMALL LETTER Y), 

Show texts with certain characters (use column numbers of `raw_chars_df`):

In [17]:
raw_charset.show_texts_with_chars([63, 107])

Unnamed: 0,id,text,́,Ñ
301,SMD-0046-Pollo_00h00m07s215.wav,taa yó'o-o ntóo-nti ve'e náná-ì ñà kíí...,10,0
302,SMD-0046-Pollo_00h00m10s480.wav,taa vitxi ntikani-án lo'o nùù-ntí ña kíí,4,0
303,SMD-0046-Pollo_00h00m13s035.wav,ña kíí nishi ñà'a káa kùñu ntuxí,4,0
304,SMD-0046-Pollo_00h00m14s900.wav,távà na koan kuntàa-ti ini tatùun kìì ña...,1,0
305,SMD-0046-Pollo_00h00m16s785.wav,kíí míí kùñu-ti míí-ó chii sáá ni yo...,15,0
...,...,...,...,...
11193,SMD-0094-Tino_00h06m34s200.wav,xaa xika-ra skuela ji'in tarea-ra nchikuiva ch...,2,0
11195,SMD-0094-Tino_00h06m44s625.wav,aa tí koo ixinti-ra ku'un-ra nantxo nixii s'a...,1,0
11198,SMD-0094-Tino_00h07m00s721.wav,kua'an ni (saan intixa-i ta) ña'a saan ikisi-n...,2,0
11202,SMD-0094-Tino_00h07m20s847.wav,xixika-i ku'uan va-i ji'in skuélá-ì ni,2,0


## Process texts for ASR

### Try to normalize to meaningful character set 

In [18]:
# Manually enter this part so you can icrementally build up the expected character set

# Space + Consonants + 5 vowels x 3 tones
expected_charset = set(' ' + "'bcdfghjklmnNpqrstvwxyz"). \
    union(set('aA@' + 'eE#' + 'iI®' + 'oO%' + 'uU&'))

print(sorted(list(expected_charset)))

[' ', '#', '%', '&', "'", '@', 'A', 'E', 'I', 'N', 'O', 'U', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '®']


In [29]:
from unicodedata import normalize

removals_regex = [
    r"[.,;:]",        # Remove punctuation
    r'"',             # Remove quotes
    r"[¡!]",          # Remove exlamations
    r"\t+",           # Remove tabs
    r'[$#@/\*&\^\+=]' # Remove other symbols
]

replacements_regex = {
    # Collapse char+modifier unicode sequences into a single unicode char
    'á': 'A', 'é': 'E', 'í': 'I', 'ó': 'O', 'ú': 'U',        # High tone (acute)
    'à': '@', 'è': '#', 'ì': '®', 'ò': '%', 'ù': '&',        # Low tone (grave)
    'â': 'A@', 'ê': '#E', 'î': '®I', 'ô': '%O', 'û': '&U',   # Falling, HL (circumflex)
    'ǎ': '@A', 'ě': 'E#', 'ǐ': 'I®', 'ǒ': 'O%', 'ǔ': 'U&',   # Rising, LH (caron)
    'ñ': 'N',
    # Map affixes and new lines into a space
    '-': ' ',
    '\n': ' '
 }

# Lower case raw_text and put in a new column 'norm_text' column
normed_data = raw_data.assign(norm_text = lambda df: df.raw_text.str.lower())
# NFKC = compose diacritics and characters
normed_data.norm_text = normed_data.norm_text.apply(lambda str: normalize('NFKC', str))

for k, v in [ (r, '') for r in removals_regex ] + list(replacements_regex.items()):
    normed_data.norm_text = normed_data.norm_text.str.replace(k, v, regex=True)

normed_data.norm_text

normed_data[['annotation_id', 'raw_text', 'norm_text']]

Unnamed: 0,annotation_id,raw_text,norm_text
0,SMD-0106-Cronica_00h00m00s020.wav,"chaa Sandra, chaa Carmen",chaa sandra chaa carmen
1,SMD-0106-Cronica_00h00m04s820.wav,"koña'a ña Carmen iin, xinteini-nto ikixa-nto ñ...",koNa'a Na carmen iin xinteini nto ikixa nto Nu...
2,SMD-0106-Cronica_00h00m10s050.wav,San Martin Duraznos ka'-nti ji'i-an tu'un stila,san martin duraznos ka' nti ji'i an tu'un stila
3,SMD-0106-Cronica_00h00m13s730.wav,tu'un te'ei' ka'-nti ji'i-an samatxi nche'e,tu'un te'ei' ka' nti ji'i an samatxi nche'e
4,SMD-0106-Cronica_00h00m18s735.wav,chinteini-nto kixa-nto ñuu yo'o sa'a-nto ña ki...,chinteini nto kixa nto Nuu yo'o sa'a nto Na ki...
...,...,...,...
12790,SMD-0027-Tablita_00h00m13s815.wav,ìñù jí'in ìvì kíí ùnà,®N& jI'in ®v® kII &n@
12791,SMD-0027-Tablita_00h00m15s440.wav,ùnà jí'in ùnà kíí xà'ùn iin,&n@ jI'in &n@ kII x@'&n iin
12792,SMD-0027-Tablita_00h00m17s775.wav,ntava vìtì lo'o chìì xàà ìntava va yí'ì,ntava v®t® lo'o ch®® x@@ ®ntava va yI'®
12793,SMD-0027-Tablita_00h00m21s280.wav,ntava tiki ìnkana'a,ntava tiki ®nkana'a


In [30]:
normed_charset = CharSetExplorer(normed_data, 'norm_text', 'annotation_id')

  0%|                                                                     | 0/11693 [00:00<?, ?it/s]

In [31]:
normed_charset.print_charset(expected_charset=expected_charset)

There are 60 unique characters in the raw text (Column number of raw_chars_df: Character, Description):

(2: 'c', 5252), (3: 'h', 6465), (4: 'a', 62617), (5: ' ', 88670), (6: 's', 10212), (7: 'n', 42028), (8: 'd', 1539), (9: 'r', 5086), (10: 'm', 3702), (11: 'e', 7673), (12: 'k', 22389), (13: 'o', 16856), (14: 'N', 7112), (15: ''', 17467), (16: 'i', 47517), (17: 'x', 11868), (18: 't', 25555), (19: 'u', 15031), (20: 'z', 109), (21: 'j', 3147), (22: 'l', 2748), (23: 'y', 4166), (24: 'f', 211), (25: 'v', 10311), (26: 'p', 925), (27: 'b', 314), (28: 'g', 507), (29: 'U', 892), (30: '(', 94), (31: ')', 94), (32: 'q', 82), (33: 'I', 6469), (34: '¿', 2), (35: '?', 53), (36: '%', 1795), (37: '&', 1591), (38: '®', 6135), (39: 'E', 898), (40: '@', 7596), (41: 'A', 5421), (42: 'O', 2290), (43: '#', 290), (44: '[', 50), (45: ']', 50), (46: '{', 18), (47: '}', 18), (48: '́', 13), (49: '2', 17), (50: '8', 3), (51: '0', 8), (52: '1', 17), (53: '7', 3), (54: '5', 1), (55: '6', 8), (56: '’', 6), (57: '9

In [32]:
normed_charset.show_texts_with_chars([45])

Unnamed: 0,id,text,]
308,SMD-0046-Pollo_00h00m24s880.wav,[saan nishi] Na kII N@ sA'a nI j@@n nA ®?,1
337,SMD-0046-Pollo_00h05m18s930.wav,tat&un xitx®i an yO'o sata [shita sata] y%sO,1
360,SMD-0046-Pollo_00h07m18s650.wav,Nakan[ kue taa tatu ku] kuki rI kuentA,1
423,SMD-0046-Pollo_00h01m40s980.wav,ku@'@n ntxi'i ku#'# N@ yatA yO'o taa Na [taa],1
426,SMD-0046-Pollo_00h01m48s980.wav,taa N@ kII n&& [xin] mII n&& x®nti'i r® n&& Nu...,1
430,SMD-0046-Pollo_00h02m00s976.wav,taa intxi'i taa [ka] yO'o kana xitxi r®,1
477,SMD-0046-Pollo_00h04m34s085.wav,Na j@@n kII [ka] N@ y{O'o k}II k@gu@ r® taa N@...,1
481,SMD-0046-Pollo_00h04m45s605.wav,xitxi Na kA'nu yO'o suu N@ yO'o ukAxI n@ N@ yO...,1
1020,SMD-0091-Diablo_00h00m29s660.wav,skuElA kAa ta &'un kAa ®x%o ve'e tx®NA'A ta kA...,1
2289,SMD-0007-Accidente_00h00m07s431.wav,[Na'a taa k®®] k®® miErkolesh#,1


### Remove utterances that cannot be addressed by normalization process

In [33]:
# Fill this section in iteratively
removal_criteria = {
    # Anything with questions marks and enclosures appear to be where transcribers are unsure of the content
    'one_question': r'\?',
    'multiple_questions': r'\?{2,}',
    'has_enclosures':r'[\(\[\{\)\]\}]',
    # Digits aren't spelled out for ASR (e.g. 33 = thirty three) but there doesn't appear to be too many of them
    'has_digits': r'[\d]',
    # Some accents could not be addresed by the normalization process above
    'has_stray_accents': r'`|´|’| ̀|x̀| ́|Ó|Ó'
}

removal_candidates = dict([ (name, normed_data.norm_text.str.contains(pat=p, regex=True)) for (name, p) in removal_criteria.items() ])
total_removal_cands = 0

for removal_reason, removal_cands in removal_candidates.items():
    num_removal_cands = removal_cands.sum()
    total_removal_cands += num_removal_cands
    print(f"There are {num_removal_cands} rows satisfying removal reason: '{removal_reason}'")

print(f"\nTotal removal candidates: {total_removal_cands} = {total_removal_cands/len(normed_data) * 100:.2f}% of total data.")

There are 47 rows satisfying removal reason: 'one_question'
There are 3 rows satisfying removal reason: 'multiple_questions'
There are 151 rows satisfying removal reason: 'has_enclosures'
There are 24 rows satisfying removal reason: 'has_digits'
There are 25 rows satisfying removal reason: 'has_stray_accents'

Total removal candidates: 250 = 2.14% of total data.


In [34]:
final_data = normed_data[~pd.concat(removal_candidates.values(), axis=1).any(axis=1)]

final_data

Unnamed: 0,path,file,annotation_id,tier,start_ms,end_ms,raw_text,norm_text
0,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m00s020.wav,Transcription@PPM,20,4820,"chaa Sandra, chaa Carmen",chaa sandra chaa carmen
1,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m04s820.wav,Transcription@PPM,4820,9730,"koña'a ña Carmen iin, xinteini-nto ikixa-nto ñ...",koNa'a Na carmen iin xinteini nto ikixa nto Nu...
2,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m10s050.wav,Transcription@PPM,10050,13730,San Martin Duraznos ka'-nti ji'i-an tu'un stila,san martin duraznos ka' nti ji'i an tu'un stila
3,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m13s730.wav,Transcription@PPM,13730,18230,tu'un te'ei' ka'-nti ji'i-an samatxi nche'e,tu'un te'ei' ka' nti ji'i an samatxi nche'e
4,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0106-Cronica.wav,SMD-0106-Cronica_00h00m18s735.wav,Transcription@PPM,18735,23495,chinteini-nto kixa-nto ñuu yo'o sa'a-nto ña ki...,chinteini nto kixa nto Nuu yo'o sa'a nto Na ki...
...,...,...,...,...,...,...,...,...
12790,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0027-Tablita.wav,SMD-0027-Tablita_00h00m13s815.wav,Transcription@CHM,13815,15375,ìñù jí'in ìvì kíí ùnà,®N& jI'in ®v® kII &n@
12791,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0027-Tablita.wav,SMD-0027-Tablita_00h00m15s440.wav,Transcription@CHM,15440,17240,ùnà jí'in ùnà kíí xà'ùn iin,&n@ jI'in &n@ kII x@'&n iin
12792,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0027-Tablita.wav,SMD-0027-Tablita_00h00m17s775.wav,Transcription@CHM,17775,20855,ntava vìtì lo'o chìì xàà ìntava va yí'ì,ntava v®t® lo'o ch®® x@@ ®ntava va yI'®
12793,/users/anton/PycharmProjects/mixtec_analysis/c...,SMD-0027-Tablita.wav,SMD-0027-Tablita_00h00m21s280.wav,Transcription@CHM,21280,23110,ntava tiki ìnkana'a,ntava tiki ®nkana'a


### Processed dataset

Expect unexpected character set to be empty!

In [35]:
final_charset = CharSetExplorer(final_data, 'norm_text', 'annotation_id')

  0%|                                                                     | 0/11448 [00:00<?, ?it/s]

In [36]:
final_charset.print_charset(expected_charset=expected_charset)

There are 39 unique characters in the raw text (Column number of raw_chars_df: Character, Description):

(2: 'c', 5108), (3: 'h', 6314), (4: 'a', 61038), (5: ' ', 86144), (6: 's', 9894), (7: 'n', 40964), (8: 'd', 1482), (9: 'r', 4897), (10: 'm', 3576), (11: 'e', 7361), (12: 'k', 21775), (13: 'o', 16412), (14: 'N', 6894), (15: ''', 17034), (16: 'i', 46429), (17: 'x', 11586), (18: 't', 24868), (19: 'u', 14683), (20: 'z', 103), (21: 'j', 3073), (22: 'l', 2646), (23: 'y', 4055), (24: 'f', 196), (25: 'v', 10111), (26: 'p', 883), (27: 'b', 298), (28: 'g', 491), (29: 'U', 841), (30: 'q', 76), (31: 'I', 6250), (32: '%', 1733), (33: '&', 1510), (34: '®', 5936), (35: 'E', 875), (36: '@', 7316), (37: 'A', 5245), (38: 'O', 2200), (39: '#', 276), (40: 'w', 2)


The following do not appear in the expected charset:



### Export for ASR

#### Extract clips from recordings

In [37]:
import os

ffmpeg_df = final_data.assign(
    wav_path = lambda x: [ f"{p}/{f}" for (p, f) in zip(x.path, x.file) ],
    start_s = lambda x: round(x.start_ms / 1000, 1),
    dur_s   = lambda x: round((x.end_ms - x.start_ms) / 1000, 1),
    clip_id = lambda x: x.annotation_id
)[['wav_path', 'start_s', 'dur_s', 'clip_id']]

assert all([ os.path.exists(p) for p in ffmpeg_df.wav_path ])

ffmpeg_df.to_csv('data/interim/ffmpeg_mixtec-split.csv', header=False, index=False)

ffmpeg_df

Unnamed: 0,wav_path,start_s,dur_s,clip_id
0,/users/anton/PycharmProjects/mixtec_analysis/c...,0.0,4.8,SMD-0106-Cronica_00h00m00s020.wav
1,/users/anton/PycharmProjects/mixtec_analysis/c...,4.8,4.9,SMD-0106-Cronica_00h00m04s820.wav
2,/users/anton/PycharmProjects/mixtec_analysis/c...,10.0,3.7,SMD-0106-Cronica_00h00m10s050.wav
3,/users/anton/PycharmProjects/mixtec_analysis/c...,13.7,4.5,SMD-0106-Cronica_00h00m13s730.wav
4,/users/anton/PycharmProjects/mixtec_analysis/c...,18.7,4.8,SMD-0106-Cronica_00h00m18s735.wav
...,...,...,...,...
12790,/users/anton/PycharmProjects/mixtec_analysis/c...,13.8,1.6,SMD-0027-Tablita_00h00m13s815.wav
12791,/users/anton/PycharmProjects/mixtec_analysis/c...,15.4,1.8,SMD-0027-Tablita_00h00m15s440.wav
12792,/users/anton/PycharmProjects/mixtec_analysis/c...,17.8,3.1,SMD-0027-Tablita_00h00m17s775.wav
12793,/users/anton/PycharmProjects/mixtec_analysis/c...,21.3,1.8,SMD-0027-Tablita_00h00m21s280.wav


Split file using GNU parallel and ffmpeg

```bash
# Create output dir
mkdir -p data/processed/20230818_mixtec

# Loop through each row of ffmpeg_mixtec-split.csv and extract each clip into individual files and write to data/processed/20230818_mixtec/clips/
cat data/interim/ffmpeg_mixtec-split.csv |
    parallel --progress \
    --results tmp/ffmpeg_results_mixtec.csv \
    --colsep ',' \
    ffmpeg -loglevel error -hide_banner -nostats -y -i {1} -ss {2} -t {3} data/processed/20230818_mixtec/clips/{4}
```

In [38]:
# Check none of the ffmpeg jobs failed
all(pd.read_csv("tmp/ffmpeg_results_mixtec.csv").Stderr.isna())

FileNotFoundError: [Errno 2] No such file or directory: 'tmp/ffmpeg_results_mixtec.csv'

In [39]:
# Are there the expected number of clips in data/processed/20230818_mixtec/clips?
!ls data/processed/20230818_mixtec/clips | wc -l

ls: data/processed/20230818_mixtec/clips: No such file or directory
       0


#### Export ASR training manifest

In [40]:
manifest_tsv = final_data.assign(
    path=lambda x: 'clips/' + x.annotation_id,
    text=lambda x: x.norm_text
)[['path', 'text']]

manifest_tsv.to_csv("data/processed/20230818_mixtec/_all.tsv", sep="\t", index=False)

manifest_tsv

FileNotFoundError: [Errno 2] No such file or directory: 'data/processed/20230818_mixtec/_all.tsv'