In [19]:
import pandas as pd
import shutil
import glob
import os
import tgt
from pathlib import Path

In [2]:
# Per-utterance statistics of the test set, stored as a tab-separated table.
test_utt_tsv = 'out/results/test_file_stats.tsv'
test_df = pd.read_table(test_utt_tsv, sep='\t')

In [3]:
# One speaker id per line; surrounding whitespace (incl. the newline) is stripped.
with open('data/map/spkr_male.txt', 'r') as f:
    male_spkrs = [raw_line.strip() for raw_line in f]

with open('data/map/spkr_female.txt', 'r') as f2:
    female_spkrs = [raw_line.strip() for raw_line in f2]

In [4]:
# Map every utterance id to its language label (a later duplicate id overrides an earlier one).
utt_to_lang_dict = test_df.set_index('utt_id')['lang'].to_dict()

In [5]:
# Inspect the utt_id -> language label mapping (rendered as the cell output).
utt_to_lang_dict

{'0001_DoReCo_doreco_urum1249_UUM-TXT-AN-00000-A01': 'urum',
 '0001_DoReCo_doreco_urum1249_UUM-TXT-AN-00000-A02': 'urum',
 '0001_DoReCo_doreco_urum1249_UUM-TXT-AN-00000-A03': 'cs',
 '0001_DoReCo_doreco_urum1249_UUM-TXT-AN-00000-B08': 'urum',
 '0001_DoReCo_doreco_urum1249_UUM-TXT-AN-00000-B13': 'urum',
 '0001_DoReCo_doreco_urum1249_UUM-TXT-AN-00000-B16': 'russ',
 '0001_DoReCo_doreco_urum1249_UUM-TXT-CL-00000-A03': 'urum',
 '0001_DoReCo_doreco_urum1249_UUM-TXT-CL-00000-A04': 'urum',
 '0001_DoReCo_doreco_urum1249_UUM-TXT-CL-00000-A05': 'urum',
 '0001_DoReCo_doreco_urum1249_UUM-TXT-CL-00000-A07': 'cs',
 '0001_DoReCo_doreco_urum1249_UUM-TXT-CL-00000-A10': 'urum',
 '0001_DoReCo_doreco_urum1249_UUM-TXT-FE-00000-A01': 'cs',
 '0001_DoReCo_doreco_urum1249_UUM-TXT-FE-00000-A02': 'cs',
 '0001_DoReCo_doreco_urum1249_UUM-TXT-FE-00000-A03': 'cs',
 '0001_DoReCo_doreco_urum1249_UUM-TXT-FM-00000-A01': 'cs',
 '0001_DoReCo_doreco_urum1249_UUM-TXT-FM-00000-A02': 'urum',
 '0001_DoReCo_doreco_urum1249_UUM-TX

In [24]:
# create new dirs and copy relevant files
def create_dirs_copy(out_dir, tg_dir, remove_phones=False):
    """Sort TextGrids from ``tg_dir`` into ``<out_dir>/<lang>/<sex>/`` folders.

    A ``*.TextGrid`` file is considered only if its basename (the part before
    the first ``.``) is a key of the module-level ``utt_to_lang_dict``.
    Utterances labelled ``'russ'`` are skipped. The speaker id is the text
    after the last ``-`` of the utterance id (e.g. ``A03``) and is looked up in
    the module-level ``male_spkrs`` / ``female_spkrs`` lists; files of speakers
    found in neither list are reported and skipped.

    NOTE(review): only the ``urum`` and ``cs`` target folders are created up
    front; any other language label would point at a folder that does not
    exist.

    Args:
        out_dir: Root folder that receives the ``urum/{f,m}`` and ``cs/{f,m}``
            sub-folders (created if missing).
        tg_dir: Folder that is searched (non-recursively) for ``*.TextGrid``.
        remove_phones: If False, every selected file is copied unchanged.
            If True, the file is parsed with ``tgt`` (UTF-16 first, UTF-8 as
            fallback), annotations of the ``phones`` tier lying between the
            start and end of any ``words`` interval whose text contains ``_``
            are deleted, and the result is written to the target folder.

    Returns:
        None. Progress problems are reported with ``print``.
    """
    # separate gold utts into separate folders based on male/female spkr & Urum/CS
    sub_dirs = [f'{out_dir}/urum/f/', f'{out_dir}/urum/m/',
                f'{out_dir}/cs/f/', f'{out_dir}/cs/m/']
    for sub_dir in sub_dirs:
        Path(sub_dir).mkdir(parents=True, exist_ok=True)

    # sorted() makes the processing (and log) order reproducible across runs
    for gold_file in sorted(glob.glob(f'{tg_dir}/*.TextGrid')):
        gold_utt = os.path.basename(gold_file).split('.')[0]
        if gold_utt not in utt_to_lang_dict:
            continue

        gold_lang = utt_to_lang_dict[gold_utt]
        if gold_lang == 'russ':
            continue

        gold_spkr = gold_utt.split('-')[-1]  # e.g. A03
        if gold_spkr in male_spkrs:
            sex = 'm'
        elif gold_spkr in female_spkrs:
            sex = 'f'
        else:
            # Skip the file: falling through would either raise NameError
            # (first file) or reuse the previous file's `sex` and silently
            # put this utterance into the wrong folder.
            print(f'ERROR: spkr ({gold_spkr}) sex not categorized')
            continue

        copy_to_dir = f'{out_dir}/{gold_lang}/{sex}'
        if not remove_phones:
            shutil.copy(gold_file, copy_to_dir)
            continue

        # Try the encodings in order; keep the first one that parses.
        tgt_grid = None
        for encoding in ('utf-16', 'utf-8'):
            try:
                tgt_grid = tgt.io.read_textgrid(gold_file, encoding=encoding)
                break
            except Exception:  # not a bare except: Ctrl-C must still interrupt
                continue
        if tgt_grid is None:
            print(f'** SKIP (encoding issues): {gold_utt}')
            continue

        # remove phones for which their words contain _ ex. "fs_elädır"
        word_tier = tgt_grid.get_tier_by_name('words')
        phone_tier = tgt_grid.get_tier_by_name('phones')
        for word in word_tier:
            if '_' in word.text:  # either foreign (starts with _) or other
                phone_tier.delete_annotations_between_timepoints(word.start_time,
                                                                 word.end_time,
                                                                 left_overlap=False,
                                                                 right_overlap=False)

        out_file = f'{copy_to_dir}/{gold_utt}.TextGrid'
        tgt.io.write_to_file(tgt_grid, out_file)

In [None]:
# (1) gold TextGrids -- copied as-is (phones are not removed)
out_dir = 'data/case_study/gold_test'
textgrid_dir = 'data/gold_utts'
create_dirs_copy(out_dir=out_dir, tg_dir=textgrid_dir, remove_phones=False)

In [25]:
# (2) model with the best precision (2nd best accuracy)
out_dir = 'data/case_study/russmfa-all-urum-phones'
textgrid_dir = 'out/tg/russmfa-all-urum-phones-orig'
create_dirs_copy(out_dir=out_dir, tg_dir=textgrid_dir, remove_phones=True)

In [30]:
# (3) worst-performing model
out_dir = 'data/case_study/cs-urum-phones3'
textgrid_dir = 'out/tg/cs-urum-phones3'
create_dirs_copy(out_dir=out_dir, tg_dir=textgrid_dir, remove_phones=True)

In [31]:
# (4) best English-MFA model (all-urum)
out_dir = 'data/case_study/engmfa-all-urum-phones-orig'
textgrid_dir = 'out/tg/engmfa-all-urum-phones-orig'
create_dirs_copy(out_dir=out_dir, tg_dir=textgrid_dir, remove_phones=True)

In [32]:
# (5) best train-from-scratch model (all-urum)
out_dir = 'data/case_study/all-urum-phones3'
textgrid_dir = 'out/tg/all-urum-phones3'
create_dirs_copy(out_dir=out_dir, tg_dir=textgrid_dir, remove_phones=True)

In [28]:
# X-SAMPA symbols to look up (presumably the Russian phone inventory -- confirm).
russ_xs_list = ['a', 'b', 'd', 'e', 'f', 'g', 'x', 'i', 'z`', 'k', '5', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'j', 'z']
xsampa_map_file = 'data/map/cs-xsampa-to-ipa.map'
xsipa_dict = {}

# Tab-separated map file: first field is the key (X-SAMPA), second the value (IPA).
# The values are non-ASCII, so the encoding is fixed instead of platform-dependent.
with open(xsampa_map_file, 'r', encoding='utf-8') as f:
    for line in f:
        items = line.strip().split('\t')
        # ''.split('\t') is [''] (truthy), so a plain `if items:` lets blank or
        # one-field lines through and items[1] raises IndexError; check the count.
        if len(items) >= 2:
            xsipa_dict[items[0]] = items[1]

# Print the IPA counterpart of every listed symbol, one per line
# (a symbol missing from the map raises KeyError before anything is printed).
for ipa in [xsipa_dict[russ_xs] for russ_xs in russ_xs_list]:
    print(ipa)

a
b
d
e
f
ɡ
x
i
ʐ
k
ɫ
m
n
o
p
r
s
t
u
v
j
z
