In [13]:
import os
import pandas as pd
import re
import shutil
from audiolabel import read_label


# Identify the name of the .TextGrid file (manual)
fname = "05OCT2013_OttoGwadoAyoker_DengsFish_10SEPT21.TextGrid"
storyname = "DengsFish"

# Read the four annotation tiers of the .TextGrid into temporary tables
[phon, gloss, orth, trans] = read_label(fname,
                                        ftype='praat',
                                        tiers=['phon', 'gloss', 'orth', 'trans'])


# Scratch folder that holds the intermediate per-tier .csv files.
# exist_ok=True makes the cell safe to re-run without a separate
# check-then-create step.
pname = storyname + "_scratch"
os.makedirs(pname, exist_ok=True)


def _roundtrip_tier(tier_table, tier_name):
    """Write one tier table to the scratch folder and load it back.

    The table is saved as ``<pname>/<tier_name>.csv`` (without the index),
    re-read with ``pd.read_csv``, its ``fname`` column is dropped, and the
    ``t1``/``t2`` columns are renamed to ``t1_<tier_name>``/``t2_<tier_name>``
    so that the four tiers can later be merged without column clashes.

    NOTE(review): the CSV round trip is not a no-op -- pandas reads empty
    fields back as NaN, which is presumably what lets the later ``dropna``
    discard unlabelled intervals. Confirm before replacing it with a direct
    use of the tables returned by ``read_label``.
    """
    csv_path = pname + '/' + tier_name + '.csv'
    tier_table.to_csv(csv_path, index=False)
    tier_df = pd.read_csv(csv_path)
    return tier_df.drop('fname', axis=1).rename(
        {'t1': 't1_' + tier_name, 't2': 't2_' + tier_name},
        axis='columns',
    )


# One DataFrame per tier, with tier-specific time column names
phondf = _roundtrip_tier(phon, 'phon')
glossdf = _roundtrip_tier(gloss, 'gloss')
orthdf = _roundtrip_tier(orth, 'orth')
transdf = _roundtrip_tier(trans, 'trans')

In [14]:
# Attach the gloss, orth and trans tiers to every phon interval -> pgot.
# Each step is an as-of join keyed on the phon start time: with the
# merge_asof defaults, a phon row is paired with the last row of the other
# tier whose start time is at or before the phon start time.
def _attach_tier(left, tier_df, tier_start):
    """As-of join ``tier_df`` onto ``left`` using ``t1_phon`` as the left key."""
    return pd.merge_asof(left, tier_df, left_on='t1_phon', right_on=tier_start)


pg = _attach_tier(phondf, glossdf, 't1_gloss')
pgo = _attach_tier(pg, orthdf, 't1_orth')
pgot = _attach_tier(pgo, transdf, 't1_trans')

#pgot.head(20)

In [15]:
# Keep only the rows of pgot in which all four tier labels are present;
# a row is discarded as soon as any one of them is NaN.
pgot_dropped = pgot.dropna(
    axis="index",
    how="any",
    subset=["phon", "gloss", "orth", "trans"],
)

#pgot_dropped.head(20)

In [16]:
# Standardize IPA symbols

# Substitution table: each row pairs a "regex" pattern with its "ipa" replacement
ipa_dict = pd.read_csv("ipa_dict.csv")
columns = ["phon", "gloss", "orth", "trans"]

# Work on an independent copy so the column assignments below do not
# trigger pandas warnings about modifying a derived DataFrame
pgot_dropped = pgot_dropped.copy()

# Materialise the (pattern, replacement) pairs once, in table order
ipa_rules = list(zip(ipa_dict["regex"], ipa_dict["ipa"]))


def _standardize(text):
    """Apply every substitution rule to ``text``, in table order."""
    for pattern, replacement in ipa_rules:
        text = re.sub(pattern, replacement, text)
    return text


# NOTE(review): the rules are applied to the gloss, orth and trans columns
# as well as to phon -- confirm that this is intended.
for col in columns:
    pgot_dropped[col] = pgot_dropped[col].apply(_standardize)


pgot_dropped.head(20)

Unnamed: 0,t1_phon,t2_phon,phon,t1_gloss,t2_gloss,gloss,t1_orth,t2_orth,orth,t1_trans,t2_trans,trans
1,4.561792,5.130023,ácàaarɔ̀,4.561792,5.130023,Acaar,4.561792,8.019766,"Acaarɔ paa yaa Deng, ba pac me dwøng o ba pac ...",4.561792,8.019766,"Acaar, the village of Deng, is a big village a..."
2,5.130023,5.335282,pǎaa,5.130023,5.335282,village:POSD.PL,4.561792,8.019766,"Acaarɔ paa yaa Deng, ba pac me dwøng o ba pac ...",4.561792,8.019766,"Acaar, the village of Deng, is a big village a..."
3,5.335282,5.520483,jấā,5.335282,5.520483,people:CS,4.561792,8.019766,"Acaarɔ paa yaa Deng, ba pac me dwøng o ba pac ...",4.561792,8.019766,"Acaar, the village of Deng, is a big village a..."
4,5.520483,6.042057,dɛ̌ɛɛŋ,5.520483,6.042057,Deng:ASSOC,4.561792,8.019766,"Acaarɔ paa yaa Deng, ba pac me dwøng o ba pac ...",4.561792,8.019766,"Acaar, the village of Deng, is a big village a..."
5,6.042057,6.223871,bǎa,6.042057,6.223871,NPRED,4.561792,8.019766,"Acaarɔ paa yaa Deng, ba pac me dwøng o ba pac ...",4.561792,8.019766,"Acaar, the village of Deng, is a big village a..."
6,6.223871,6.448524,pâac,6.223871,6.448524,village,4.561792,8.019766,"Acaarɔ paa yaa Deng, ba pac me dwøng o ba pac ...",4.561792,8.019766,"Acaar, the village of Deng, is a big village a..."
7,6.448524,6.855015,mɛ́-dwɔ̂́ɔŋ,6.448524,6.855015,ATTR:SG-big,4.561792,8.019766,"Acaarɔ paa yaa Deng, ba pac me dwøng o ba pac ...",4.561792,8.019766,"Acaar, the village of Deng, is a big village a..."
8,6.855015,6.916646,ʊ̀,6.855015,6.916646,CONJ,4.561792,8.019766,"Acaarɔ paa yaa Deng, ba pac me dwøng o ba pac ...",4.561792,8.019766,"Acaar, the village of Deng, is a big village a..."
9,6.916646,7.085922,bǎa,6.916646,7.085922,NPRED,4.561792,8.019766,"Acaarɔ paa yaa Deng, ba pac me dwøng o ba pac ...",4.561792,8.019766,"Acaar, the village of Deng, is a big village a..."
10,7.085922,7.295949,pâac,7.085922,7.295949,village:SG,4.561792,8.019766,"Acaarɔ paa yaa Deng, ba pac me dwøng o ba pac ...",4.561792,8.019766,"Acaar, the village of Deng, is a big village a..."


In [17]:
# Write the cleaned word-level table to "<storyname>.csv" in the working
# directory (the DataFrame index is written as the first column)
csv_path = storyname + ".csv"
pgot_dropped.to_csv(csv_path)

In [18]:
# Delete the scratch folder (and the per-tier .csv files inside it).
# The check goes through the `os` module imported at the top of the notebook;
# `shutil.os` only resolves because shutil happens to import os internally.
if os.path.exists(pname):
    shutil.rmtree(pname)

In [7]:
# Phase II

# Concatenate phons, glosses, and orths for each trans.
#
# A sentence is identified by its translation *interval* (t1_trans together
# with the trans text), not by the text alone: two different sentences that
# happen to share the same translation must stay separate rows instead of
# being merged into one group.
sentence_key = ['t1_trans', 'trans']


def _join_per_sentence(frame, column):
    """Space-join ``column`` within each sentence of ``frame``.

    Sentences keep their order of first appearance (``sort=False``). The
    result has a fresh 0..n-1 index and exactly two columns, ``trans`` and
    ``column``.
    """
    joined = frame.groupby(sentence_key, sort=False)[column].agg(' '.join)
    return joined.reset_index()[['trans', column]]


con_phon = _join_per_sentence(pgot_dropped, 'phon')
con_gloss = _join_per_sentence(pgot_dropped, 'gloss')

# The orth label is repeated on every word row of its interval, so keep one
# row per (translation interval, orth interval) before joining. Keying on the
# interval start times rather than on the text keeps repeated identical orth
# labels that come from different intervals.
unique_orth_df = pgot_dropped[sentence_key + ['t1_orth', 'orth']].drop_duplicates(
    subset=['t1_trans', 't1_orth']
)
con_orth = _join_per_sentence(unique_orth_df, 'orth')

In [10]:
# Building .json for the story
import json

# One JSON row per sentence; con_phon, con_gloss and con_orth are read
# at the same row label i
num = con_phon.shape[0]


def _sentence_row(i):
    """Build the JSON row for sentence ``i``.

    The id is ``<storyname>_<i>``; the key is the story name followed by the
    id twice; the value carries the story name and the four sentence fields.
    """
    sentence_id = storyname + '_' + str(i)
    return {
        'id': sentence_id,
        'key': [storyname, sentence_id, sentence_id],
        'value': {
            'story': storyname,
            'sentence': {
                'utterance': con_orth["orth"][i],
                'morphemes': con_phon["phon"][i],
                'gloss': con_gloss["gloss"][i],
                'translation': con_orth["trans"][i],
            },
        },
    }


rows = [_sentence_row(i) for i in range(num)]
sentences = {'rows': rows}

# Top-level document: row count, a zero offset, and the rows themselves
dictionary = {
    'total_rows': num,
    'offset': 0,
    'rows': rows,
}

# Written with json.dump defaults, so non-ASCII characters are escaped
json_file_path = storyname + '.json'
with open(json_file_path, 'w') as json_file:
    json.dump(dictionary, json_file)