In [1]:
%%bash

. ./cmd.sh
. ./path.sh
set -e # exit on error

FSCDATATOP=data/raw/FSC

#File check
tree -L 1 $FSCDATATOP

#Total Duration of 363 files: 141:16:24.01
find $FSCDATATOP/* -iname "*.wav" | sed "s/ /\\\\ /g" | xargs soxi | tail -n1

data/raw/FSC
|-- FSC_Readme.txt
|-- Volume\ 1
|-- Volume\ 2
|-- Volume\ 3
|-- Volume\ 4
|-- Volume\ 5\ (Spontaneous\ Speech)
`-- Volume\ 6\ (Transcriptions)

6 directories, 1 file
Total Duration of 363 files: 141:16:24.01


In [2]:
%%bash

# found duplicated transcription files (use only the machine pre-segmented ones)

. ./cmd.sh
. ./path.sh
set -e # exit on error

FSCDATATOP=data/raw/FSC

tot_count=363
echo "Total count from previous cell:" $tot_count
echo
echo "Duplicate transcripts:"
for file in $(find $FSCDATATOP/* -iname "[^cw]*.trs" | sed 's/ /\\\\ /g' | xargs -n1 basename |\
              sort --parallel=8 | uniq -d);
do
    find $FSCDATATOP/* -iname "$file";
done | sed 's/ /\\ /g'

echo
echo "=== MORE STATS ==="
trs_count=$(find $FSCDATATOP/* -iname "[^cw]*.trs" | sed 's/ /\\ /g' | xargs -n1 basename |\
    sort --parallel=8 | uniq | wc -l )
echo "Total transcripts available:" $trs_count

match_count=$(cat <(find data/raw/FSC/* -iname "[^cw]*.trs" | sed 's/ /\\ /g' | xargs -n1 basename |\
                    sort --parallel=8 | uniq | sed 's/\.trs//g') \
                  <(find data/raw/FSC/* -iname "*.wav" | sed 's/ /\\ /g' | xargs -n1 basename | sed 's/\.wav//g') \
                  | sort --parallel=8 | uniq -d | wc -l )
echo "Missing recordings with transcripts:" $((trs_count - match_count))

cat <(find data/raw/FSC/* -iname "*.wav" | sed 's/ /\\ /g' | xargs -n1 basename | sed 's/\.wav//g') \
    <(find data/raw/FSC/* -iname "*.wav" | sed 's/ /\\ /g' | xargs -n1 basename | sed 's/\.wav//g') \
    <(find data/raw/FSC/* -iname "[^cw]*.trs" | sed 's/ /\\ /g' | xargs -n1 basename | sed 's/\.trs//g') |\
    sort --parallel=8 | uniq -u

echo "Recordings without transcripts:" $((tot_count - match_count))

cat <(find data/raw/FSC/* -iname "*.wav" | sed 's/ /\\ /g' | xargs -n1 basename | sed 's/\.wav//g') \
    <(find data/raw/FSC/* -iname "[^cw]*.trs" | sed 's/ /\\ /g' | xargs -n1 basename | sed 's/\.trs//g') \
    <(find data/raw/FSC/* -iname "[^cw]*.trs" | sed 's/ /\\ /g' | xargs -n1 basename | sed 's/\.trs//g') |\
    sort --parallel=8 | uniq -u > $FSCDATATOP/missing_transcripts.txt
echo $FSCDATATOP"/missing_transcripts.txt"
echo

Total count from previous cell: 363

Duplicate transcripts:
data/raw/FSC/Volume\ 1/35_xx00xxxx_15x/00_xx10xxxx_15A.trs
data/raw/FSC/Volume\ 6\ (Transcriptions)/hand\ transcribed\ read\ speech/00_xx10xxxx_15A.trs
data/raw/FSC/Volume\ 6\ (Transcriptions)/hand\ transcribed\ read\ speech/07_xx00xxxx_13B.trs
data/raw/FSC/Volume\ 6\ (Transcriptions)/machine\ pre-segmented\ transcribed\ read\ speech/07_xx00xxxx_13B.trs
data/raw/FSC/Volume\ 1/35_xx00xxxx_15x/09_xx00xxxx_15A.trs
data/raw/FSC/Volume\ 6\ (Transcriptions)/hand\ transcribed\ read\ speech/09_xx00xxxx_15A.trs
data/raw/FSC/Volume\ 6\ (Transcriptions)/hand\ transcribed\ read\ speech/10_xx00xxxx_12B.trs
data/raw/FSC/Volume\ 6\ (Transcriptions)/machine\ pre-segmented\ transcribed\ read\ speech/10_xx00xxxx_12B.trs
data/raw/FSC/Volume\ 1/35_xx00xxxx_15x/13_xx01xxxx_15A.trs
data/raw/FSC/Volume\ 6\ (Transcriptions)/hand\ transcribed\ read\ speech/13_xx01xxxx_15A.trs
data/raw/FSC/Volume\ 1/35_xx00xxxx_15x/15_xx10xxxx_15A.trs
data/raw/FSC/Volu

In [None]:
%%time
%%bash

# found duplicated transcription files

. ./cmd.sh
. ./path.sh
set -e # exit on error

FSCDATATOP=data/raw/FSC

# 68_xx00xxxx_14PB and 69_xx00xxxx_11PB have corresponding recordings: replace PB with PBAS
# Total number of trainable recordings: 295 - 7 + 2 = 290
# Check: 363 total files - 75 + 2 = 290 (yay!)
# Total Duration of 290 files: 120:28:47.60

# WARNING!!! 04_xx10xxxx_22B transcription is misaligned

In [12]:
%%bash

head -n15 "data/raw/FSC/Volume 6 (Transcriptions)/hand transcribed spontaneous speech/90_xx00xxxx_11speech.trs" |\
awk '{if (($0 ~ /Sync/) || ($0 ~ /^[^<]/)) print $0;}' | sed 's/\r\?$/\t/g' | tr -d '\n' |\
sed 's?\(<Sync time="[0-9.]*"/>\)?\1\n\1?g' | awk -F'\t' '{if (NF == 3) printf "%s\t%s\t%s\n",$1,$3,tolower($2)}'

<Sync time="0"/>	<Sync time="21.317"/>	ako po si jm pero ang tunay kong pangalan ay jeremiah magoncia ako ay tubong taga [tak] tubong tacloban sa leyte pero bago {bago} kami lumipat sa tacloban nakatira muna kami sa southern leyte
<Sync time="21.317"/>	<Sync time="21.69"/>	..
<Sync time="21.69"/>	<Sync time="34.883"/>	nasa san francisco hab barangay mariag kami nakatira don ang lugar na iyan ay kung inyong maaalala ay kung saan nangyari ang landslide 
<Sync time="34.883"/>	<Sync time="35.144"/>	..


In [3]:
%%time
%%bash

# example mapping of utterance list to corresponding transcription from trs files
# the variable duration needs to be set (default below: 400 seconds)
#sample="data/raw/FSC/Volume 6 (Transcriptions)/hand transcribed read speech/00_xx10xxxx_15A.trs"
sample="data/raw/FSC/Volume 6 (Transcriptions)/hand transcribed spontaneous speech/90_xx00xxxx_11speech.trs"
cat "$sample" | awk '{if (($0 ~ /Sync/) || ($0 ~ /^[^<]/)) print $0;}' | sed 's/\r\?$/\t/g' | tr -d '\n' |\
sed 's?\(<Sync time="[0-9.]*"/>\)?\1\n\1?g' | awk -F'\t' '{if (NF == 3) printf "%s\t%s\t%s\n",$1,$3,tolower($2)}' |\
    sed 's?<Sync time="\([^"]*\)"/>?\1?g;' | awk -F'\t' -vlabel=test -vdura='400' \
    '{if ($2 ~ /^$/) {$2=dura}; if ($1 < dura && $2 <= dura) {a+=1; printf "%s_%04d\t%s\t%s\t%s\n",label,a,$1,$2,$3} }'
echo

test_0001	0	21.317	ako po si jm pero ang tunay kong pangalan ay jeremiah magoncia ako ay tubong taga [tak] tubong tacloban sa leyte pero bago {bago} kami lumipat sa tacloban nakatira muna kami sa southern leyte
test_0002	21.317	21.69	..
test_0003	21.69	34.883	nasa san francisco hab barangay mariag kami nakatira don ang lugar na iyan ay kung inyong maaalala ay kung saan nangyari ang landslide 
test_0004	34.883	35.144	..
test_0005	35.144	45.132	nung high school ako ay nagaral sa isang paaralang napakaraming estudyante
test_0006	45.132	45.43	..
test_0007	45.43	57.318	lahat na atang uri ng estudyante ay nandon kaya ng nakapasok ako sa unibersidad ng pilipinas diliman parang walang pagbabagong nangyari ganun pa rin ang sitwasyon
test_0008	57.318	57.579	..
test_0009	57.579	61.306	anu pa bang sasabihin ko []
test_0010	61.306	61.716	..
test_0011	61.716	72.673	andito ako ngayon sa isang room {sa isang gusali} ng sa isang gusali dito sa up sa triple e
test_0012	72.673	72.822	..
test_0013	72.822	

In [4]:
%%time
%%bash

. ./cmd.sh
. ./path.sh
set -e # exit on error

FSCDATATOP=data/raw/FSC
rm -fr data/FSC

counter=0
while IFS= read -r -d '' n; do
  sample="$n"
  dbname=$(basename $FSCDATATOP)
  filname=$(basename "$sample" .trs)
  settype=${dbname}_$(echo $filname | grep -o '[^0-9]\+$')
  #echo $settype
  targetdir=data/FSC/$settype/$filname

  # make folder
  rm -fr 
  mkdir -p $targetdir
  wavfile=$(find $FSCDATATOP/* -iname "$filname.wav")
  if [ -z "$wavfile" ]; then
    # work around for files 68 and 69
    wavfile=$(find $FSCDATATOP/* -iname "${filname}AS.wav")
  fi
  if [ -z "$wavfile" ]; then
    echo "Missing:" $sample
    continue
  fi
  echo "$wavfile" > $targetdir/${filname}-wav.list
  
  # get time limit of file
  #echo "$wavfile"
  duration=`soxi "$wavfile" | grep -o '[0-9][0-9]:[0-9][0-9]:[0-9][0-9]\.[0-9][0-9]' |\
      awk -F':' '{s=$3+60*$2+3600*$1; print s}'`

  cat "$sample" | awk '{if (($0 ~ /Sync/) || ($0 ~ /^[^<]/)) print $0;}' | sed 's/\r\?$/\t/g' | tr -d '\n' |\
    sed 's?\(<Sync time="[0-9.]*"/>\)?\1\n\1?g' |\
    awk -F'\t' '{if (NF == 3) printf "%s\t%s\t%s\n",$1,$3,tolower($2)}' |\
    sed 's?<Sync time="\([^"]*\)"/>?\1?g;' | awk -F'\t' -vlabel=$filname -vdura=$duration \
    '{if ($2 ~ /^$/) {$2=dura}; \
      if ($1 < dura && $2 <= dura) {a+=1; printf "%s_%04d\t%s\t%s\t%s\n",label,a,$1,$2,$3} }' |\
    # substitutions here
    sed 's/{voc noise]\?}/+garbage+/g' |\
    sed 's/{nvc\(\| noise\)}/+garbage+/g' |\
    sed 's/[{([]\?\(background \|\)noise[]})]\?/ +garbage+ /g' |\
    sed 's/\[laugh\]/ +laugh+ /g' |\
    sed 's/\[breath\]/ +breath+ /g' |\
    # if you put this before breath, you will lose the transcribed breaths
    sed 's/\[[^][]*\]/ +garbage+ /g' |\
    # the order of the following 2 is important!
    sed 's/(non-fsc:\(.*\))/\1/g' |\
    sed 's/(\?n.*fsc.*/ +garbage+ /g' |\
    # handling parentheses
    awk -F'\t' '{if ($1 ~ /speech/) gsub(/\([^()]*\)/," +garbage+ ",$4); \
        else gsub(/\([^()]*\)/," ",$4); print $0;}' OFS='\t' |\
    # handling braces
    awk -F'\t' '{if ($1 !~ /speech/) gsub(/{[^{}]*}/," ",$4); print $0;}' OFS='\t' |\
    sed 's/{\([^{}]*\)}/ \1 /g' |\
    # CORNER CASES!!!!
    #awk -F'\t' '{if ($1 ~ /10_xx00xxxx_12A_1010/) $4=; print $0;}\
    # ensure nice spacing
    awk -F'\t' '{gsub(/^[ ]*/,"",$4); gsub(/[ ]+/," ",$4); print $0;}' OFS='\t' | sed 's/[ ]*$//g' \
    > $targetdir/${filname}-trn.txt

  counter=$((counter + 1))
#done < <(find $FSCDATATOP/Volume\ 6* -iname "01_xx01xxxx_14A.trs" -print0)
done < <(find $FSCDATATOP/Volume\ 6* -iname "[^cw]*.trs" -print0)
echo $counter

Missing: data/raw/FSC/Volume 6 (Transcriptions)/hand transcribed read speech/11_xx10xxxx_11A.trs
Missing: data/raw/FSC/Volume 6 (Transcriptions)/hand transcribed read speech/11_xx10xxxx_11B.trs
Missing: data/raw/FSC/Volume 6 (Transcriptions)/hand transcribed read speech/49_xx10xxxx_14A.trs
Missing: data/raw/FSC/Volume 6 (Transcriptions)/hand transcribed read speech/49_xx10xxxx_14B.trs
Missing: data/raw/FSC/Volume 6 (Transcriptions)/machine pre-segmented transcribed read speech/66_xx10xxxx_13PB.trs
293
CPU times: user 5.05 ms, sys: 180 µs, total: 5.23 ms
Wall time: 15.1 s


In [179]:
%%time
%%bash

####################
### FSC Analysis ###
####################

# Different approach for filtering

cat data/FSC/*/*/*-trn.txt |\
    # filter out simple entries: 2559 results
    #awk -F'\t' '{if ($4 !~ /^[ a-zA-Z0-9\.\-,?!]*$/) print $0;}' OFS='\t' |\
    # normalize instances of {voc noise} with +GARBAGE+ (there's one imbalanced ])
    #sed 's/{voc noise]\?}/+GARBAGE+/g' |\
    # normalize instances of {nvc noise} with +GARBAGE+
    #sed 's/{nvc\(\| noise\)}/+GARBAGE+/g' | grep -v '+GARBAGE+' |\
    # salvage non-fsc with correct suggested transcription
    #sed 's/(non-fsc:\(.*\))/\1/g' |\
    grep 'noise'

01_xx01xxxx_14A_0080	117.720	117.750	{noise}
02_xx00xxxx_11A_0070	118.58	119.165	(noise)
02_xx00xxxx_11A_0746	643.033	643.340	(noise)
02_xx00xxxx_11A_0748	643.720	644.386	(noise)
02_xx00xxxx_11A_1804	1300.125	1300.590	(noise)
03_xx10xxxx_11A_1462	1756.950	1757.510	{noise}
03_xx10xxxx_11A_1762	2004.660	2005.090	{noise}
04_xx10xxxx_22A_0048	72.450	72.990	{noise}
04_xx10xxxx_22A_0050	73.060	74.130	{noise}
04_xx10xxxx_22A_0913	587.930	588.420	{noise}
05_xx10xxxx_13A_0102	163.625	169.471	(noise)
06_xx00xxxx_12A_0064	96.180	98.220	bigyan mo ako niyang kinakain mo.(noise)
06_xx00xxxx_12A_0353	255.678	257.023	noise
06_xx00xxxx_12A_0636	395.345	396.048	malinaw(noise)
06_xx00xxxx_12A_0637	396.048	397.78	noise
06_xx00xxxx_12A_0923	536.244	538.512	noise
06_xx00xxxx_12A_1480	806.948	807.066	noise
06_xx00xxxx_12A_1486	809.368	813.59	noise
06_xx00xxxx_12A_1809	938.851	938.923	noise
06_xx00xxxx_12A_1942	992.232	992.431	noise
06_xx00xxxx_12A_2068	1037.665	1040.68	noise
06_xx00xxxx_12A_2092	1052.603	105

In [70]:
%%time
%%bash

####################
### FSC Analysis ###
####################

# The only salvageable non-fsc transcripts contain non-fsc: and can be recovered using the pattern below:
# (non-fsc:\([^(]*\))

cat data/FSC/*/*/*-trn.txt | awk -F'\t' '{if ($4 ~ /[^ A-Za-z0-9.,-?!"'\'']/) print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($4 !~ /fsc/) print $0}' OFS='\t' |\
    sed 's|\([^_]*_[^_]*_[^_]*\)_[^\t]*|playtrim ./ \1.wav|g'
#tr ' ' '\012' | sort --parallel=8 | sed '/^[ ]*$/d' | uniq -c

playtrim ./ 00_xx10xxxx_15A.wav	830.435	 830.592	 {voc noise}
playtrim ./ 00_xx10xxxx_15A.wav	979.260	 979.874	 basa (read)
playtrim ./ 00_xx10xxxx_15A.wav	987.380	 987.856	 basa (wet)
playtrim ./ 00_xx10xxxx_15A.wav	1452.810	 1453.072	 {voc noise}
playtrim ./ 00_xx10xxxx_15A.wav	1855.680	 1855.910	 {voc noise]}
playtrim ./ 01_xx01xxxx_14A.wav	0.870	 0.910	 {nvc noise}
playtrim ./ 01_xx01xxxx_14A.wav	117.720	 117.750	 {noise}
playtrim ./ 01_xx01xxxx_14A.wav	693.040	 693.591	 basa (read)
playtrim ./ 01_xx01xxxx_14A.wav	699.378	 699.820	 basa (wet)
playtrim ./ 01_xx01xxxx_14A.wav	825.620	 825.680	 {nvc noise}
playtrim ./ 01_xx01xxxx_14A.wav	1653.181	 1653.731	 {voc noise}
playtrim ./ 01_xx01xxxx_14A.wav	1658.465	 1658.554	 {voc noise}
playtrim ./ 01_xx01xxxx_14A.wav	1660.132	 1661.373	 {voc noise}
playtrim ./ 01_xx01xxxx_14A.wav	1666.941	 1667.384	 {voc noise}
playtrim ./ 02_xx00xxxx_11A.wav	118.58	 119.165	 (noise)
playtrim ./ 02_xx00xxxx_11A.wav	643.033	 643.340	 (noise)
playtrim ./ 02

CPU times: user 16 ms, sys: 4 ms, total: 20 ms
Wall time: 2.35 s


In [231]:
%%time
%%bash

#######################
### FSC Corrections ###
#######################

# This is a way to check all expected non-alphanumeric symbols to handle

cat data/FSC/*/*/*-trn.txt |\
    # Step 1: removed empty/corrupted segments, pure silence, non-fscs
    awk '{if (($4 !~ /^[ ]*$/) && ($4 !~ /^[ ]*\.+[ ]*$/) && ($4 !~ /fsc/)) print $0}' OFS='\t' |\
    # remove alphanumerics
    awk -F'\t' '{gsub(/[a-z0-9]/," ",$4); print $0;}' OFS='\t' |\
    # eliminate lines that doesn't have symbols
    awk -F'\t' '{if ($4 !~ /^[ ]*$/) print $0;}' OFS='\t' |\
    # remove safe pairs of brackets
    awk -F'\t' '{gsub(/\([ ]*\)/,"",$4); gsub(/{[ ]*}/,"",$4); gsub(/\[[ ]*\]/,"",$4); print $0}' OFS='\t' |\
    # round 2 of blank eliminations
    awk -F'\t' '{if ($4 !~ /^[ ]*$/) print $0;}' OFS='\t' |\
    # isolating transcriptions and finding out the patterns
    cut -f4- | tr ' ' '\012' | sort --parallel=8 | uniq | sed '/^[ ]*$/d'
    
#cut -f4- | sed 's/^ //g' | sed '/^\.\.$/d' | sed 's/[A-Za-z0-9]/ /g;' |\
#tr ' ' '\012' | sort --parallel=8 | sed '/^[ ]*$/d' | uniq -c

#cat data/FSC/*/*/*-trn.txt | cut -f4- | sed 's/^ //g' | sed '/^\.\.$/d' | sed 's/\([^A-Za-z0-9.]\)/ \1 /g' |\
#    sed 's/\([a-z][a-z]\)[.]\+/\1/g; s/\.\.\+//g' | tr ' ' '\012' | sort --parallel=8 | uniq \
#    > /storage07/user_data/angfederico01/local/g2p-seq2seq_OLD/FSC_vocab

!
!"
!?
"
'
+
,
,,
-
.
."
."..
..
...
....
?
?"
?.
?..
CPU times: user 4 ms, sys: 8 ms, total: 12 ms
Wall time: 2.09 s


In [147]:
%%time
%%bash

#######################
### FSC Corrections ###
#######################

# This is a way to check all expected non-alphanumeric symbols to handle

cat data/FSC/*/*/*-trn.txt |\
    # Step 1: removed empty/corrupted segments, pure silence, non-fscs
    awk '{if (($4 !~ /^[ ]*$/) && ($4 !~ /^[ ]*\.+[ ]*$/) && ($4 !~ /fsc/)) print $0}' OFS='\t' |\
    # remove alphanumerics
    awk -F'\t' '{gsub(/[a-z0-9]/," ",$4); print $0;}' OFS='\t' |\
    # eliminate lines that doesn't have symbols
    awk -F'\t' '{if ($4 !~ /^[ ]*$/) print $0;}' OFS='\t' |\
    # remove safe pairs of brackets
    awk -F'\t' '{gsub(/\([ ]*\)/,"",$4); gsub(/{[ ]*}/,"",$4); gsub(/\[[ ]*\]/,"",$4); print $0}' OFS='\t' |\
    # round 2 of blank eliminations
    awk -F'\t' '{if ($4 !~ /^[ ]*$/) print $0;}' OFS='\t' |\
    # look at leftover bracketings
    awk -F'\t' '{if ($4 ~ /[][{}()]/) print $0;}' OFS='\t'
#cut -f4- | sed 's/^ //g' | sed '/^\.\.$/d' | sed 's/[A-Za-z0-9]/ /g;' |\
#tr ' ' '\012' | sort --parallel=8 | sed '/^[ ]*$/d' | uniq -c

#cat data/FSC/*/*/*-trn.txt | cut -f4- | sed 's/^ //g' | sed '/^\.\.$/d' | sed 's/\([^A-Za-z0-9.]\)/ \1 /g' |\
#    sed 's/\([a-z][a-z]\)[.]\+/\1/g; s/\.\.\+//g' | tr ' ' '\012' | sort --parallel=8 | uniq \
#    > /storage07/user_data/angfederico01/local/g2p-seq2seq_OLD/FSC_vocab

00_xx10xxxx_15A_1568	1855.680	 1855.910	 {         ]}
25_xx10xxxx_14A_0003	16.934	 20.930	                      (               '      )                                             
73_xx10xxxx_14A_0110	157	 158.650	                (      :     ,      )
02_xx00xxxx_11B_1330	1099.628	 1100.888	 (  ...           )
09_xx00xxxx_15B_1486	1348.174	 1348.860	         (            }
87_xx10xxxx_13B_0088	106.750	 108.175	  )                 ?
87_xx10xxxx_13B_0090	110.125	 112	  )                  ,      ?
87_xx10xxxx_13B_0094	114.325	 116.275	  )                       ?
87_xx10xxxx_13B_0096	118	 119.5750	  )                      ?
87_xx10xxxx_13B_0098	121.625	 123.425	  )                      ?
87_xx10xxxx_13B_0100	124.825	 126.075	  )               .
87_xx10xxxx_13B_0102	127.725	 129.075	   )                  .
87_xx10xxxx_13B_0104	130.850	 132.45	   )                    
87_xx10xxxx_13B_0110	135.475	 136.825	   )                  .
87_xx10xxxx_13B_0112	138.575	 140.075	   )                   

In [None]:
%%time
%%bash

# DO NOT MIND THIS: FOR HISTORY PURPOSES ONLY
# found duplicated transcription files

. ./cmd.sh
. ./path.sh
set -e # exit on error

FSCDATATOP=data/raw/FSC
#Total Duration of 363 files: 141:16:24.01
#ls "$FSCDATATOP/Volume 6 (Transcriptions)/hand transcribed read speech/00_xx10xxxx_15A.trs"

#find $FSCDATATOP/* -iname "*.wav" | sed "s/^/'\''/g; s/$/'\''/g" | xargs -0 soxi
while IFS= read -r -d '' n; do
  printf '%q\n' "$n"
done < <(find $FSCDATATOP/Volume\ 6* -iname "[^cw]*.trs" -print0) | xargs -n1 basename | sort --parallel=8 | uniq | wc -l
#done < <(find $FSCDATATOP/Volume\ 6* -iname "*.trs" -print0) | xargs -n1 basename | sort --parallel=8 | uniq -d -c
#done < <(find $FSCDATATOP/Volume\ 6* -iname "[^cw]*.trs" -print0)

In [228]:
# JSON stuff from annotation
import json

def loadCorrections(corrections, jsonfile):
    with open(jsonfile) as json_file:
        data = json.load(json_file)

    for entries in data['audios']:
        corrections[entries['rel_path'].split('/')[-1]] = entries['norm_label']

    return corrections

corrections = {}
jsonfile1 = 'FSC_annotations.json'
corrections = loadCorrections(corrections, jsonfile1)

for entry in corrections:
    utt_id = '_'.join(entry.split('_')[:4])
    transc = corrections[entry].replace("'","'\\''")
    print("    awk -F'\\t' '{if ($1 ~ /"+utt_id+"/) $4=\""+transc+"\"; print $0;}\' OFS='\\t' |\\")

    awk -F'\t' '{if ($1 ~ /00_xx10xxxx_15B_0026/) $4="may luma ngang kasabihan"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /01_xx01xxxx_14A_0006/) $4="+misaligned+ ako po ay +misaligned+"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /01_xx01xxxx_14A_0010/) $4="+misaligned+ tatrabaho dito. dahil sa hirap ng +misaligned+"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /01_xx01xxxx_14A_0500/) $4=".."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /02_xx00xxxx_11B_0080/) $4="dalawampu'\''t tatlong taon."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /04_xx10xxxx_22A_0004/) $4="ako si lara. ako ay bente syete anyos."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /04_xx10xxxx_22A_0060/) $4="sigurado ka ba na petsa ika-labing tatlo ngayon?"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /04_xx10xxxx_22B_0032/) $4="+misaligned+ at sarili namin ang lahat ng sandali, ang lahat ng panahon +misaligned+"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /07_xx00

In [5]:
%%time
%%bash

. ./cmd.sh
. ./path.sh
set -e # exit on error

FSCDATATOP=data/raw/FSC
rm -fr data/FSC

counter=0
while IFS= read -r -d '' n; do
  sample="$n"
  dbname=$(basename $FSCDATATOP)
  filname=$(basename "$sample" .trs)
  settype=${dbname}_$(echo $filname | grep -o '[^0-9]\+$')
  #echo $settype
  targetdir=data/FSC/$settype/$filname

  # make folder
  rm -fr 
  mkdir -p $targetdir
  wavfile=$(find $FSCDATATOP/* -iname "$filname.wav")
  if [ -z "$wavfile" ]; then
    # work around for files 68 and 69
    wavfile=$(find $FSCDATATOP/* -iname "${filname}AS.wav")
  fi
  if [ -z "$wavfile" ]; then
    echo "Missing:" $sample
    continue
  fi
  echo "$wavfile" > $targetdir/${filname}-wav.list
  
  # get time limit of file
  #echo "$wavfile"
  duration=`soxi "$wavfile" | grep -o '[0-9][0-9]:[0-9][0-9]:[0-9][0-9]\.[0-9][0-9]' |\
      awk -F':' '{s=$3+60*$2+3600*$1; print s}'`

  cat "$sample" | awk '{if (($0 ~ /Sync/) || ($0 ~ /^[^<]/)) print $0;}' | sed 's/\r\?$/\t/g' | tr -d '\n' |\
    sed 's?\(<Sync time="[0-9.]*"/>\)?\1\n\1?g' |\
    awk -F'\t' '{if (NF == 3) printf "%s\t%s\t%s\n",$1,$3,tolower($2)}' |\
    sed 's?<Sync time="\([^"]*\)"/>?\1?g;' | awk -F'\t' -vlabel=$filname -vdura=$duration \
    '{if ($2 ~ /^$/) {$2=dura}; \
      if ($1 < dura && $2 <= dura) {a+=1; printf "%s_%04d\t%s\t%s\t%s\n",label,a,$1,$2,$3} }' |\
    # substitutions here
    sed 's/{voc noise]\?}/+garbage+/g' |\
    sed 's/{nvc\(\| noise\)}/+garbage+/g' |\
    sed 's/[{([]\?\(background \|\)noise[]})]\?/ +garbage+ /g' |\
    sed 's/\[laugh\]/ +laugh+ /g' |\
    sed 's/\[breath\]/ +breath+ /g' |\
    # if you put this before breath, you will lose the transcribed breaths
    sed 's/\[[^][]*\]/ +garbage+ /g' |\
    # the order of the following 2 is important!
    sed 's/(non-fsc:\(.*\))/\1/g' |\
    sed 's/(\?n.*fsc.*/ +garbage+ /g' |\
    sed 's/nonsfc/ +garbage+ /g' | sed 's/non-speech/ +garbage+ /g' |\
    sed 's/nonsc/ +garbage+ /g' | sed 's/nonfs/ +garbage+ /g' |\
    sed 's/nsfc/ +garbage+ /g' |\
    # handling parentheses
    awk -F'\t' '{if ($1 ~ /speech/) gsub(/\([^()]*\)/," +garbage+ ",$4); \
        else gsub(/\([^()]*\)/," ",$4); print $0;}' OFS='\t' |\
    # handling braces
    awk -F'\t' '{if ($1 !~ /speech/) gsub(/{[^{}]*}/," ",$4); print $0;}' OFS='\t' |\
    sed 's/{\([^{}]*\)}/ \1 /g' |\
    # CORNER CASES!!!!
    awk -F'\t' '{if ($1 ~ /00_xx10xxxx_15B_0026/) $4="may luma ngang kasabihan"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /01_xx01xxxx_14A_0006/) $4="+misaligned+ ako po ay +misaligned+"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /01_xx01xxxx_14A_0010/) $4="+misaligned+ tatrabaho dito. dahil sa hirap ng +misaligned+"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /01_xx01xxxx_14A_0500/) $4=".."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /02_xx00xxxx_11B_0080/) $4="dalawampu'\''t tatlong taon."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /04_xx10xxxx_22A_0004/) $4="ako si lara. ako ay bente syete anyos."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /04_xx10xxxx_22A_0060/) $4="sigurado ka ba na petsa ika-labing tatlo ngayon?"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /04_xx10xxxx_22B_0032/) $4="+misaligned+ at sarili namin ang lahat ng sandali, ang lahat ng panahon +misaligned+"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /07_xx00xxxx_13A_0004/) $4="bente at dalaga."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /07_xx00xxxx_13A_0010/) $4="matapos ang tatlong buwan"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /07_xx00xxxx_13A_0082/) $4="trese"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /08_xx00xxxx_14A_0002/) $4="ako po ay bente sais anyos at apat na taon nang nagtatrabaho dito."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /09_xx00xxxx_15A_0725/) $4=".."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /09_xx00xxxx_15B_1486/) $4="unahan"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /100_xx00xxxx_11B_0188/) $4="sisimulan sa g.m.a. seven"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /101_xx10xxxx_12A_0124/) $4="sixty seven kilo ng shabu"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /102_xx00xxxx_13speech_0132/) $4=".."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /105_xx00xxxx_11B_0114/) $4="simulan sa g.m.a. seven ang t.v. show na daboy and da girl"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /105_xx00xxxx_11B_2449/) $4="g.m.a. seven"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /105_xx00xxxx_11speech_0071/) $4="+garbage+"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /10_xx00xxxx_12A_0006/) $4="ako ay beinte syente anyos."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /10_xx00xxxx_12A_1010/) $4="sawa"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /111_xx00xxxx_12speech_0004/) $4="mas ok ngayon yung pinatupad na c.w.t.s. one at c.w.t.s. two kapalit ng r.o.t.c. na apat na sems"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /111_xx00xxxx_12speech_0011/) $4="yung isang sem na sa c.w.t.s. one na ino-offer ng eng'\''g sa tingin ko mas marami kang matututunan don kasi kami"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /112_xx00xxxx_13speech_0058/) $4="sa western visayas sa region six"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /113_xx10xxxx_14speech_0018/) $4="ayan. tri. ganyan yung"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /115_xx10xxxx_11speech_0068/) $4="yung dalawang series na napapalabas sa g.m.a. seven"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /129_xx00xxxx_15speech_0187/) $4="l.c. one"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /129_xx00xxxx_15speech_0249/) $4="co e. one an one"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /129_xx00xxxx_15speech_0287/) $4="co e. one two five"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /129_xx00xxxx_15speech_0436/) $4="o pagkatapos mong magreport sa one ninety kasi parang"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /12_xx10xxxx_14A_0002/) $4="ako po ay dalwampu'\''t anim anyos"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /131_xx10xxxx_12speech_0149/) $4="co e. one two one yon"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /140_xx10xxxx_11speech_0028/) $4="sa bahay ng ate ko gumawa ako ng m.p. ko para sa c.s. twelve"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /16_xx10xxxx_12A_1534/) $4=".."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /19_xx00xxxx_14B_1192/) $4=".."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /30_xx00xxxx_11B_0123/) $4="sisimulan sa g.m.a. seven ang t.v. show na daboy en da girl"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /30_xx00xxxx_11B_2926/) $4="g.m.a. seven"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /34_xx10xxxx_12A_3351/) $4="animnapu at pito"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /36_xx00xxxx_15B_0029/) $4="may luma ngang kasabihan"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /37_xx00xxxx_11B_0145/) $4="sisimulan sa g.m.a. seven ang t.v. show na daboy en da girl"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /37_xx00xxxx_11B_2954/) $4="g.m.a. seven"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /39_xx10xxxx_12A_0096/) $4="animnapu'\''t pitong kilo ng shabu"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /39_xx10xxxx_12B_0040/) $4="ang kalawakan sarili namin ang nalalanghap na hangin"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /40_xx10xxxx_15A_0043/) $4="siya ay mag-"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /40_xx10xxxx_15A_1539/) $4="kanila"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /46_xx00xxxx_12A_0010/) $4="ako ay dalawampu'\''t pitong anyos."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /46_xx00xxxx_12A_0079/) $4="animnapu'\''t pitong kilo ng shabu ang gagamiting ebidensiya ng p.n.p. laban sa naarestong pusher sa kolehiyo sa san jose del monte."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /46_xx00xxxx_12A_3019/) $4="animnapu'\''t pito"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /47_xx10xxxx_14A_0619/) $4="raho"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /47_xx10xxxx_24speech_0009/) $4="pagkatapos ng e.s. twelve class ko"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /47_xx10xxxx_24speech_0011/) $4="habang nagpapa-xerox ako ng sample exams para sa e.s. twelve fourth exam"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /47_xx10xxxx_24speech_0082/) $4="malapit na ang exam ko sa thirty five"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /47_xx10xxxx_24speech_0093/) $4="sa e.s. twelve naman"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /47_xx10xxxx_24speech_0095/) $4="sana pumasa. dahil ayoko naman siyang ulitin gayun din ang triple e. forty three"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /48_xx10xxxx_13A_0060/) $4="eh, bakit kailangan sa ika-labing tatlo ng pebrero?"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /51_xx00xxxx_11B_0134/) $4="sisimulan sag g.m.a. seven ang t.v. show na da boy en da girl"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /51_xx00xxxx_11B_2906/) $4="g.m.a. seven"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /52_xx10xxxx_14B_1878/) $4="lang"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /53_xx10xxxx_11B_0135/) $4="seven ang t.v. show ni daboy en da girl"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /53_xx10xxxx_11B_2891/) $4="g.m.a. seven"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /53_xx10xxxx_11speech_0088/) $4="c.s. eleven"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /54_xx10xxxx_12A_0093/) $4="animnapu'\''t pito kilo ng shabu ang gagamiting ebidensiya ng"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /55_xx10xxxx_14A_0024/) $4="paano ninyo masisigurado na makikita ninyo siya rito?"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /60_xx00xxxx_11B_0241/) $4="sisimulan sa g.m.a. seven ang t.v. show"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /60_xx00xxxx_11B_3077/) $4="g.m.a. seven"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /66_xx10xxxx_13B_0459/) $4=".."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /69_xx00xxxx_11PB_0062/) $4="sinimulan sa g.m.a. seven ang t.v. show na"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /70_xx00xxxx_11B_0142/) $4="sisimulan sa g.m.a. seven ang t.v. show na daboy en da girl"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /70_xx00xxxx_11B_2881/) $4="g.m.a. seven"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /70_xx00xxxx_11speech_0009/) $4="twenty years old ako ngayon"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /71_xx00xxxx_12A_0084/) $4="sixty seven kilo ng shabu ang gagamiting ebidensiya ng p.n.p."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /71_xx00xxxx_12speech_0166/) $4=".."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /73_xx10xxxx_14speech_0008/) $4="galing ako sa cagayan de oro city ang address namin ay block twenty nine lot thirty six"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /73_xx10xxxx_14speech_0018/) $4="ah ipinanganak ako nuong november twenty one nineteen eighty one"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /73_xx10xxxx_14speech_0105/) $4="sa april twenty five"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /73_xx10xxxx_14speech_0158/) $4="tapos nagstey ako don ng mga two years"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /76_xx00xxxxl_12speech_0058/) $4="marami na ring ibang nakilala na malalayo din yung mga bahay kung minsan naka nakakabisita"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /76_xx00xxxxl_12speech_0210/) $4="sabay-sabay pang kumain sabay-sabay pumunta sa"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /77_xx00xxxx_13speech_0139/) $4="so medyo nagkaroon ako shak dito culture shak"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /77_xx00xxxx_13speech_0187/) $4="sa triple e. one ninety"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /77_xx00xxxx_13speech_0189/) $4="saka sa mga machine problems sa co e. one one five"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /77_xx00xxxx_13speech_0197/) $4="one ninety proposal ko"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /80_xx00xxxx_11B_0126/) $4="sisimulan sa g.m.a. seven ang t.v. show na daboy en da girl."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /80_xx00xxxx_11B_2834/) $4="g.m.a. seven"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /82_xx10xxxx_13speech_0133/) $4="one and a half year lang yata ako kasi sumablay na ako eh ang dami ko nang bagsak"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /85_xx00xxxx_11speech_0069/) $4="mga desensyo kaya yung sa amin nung time na kinuha ko yung co e. one four one"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /86_xx10xxxx_12speech_0017/) $4="tapos yung mga kasama ko talaga sa la sa buong araw na iyon ay si glaiza at si ate gigi yung mga ah c. fourteen"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13B_0088/) $4="saan ka nakatira?"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13B_0090/) $4="mali ang magnakaw di ba?"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13B_0094/) $4="sino kasama mo lumabas?"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13B_0096/) $4="kelan ang bertdey mo?"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13B_0098/) $4="kukuha ka ba ng eksam?"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13B_0100/) $4="bumaba ka rito."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13B_0102/) $4="palayasin mo siya."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13B_0104/) $4="pakiluto itong isda"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13B_0110/) $4="magandang araw po."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13B_0112/) $4="maligayang kaarawan!"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13B_0114/) $4="iwan mo na ang mga withdrawals."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13B_0116/) $4="ang mga zebra ay lumalakad sa apat na paa."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13B_0118/) $4="ang unggoy ay nakita noong friday."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13B_0120/) $4="siya ay isang imahe ng dalagang pilipina"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13B_0122/) $4="isuot mo ang tsinelas."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13B_0132/) $4="siya ay tumutugtog ng violin"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13B_0136/) $4="jogging ang mabisang exercise."; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /87_xx10xxxx_13speech_0027/) $4="para mag +garbage+ ah maghanda para sa klase niyo na naman sa hapon"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /90_xx00xxxx_11B_0164/) $4="sisimulan sa g.m.a. seven ang t.v. show na daboy en da girl"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /90_xx00xxxx_11B_2868/) $4="g.m.a. seven"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /90_xx00xxxx_11speech_0049/) $4="ang aking room ay room one two seven room mate ko sina jameson joy at benedict. dati rin akong taga kalay nung ako ay first year kung baga. last year yon room one one nine naman ako don at ang room one one nine sa narra ay room ni dif saka ni gift"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /95_xx00xxxx_11B_0164/) $4="sisimulan sa g.m.a. seven ang t.v. show ni"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /95_xx00xxxx_11B_2930/) $4="j.m.a. seven"; print $0;}' OFS='\t' |\
    awk -F'\t' '{if ($1 ~ /99_xx00xxxx_15B_1385/) $4="yan"; print $0;}' OFS='\t' |\
    # isolate symbols
    sed 's/[!",:?]/ & /g;' | sed 's/\([a-z][a-z]\)\./\1 ./g' |\
    # ensure nice spacing
    awk -F'\t' '{gsub(/^[ ]*/,"",$4); gsub(/[ ]+/," ",$4); print $0;}' OFS='\t' | sed 's/[ ]*$//g' \
    > $targetdir/${filname}-trn.txt

  counter=$((counter + 1))
#done < <(find $FSCDATATOP/Volume\ 6* -iname "01_xx01xxxx_14A.trs" -print0)
done < <(find $FSCDATATOP/Volume\ 6* -iname "[^cw]*.trs" -print0)
echo $counter

Missing: data/raw/FSC/Volume 6 (Transcriptions)/hand transcribed read speech/11_xx10xxxx_11A.trs
Missing: data/raw/FSC/Volume 6 (Transcriptions)/hand transcribed read speech/11_xx10xxxx_11B.trs
Missing: data/raw/FSC/Volume 6 (Transcriptions)/hand transcribed read speech/49_xx10xxxx_14A.trs
Missing: data/raw/FSC/Volume 6 (Transcriptions)/hand transcribed read speech/49_xx10xxxx_14B.trs
Missing: data/raw/FSC/Volume 6 (Transcriptions)/machine pre-segmented transcribed read speech/66_xx10xxxx_13PB.trs
293
CPU times: user 4 ms, sys: 849 µs, total: 4.85 ms
Wall time: 19.8 s


In [6]:
%%bash

. ./cmd.sh
. ./path.sh
set -e # exit on error

FSCEXTRACTED=data/FSC

#Total Duration of 290 files: 120:28:47.60
#ls "$FSCDATATOP/Volume 6 (Transcriptions)/hand transcribed read speech/00_xx10xxxx_15A.trs"

#find $FSCEXTRACTED/* -iname "*-wav.list"
while read line; do
  cat $line | sed 's/ /\\ /g'
done < <(find $FSCEXTRACTED/* -iname "*-wav.list") |\
    # need to remove misaligned transcriptions
    # Total Duration of 287 files: 119:18:32.73
    grep -v '01_xx01xxxx_14A' | grep -v '04_xx10xxxx_22B' | grep -v '12_xx10xxxx_14B' |\
    sort --parallel=8 | uniq | xargs soxi | tail -n1

Total Duration of 287 files: 119:18:32.73


### ISIP

In [237]:
%%time
%%bash

. ./cmd.sh
. ./path.sh
set -e # exit on error

ISIPTGLDATATOP=data/raw/ISIP_TGL

#Total duration of 43019 files: 129607.53 sec -- 36:0:7.53

#find $FSCDATATOP/* -iname "*.wav" | sed "s/^/'\''/g; s/$/'\''/g" | xargs -0 soxi
counter=$(find $ISIPTGLDATATOP/* -iname "*.wav" | wc -l)
while IFS= read -r -d '' n; do
  printf '%q\n' "$n" | sed 's/\\303\\261/ñ/g' | sed 's/\$//g'
done < <(find $ISIPTGLDATATOP/* -iname "*.wav" -print0) | xargs soxi -D |\
awk -vcounter=$counter '{s+=$0;} END {h=int(s/3600); r=(s-h*3600); m=int(r/60); r-=m*60; printf "Total duration of %d files: %.2f sec -- %d:%d:%.2f\n", counter, s, h, m, r;}'

Total duration of 43019 files: 129607.53 sec -- 36:0:7.53
CPU times: user 20 ms, sys: 12 ms, total: 32 ms
Wall time: 8min 42s


In [7]:
%%time
%%bash

. ./cmd.sh
. ./path.sh
set -e # exit on error


ISIPTGLDATATOP=data/raw/ISIP_TGL
rm -fr data/ISIP_TGL

while IFS= read -r -d '' n; do
  sample="$n"
  filname=$(basename "$sample" .log)
  
  while read -r line settype trans; do
    duration=`soxi "$(dirname "$sample")/$line" | grep -o '[0-9][0-9]:[0-9][0-9]:[0-9][0-9]\.[0-9][0-9]' |\
      awk -F':' '{s=$3+60*$2+3600*$1; print s}'`
    
    targetdir=data/ISIP_TGL/$settype/$filname
    
    # make folder
    mkdir -p $targetdir
    echo "$(dirname "$sample")/$line" >> $targetdir/${filname}-wav.list
    
    # remove invisible symbols
    trans=$(echo "$trans" | sed 's/\xc2\x96//g' | sed 's/\xc2\xa0//g' | sed 's/\xef\xbb\xbf//g' | sed 's/\x09/ /g')
    
    # normalize acronyms for better g2p
    trans=$(echo "$trans" |\
        sed "$(echo `cat ISIP_ACRONYMS_REPLACEMENTS | sed 's/\([^:]*\):\([^:]*\)/s|\\\\<\1\\\\>|\2|g;/g' | tr '\012' ' '`)")
    
    # removal of single-quoted expressions
    trans=$(echo "$trans" | sed "s/'\([^']*\)'\([ .]\|$\)/\1\2/g")
    
    # separate symbols so they can be mapped to silence
    trans=$(echo "$trans" | sed 's/[!",`|~:;?\[*]/ & /g;' | sed 's/^[ ]\+//g' | sed 's/\r//g')
    
    # for minimum pairs recordings
    if [ "$settype" = "Iso_MinPairs" ]; then
        trans=$(echo "$trans" | sed 's/-/ <sp> /g' | sed 's/([^)]*$//g')
    fi
    
    # expand numerals
    if [ "$settype" = "Iso_Random_Digit" ]; then
        trans=$(echo "$trans" | sed "s/100/isang daan/g; s/10/sampu/g; s/17/labimpito/g;\
                s/1$/isa/g; s/2$/dalawa/g; s/3$/tatlo/g; s/4$/apat/g; s/5$/lima/g; \
                s/6$/anim/g; s/7$/pito/g; s/8$/walo/g; s/9$/siyam/g; \
                s/^1/labing /g; s/^2/dalawampu't /g; s/^3/tatlumpu't /g; s/^4/apatnapu't /g; \
                s/^5/limampu't /g; s/^6/animnapu't /g; s/^7/pitumpu't /g; s/^8/walumpu't /g; \
                s/^9/siyamnapu't /g" | sed "s/'t 0//g" )
    fi
    
    # specific reading of numeral
    if [ "$filname" = "0812.110816.021250" ] || [ "$filname" = "4281.110817.051434" ] || [ "$filname" = "8125.110816.100658" ];
    then
        trans=$(echo "$trans" | sed "s/Taung 1895/taong eighteen ninety five/g")
    fi
    if [ "$filname" = "5093.110818.065955" ] || [ "$filname" = "5968.110824.005315" ];
    then
        trans=$(echo "$trans" | sed "s/Taung 1895/taong isang libo walong daan siyamnapu't lima/g")
    fi
    if [ "$filname" = "5984.110823.083245" ] || [ "$filname" = "9640.110818.005729" ];
    then
        trans=$(echo "$trans" | sed "s/Taung 1895/taong isang libo walong daan at siyamnapu't lima/g")
    fi
    
    echo | awk -v a="$(basename "$line" .wav)" -v b=$duration -v c="$trans" \
        '{print a"\t0.0\t"b"\t"tolower(c);}' |\
        sed 's/\xc3\xa0/a/g; s/\xc3\xa1/a/g; s/\xc3\xa2/a/g' |\
        sed 's/\xc3\xa9/e/g' |\
        sed 's/\xc3\xac/i/g; s/\xc3\xad/i/g; s/\xc3\xae/i/g;' |\
        sed 's/\xc3\xb2/o/g; s/\xc3\xb3/o/g; s/\xc3\xb4/o/g;' |\
        sed 's/\xc3\xba/u/g;' |\
        sed "s/\xe2\x80\x99/'/g" | sed 's/&/ and /g' | sed 's/\xc3\xbf//g' |\
        sed 's|/| <sp> |g' | sed 's/[ \t](.*)$//g' | sed 's/([^)]*)//g' | sed 's/mr\./mister/g' |\
        sed 's/\([a-z][a-z]\)\./\1 ./g' | sed 's/ [ ]*/ /g;' | sed 's/[ ]\+$//g;' |\
        sed 's/\. <sp>/<sp>/g' | sed 's/ \.[ ]\?$//g' | sed "s/'s\.$/'s/g" | sed 's/\.[.]\+/./g' |\
        sed 's/el nino/el niño/g; s/los ninos/los niños/g; s/la nina/la niña/g' | sed 's/_/ /g' |\
        sed 's/mo a\.$/mo ah/g' \
        >> $targetdir/${filname}-trn.txt
  done < <(grep '\.wav' "$sample" | sed 's/Random Digit/Iso_Random_Digit/g; s/\.txt//g; s/TGL_//g; s/"//g')

done < <(find $ISIPTGLDATATOP/* -iname "*.log" -print0)
#done < <(find $ISIPTGLDATATOP/* -iname "*0312.110923.052230.log" -print0)

CPU times: user 26.2 ms, sys: 6.28 ms, total: 32.5 ms
Wall time: 20min 22s


In [None]:
/*
!$&'(),-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz àáâéìíîñòóôúÿ’﻿
*/


In [None]:
%%time
%%bash
awk -F'\t' '{printf "%s  %s\n",$1,$4}' data/KIT_TGL/*/*/*-trn.txt

In [None]:
%%time
%%bash

########################
### ISIP Corrections ###
########################

cat data/ISIP_TGL/*/*/*-trn.txt | cut -f4- | sed 's/^ //g' | sed '/^\.\.$/d' | sed 's/[A-Za-z0-9]/ /g;' |\
tr ' ' '\012' | sort --parallel=8 | sed '/^[ ]*$/d' | uniq -c

In [None]:
%%time
%%bash
cat -v SYMBOLS | paste - SYMBOLS

In [None]:
%%time
%%bash

cut -d' ' -f3- data/train/text | tr ' ' '\012' | LC_ALL=C sort --parallel=8 | uniq | sed 's/[a-z]//g' |\
sed '/^$/d' | sort --parallel=8 | uniq

In [None]:
%%time
%%bash

cut -d' ' -f3- data/train/text | tr ' ' '\012' | LC_ALL=C sort --parallel=8 | uniq |\
sed 's/[a-z]//g' | sed '/^$/d' | sort --parallel=8 | uniq

In [None]:
%%bash
#cut -d' ' -f3- data/train/text | 
#cat data/train/text | grep '\.\.[a-z]'
cut -d' ' -f3- data/train/text  | tr ' ' '\012' | LC_ALL=C sort --parallel=8 | uniq > /storage05/user_data/angfederico01/local/g2p-seq2seq/FSC_vocab

In [8]:
%%time
%%bash

. ./cmd.sh
. ./path.sh
set -e # exit on error

#Total Duration of 43019 files: 36:00:07.20
KITGLDATATOP=data/raw/KIT_TGL
rm -fr data/KIT_TGL

while read -r name wavfile start end trans; do
    # make folder
    targetdir=data/KIT_TGL/$name/$wavfile
    mkdir -p $targetdir
    echo $KITGLDATATOP/$name/$wavfile".wav" >> $targetdir/${wavfile}-wav.list
    
    duration=`soxi $KITGLDATATOP/$name"/"$wavfile".wav" | grep -o '[0-9][0-9]:[0-9][0-9]:[0-9][0-9]\.[0-9][0-9]' |\
          awk -F':' '{s=$3+60*$2+3600*$1; print s}'`
    
    if (( $(echo "$end > 0" | bc) )); then
        duration=$end
    fi
    echo | awk -v a="$name"."$wavfile" -v b=$start -v c=$duration -v d="$trans" \
      '{printf "%s.%06.0f_%06.0f\t%s\t%s\t%s\n",a,int(1000*b),int(1000*c),b,c,tolower(d)}' \
        >> $targetdir/${wavfile}-trn.txt

done < <(cat $KITGLDATATOP/TEXT_DATA/transcripts)

CPU times: user 13.9 ms, sys: 1.16 ms, total: 15.1 ms
Wall time: 8min 25s


In [None]:
printf "%s_%04d\t%s\t%s\t%s\n",label,a,$1,$2,$3

In [6]:
%%bash

ls data/ISIP_TGL/TGLNEW_News01/*

data/ISIP_TGL/TGLNEW_News01/2578.120201.092932:
2578.120201.092932-trn.txt
2578.120201.092932-wav.list

data/ISIP_TGL/TGLNEW_News01/8375.120201.080928:
8375.120201.080928-trn.txt
8375.120201.080928-wav.list

data/ISIP_TGL/TGLNEW_News01/9687.120201.050119:
9687.120201.050119-trn.txt
9687.120201.050119-wav.list
