# Testing syllabifyARPA

### Importing CMU dictionary data from the text file

In [1]:
import pandas as pd
from syllabifyARPA import syllabifyARPA

df = pd.read_csv('cmudict.txt', delimiter='\n', header=None, quoting=3, comment='#', names=['dict'])
#df = pd.read_csv('cmusubset.txt', delimiter='\n', header=None, names=['dict']) # For quick testing
df.head()

Unnamed: 0,dict
0,!EXCLAMATION-POINT EH2 K S K L AH0 M EY1 SH A...
1,"""CLOSE-QUOTE K L OW1 Z K W OW1 T"
2,"""DOUBLE-QUOTE D AH1 B AH0 L K W OW1 T"
3,"""END-OF-QUOTE EH1 N D AH0 V K W OW1 T"
4,"""END-QUOTE EH1 N D K W OW1 T"


### Preprocessing data

In [2]:
# Removing all rows containing non-alphanumeric characters and spaces 
df = df[df['dict'].str.contains(r'[^A-Z0-2 ]') == False]
df = df['dict'].str.extract(r'(?P<word>\w+) (?P<transcription>.+)', expand=True)
# Splitting into an array of phonemes described in phoneset.txt
df['transcription'] = df['transcription'].str.split()
df.head()

Unnamed: 0,word,transcription
38,A,[AH0]
45,AABERG,"[AA1, B, ER0, G]"
46,AACHEN,"[AA1, K, AH0, N]"
47,AAKER,"[AA1, K, ER0]"
48,AALSETH,"[AA1, L, S, EH0, TH]"


### Applying syllabifyARPA and manipulating the returned DataFrame

In [3]:
syllables = df['transcription'].apply(syllabifyARPA)#.dropna(thresh=1) # Drops unsyllabifiable rows
syllables.head()

['B', 'T', 'S']
['D', 'S']
['N', 'Y']
['TH', 'N']
['L', 'R', 'D']
['L', 'R', 'D']
['M', 'K']
['D', 'S', 'T']
['NG', 'S', 'T']
['NG', 'SH']
['N', 'DH', 'Z']
['R', 'L', 'T']
['R', 'Z', 'T']
['S', 'TH']
['HH']
['N', 'K']
['T', 'P']
['R', 'SH', 'CH']
['HH']
['TH', 'K']
['NG', 'T']
['N', 'SH']
['TH', 'K']
['SH', 'K']
['B', 'S', 'T']
['R', 'D', 'T']
['R', 'W', 'N']
['R', 'ZH']
['R', 'ZH']
['W']
['W']
['K', 'S', 'K', 'S']
['HH']
['D', 'T']
['T', 'Z', 'K']
['B', 'S', 'T']
['D', 'T']
['R', 'L', 'S']
['Y']
['Y']
['T', 'R']
['R', 'ZH']
['M', 'P', 'CH']
['R', 'D', 'T', 'S']
['Z', 'M']
['N', 'SH']
['N', 'SH']
['N', 'K']
['L', 'R']
['T', 'L']
['T', 'Z']
['L', 'G']
['P', 'K']
['M', 'K']
['M', 'T']
['HH']
['W']
['L', 'G']
['M', 'CH']
['M', 'JH']
['R', 'L', 'D']
['N', 'SH']
['N', 'W']
['Z', 'M']
['P', 'F']
['L', 'S', 'K']
['N', 'S', 'K']
['G', 'T']
['Y']
['K', 'S', 'K', 'S']
['T', 'S', 'K']
['D', 'T']
['T', 'Z']
['T', 'Z', 'SH']
['N', 'S', 'K']
['R', 'S', 'K']
['T', 'Z', 'K']
['N', 'S', 'K']
['N', 'S',

(101850, 9)

In [4]:
df = pd.concat([df, syllables], axis=1)
melted_df = df.melt(id_vars=['word', 'transcription'], value_name='syllable', var_name='position')
melted_df.dropna(inplace=True) # Drops all NaN positions, e.g., syllables 3+ in a disyllabic word

In [5]:
melted_df.sort_values('word', inplace=True)
melted_df.head()

Unnamed: 0,word,transcription,position,syllable
0,A,[AH0],0,AH0
102287,AABERG,"[AA1, B, ER0, G]",1,B ER0 G
1,AABERG,"[AA1, B, ER0, G]",0,AA1
2,AACHEN,"[AA1, K, AH0, N]",0,AA1
102288,AACHEN,"[AA1, K, AH0, N]",1,K AH0 N
