# Using syllabifyARPA

In [1]:
from syllabifyARPA import syllabifyARPA

## With Python strings and arrays

In [2]:
# The same transcription in the two accepted input forms:
# a single space-separated string of phones ...
ARPAstring = 'HH AE NG M AE N'
# ... and a list with one element per phone
ARPAarray = ARPAstring.split(' ')

In [3]:
# The string form and the phone-list form give the same result
for phones in (ARPAstring, ARPAarray):
    print(syllabifyARPA(phones))

0    HH AE NG
1      M AE N
dtype: object
0    HH AE NG
1      M AE N
dtype: object


In [4]:
# The default return type is a Pandas Series; pass return_list=True to get a list instead
syllable_list = syllabifyARPA(ARPAarray, return_list=True)
syllable_list

['HH AE NG', 'M AE N']

In [5]:
# A phone sequence that cannot be syllabified produces an empty Series
unsyllabifiable = 'M G L AA'
empty_result = syllabifyARPA(unsyllabifiable)
empty_result



Series([], dtype: float64)

## With Pandas Series and DataFrames

### Importing CMU dictionary data from the text file

In [6]:
import pandas as pd
from syllabifyARPA import syllabifyARPA

# Each line of the dictionary file is one entry ("WORD PH1 PH2 ..."), so the file is
# loaded line by line into a single 'dict' column. pd.read_csv(..., delimiter='\n') is
# avoided because recent pandas versions raise a ValueError when the line terminator
# is passed as the separator.
CMUDICT_PATH = 'cmudict.txt'  # Swap in 'cmusubset.txt' for quick testing

# utf-8 matches the default encoding pd.read_csv would have used
with open(CMUDICT_PATH, encoding='utf-8') as cmudict_file:
    # Drop everything from the first '#' onwards (comment marker) and the line ending
    entries = [line.split('#', 1)[0].rstrip('\r\n') for line in cmudict_file]

# Skip lines that are blank or consisted only of a comment
df = pd.DataFrame({'dict': [entry for entry in entries if entry.strip()]})
df.head()

Unnamed: 0,dict
0,!EXCLAMATION-POINT EH2 K S K L AH0 M EY1 SH A...
1,"""CLOSE-QUOTE K L OW1 Z K W OW1 T"
2,"""DOUBLE-QUOTE D AH1 B AH0 L K W OW1 T"
3,"""END-OF-QUOTE EH1 N D AH0 V K W OW1 T"
4,"""END-QUOTE EH1 N D K W OW1 T"


### Preprocessing data

In [7]:
# Keep only the entries made up exclusively of uppercase letters, the digits 0-2 and
# spaces; any row containing another character (punctuation, quotes, ...) is dropped
has_other_chars = df['dict'].str.contains(r'[^A-Z0-2 ]', na=True)
clean_entries = df.loc[~has_other_chars, 'dict']

# The named groups become the 'word' and 'transcription' columns
df = clean_entries.str.extract(r'(?P<word>\w+) (?P<transcription>.+)', expand=True)

# Splitting into an array of phonemes described in phoneset.txt
df = df.assign(transcription=df['transcription'].str.split())
df.head()

Unnamed: 0,word,transcription
38,A,[AH0]
45,AABERG,"[AA1, B, ER0, G]"
46,AACHEN,"[AA1, K, AH0, N]"
47,AAKER,"[AA1, K, ER0]"
48,AALSETH,"[AA1, L, S, EH0, TH]"


### Applying syllabifyARPA and manipulating the returned DataFrame

In [8]:
# Syllabify every transcription. The result is a DataFrame with one column per syllable
# position; positions beyond a word's last syllable are left empty.
transcriptions = df['transcription']
syllables = transcriptions.apply(syllabifyARPA, silence_warnings=True)
# Optionally chain .dropna(thresh=1) onto the call above to drop unsyllabifiable rows
syllables.head()

ERROR:root:Impossible to syllabify AE1 B T S according to English syllabification rules.
ERROR:root:Impossible to syllabify EY1 N Y according to English syllabification rules.
ERROR:root:Impossible to syllabify EY1 TH N according to English syllabification rules.
ERROR:root:Impossible to syllabify AA1 L D Y S according to English syllabification rules.
ERROR:root:Impossible to syllabify AE1 L R D according to English syllabification rules.
ERROR:root:Impossible to syllabify AO1 L R D according to English syllabification rules.
ERROR:root:Impossible to syllabify AE1 M K according to English syllabification rules.
ERROR:root:Impossible to syllabify AE1 NG SH T AE0 T according to English syllabification rules.
ERROR:root:Impossible to syllabify AE1 N DH Z according to English syllabification rules.
ERROR:root:Impossible to syllabify AA1 R L T according to English syllabification rules.
ERROR:root:Impossible to syllabify AA1 R N S T according to English syllabification rules.
ERROR:root:Im

ERROR:root:Impossible to syllabify JH ER1 D T S according to English syllabification rules.
ERROR:root:Impossible to syllabify G EH1 SH K according to English syllabification rules.
ERROR:root:Impossible to syllabify N AA1 R L D according to English syllabification rules.
ERROR:root:Impossible to syllabify G OW1 T CH according to English syllabification rules.
ERROR:root:Impossible to syllabify G OW1 T Z according to English syllabification rules.
ERROR:root:Impossible to syllabify G AA1 T CH according to English syllabification rules.
ERROR:root:Impossible to syllabify G R IY1 N HH AH2 L G according to English syllabification rules.
ERROR:root:Impossible to syllabify G R IY1 W according to English syllabification rules.
ERROR:root:Impossible to syllabify HH AA1 Y D UW0 according to English syllabification rules.
ERROR:root:Impossible to syllabify HH AE1 SH K according to English syllabification rules.
ERROR:root:Impossible to syllabify HH AY1 D T according to English syllabification r

ERROR:root:Impossible to syllabify M AA1 R DH according to English syllabification rules.
ERROR:root:Impossible to syllabify M AE1 SH K according to English syllabification rules.
ERROR:root:Impossible to syllabify M AE1 T DH Z according to English syllabification rules.
ERROR:root:Impossible to syllabify M AH0 T Y UW1 L IH0 HH according to English syllabification rules.
ERROR:root:Impossible to syllabify M AE1 T Z K according to English syllabification rules.
ERROR:root:Impossible to syllabify M EH1 D V D according to English syllabification rules.
ERROR:root:Impossible to syllabify M IY1 T Z according to English syllabification rules.
ERROR:root:Impossible to syllabify M EH1 N SH according to English syllabification rules.
ERROR:root:Impossible to syllabify M EH1 SH K according to English syllabification rules.
ERROR:root:Impossible to syllabify M EH1 T Z according to English syllabification rules.
ERROR:root:Impossible to syllabify M IH0 K Y UW1 L IH0 HH according to English syllabi

ERROR:root:Impossible to syllabify S M OW0 L EH1 N S K according to English syllabification rules.
ERROR:root:Impossible to syllabify S AA1 B IY0 HH according to English syllabification rules.
ERROR:root:Impossible to syllabify S OW1 Y K AH0 according to English syllabification rules.
ERROR:root:Impossible to syllabify S P R IY1 HH according to English syllabification rules.
ERROR:root:Impossible to syllabify S T AE1 D T according to English syllabification rules.
ERROR:root:Impossible to syllabify S T AE1 N P HH IH2 L according to English syllabification rules.
ERROR:root:Impossible to syllabify S T AO1 D T according to English syllabification rules.
ERROR:root:Impossible to syllabify S T AH1 D T according to English syllabification rules.
ERROR:root:Impossible to syllabify S T R OW1 Y N IY0 according to English syllabification rules.
ERROR:root:Impossible to syllabify S T AH1 D T according to English syllabification rules.
ERROR:root:Impossible to syllabify S T AH1 M P F according to

Unnamed: 0,0,1,2,3,4,5,6,7,8
38,AH0,,,,,,,,
45,AA1,B ER0 G,,,,,,,
46,AA1,K AH0 N,,,,,,,
47,AA1,K ER0,,,,,,,
48,AA1 L,S EH0 TH,,,,,,,


In [9]:
# Attach the syllable columns to the word/transcription columns. Selecting just those
# two columns first keeps this cell idempotent: re-running it does not stack another
# copy of the syllable columns onto df.
df = pd.concat([df[['word', 'transcription']], syllables], axis=1)

# Reshape to long format: one row per (word, syllable position) pair
melted_df = df.melt(
    id_vars=['word', 'transcription'],
    var_name='position',
    value_name='syllable',
).dropna()  # Drops all NaN positions, e.g., syllables 3+ in a disyllabic word

In [10]:
# Group each word's syllables together by sorting on the word. Only 'word' is a sort
# key, so the syllables of one word are not necessarily listed in position order.
melted_df = melted_df.sort_values(by='word')
melted_df.head()

Unnamed: 0,word,transcription,position,syllable
0,A,[AH0],0,AH0
102287,AABERG,"[AA1, B, ER0, G]",1,B ER0 G
1,AABERG,"[AA1, B, ER0, G]",0,AA1
2,AACHEN,"[AA1, K, AH0, N]",0,AA1
102288,AACHEN,"[AA1, K, AH0, N]",1,K AH0 N
