## Helper Functions

#### Syllable Count (adjusts NLTK tokenizer for the *magic "e"* rule)

In [None]:
SSP = SyllableTokenizer()


def magic_e(word):
    """Count the syllables of ``word``, discounting a trailing "magic e".

    The word is split with the module-level ``SSP`` tokenizer.  When the split
    has more than one piece and the last piece ends in "e", that piece is
    folded into the one before it, which lowers the count by one.

    Returns the resulting number of syllables.
    """
    pieces = SSP.tokenize(word)

    # A word that is already a single syllable is never adjusted.
    if len(pieces) == 1:
        return 1

    if re.search('e$', pieces[-1]):
        # Glue the final "...e" piece onto its predecessor.
        pieces[-2:] = [pieces[-2] + pieces[-1]]

    return len(pieces)

#### POS tagger

In [None]:
def get_POS(row):
    """Return the part-of-speech tags for the tokens in ``row``.

    ``row`` is handed to ``nltk.pos_tag``; only the second element (the tag)
    of each returned pair is kept, in the original order.
    """
    return [tagged[1] for tagged in nltk.pos_tag(row)]

#### Get IPA translations (using provided translations from https://github.com/open-dict-data/ipa-dict)

In [None]:
# Load the word -> IPA table.  ``keep_default_na=False`` stops pandas from
# parsing ordinary dictionary entries such as "null", "nan" or "NA" as NaN,
# which would otherwise make those words impossible to look up later.
df = pd.read_csv('phoneticDictionary.csv', keep_default_na=False)
# Keep only the two columns used below, renaming 'phon' to 'ipa'.
df = pd.DataFrame(list(zip(df['word'], df['phon'])), columns=['word', 'ipa'])

def ipa(section):
    """Translate every tokenised row of ``section.text`` into IPA.

    Each word is looked up in the module-level ``df`` table ('word' -> 'ipa'):

    * a word found in the table is replaced by its IPA string with the stress
      marks "ˈ" and "ˌ" removed;
    * a word for which ``word in punctuation`` holds is kept unchanged;
    * anything else becomes a single space, so each output row has exactly as
      many entries as its input row.

    Returns a list containing one list of strings per row.
    """
    ipa_dict = dict(zip(df['word'], df['ipa']))
    new_sent = []
    for row in section.text:
        sent = []
        for word in row:
            if word in ipa_dict:
                sent.append(ipa_dict[word].replace("ˈ", "").replace("ˌ", ""))
            elif word in punctuation:
                sent.append(word)
            else:
                # Placeholder that keeps positions aligned with the input row.
                sent.append(' ')
        new_sent.append(sent)
    return new_sent

## Letter-Name Alphabetic Stage Functions

#### Check for words with consonant-vowel-consonant short vowel pattern

In [None]:
def check_CVC_short(dataset):
    """Return, per row of ``dataset['ipa']``, the share of CVC short-vowel words.

    A word counts when its whole IPA string is: one or more consonants, exactly
    one vowel from ``[ɪɑæəʊɛ]``, then one or more consonants.

    Returns a list with one ratio (matches / words in the row) per row.  A row
    without any words yields 0.0 instead of raising ZeroDivisionError.
    """
    # Compiled once; the pattern is the same for every word of every row.
    cvc_pattern = re.compile(
        '^[btkzɹsjmfgndɫwpθvhʃð][btkzɹsjmfgndɫwpθvhʃðʒŋ]*[ɪɑæəʊɛ][btkzɹsjmfgndɫwpθvhʃðʒŋ]*[btkzɹsjmfgndɫwpθvhʃðʒŋ]$'
    )

    CVC_short = []
    for row in tqdm(dataset['ipa']):
        total_words = 0
        cvc = 0
        for word in row:
            total_words += 1
            if cvc_pattern.search(word):
                cvc += 1

        CVC_short.append(cvc / total_words if total_words else 0.0)

    return CVC_short

## Within-Word Pattern Stage

#### Check for basic inflectionals

In [None]:
def check_basic_inflectional(dataset):
    """Return, per row, the share of verb-tagged words that end in "s".

    ``dataset['text']`` and ``dataset['POS']`` are walked in parallel; the tag
    at position ``i`` of a POS row is taken to belong to the word at position
    ``i`` of the text row.  A word counts once when its tag is one of
    VBD/VBG/VBN/VBZ and it ends in "s" (which also covers "es"; previously
    such words were counted twice).

    Returns a list with one ratio (hits / words in the row) per row.  A row
    without any words yields 0.0 instead of raising ZeroDivisionError.
    """
    # NOTE(review): only the -s/-es ending is tested although the tag list also
    # contains VBD, VBG and VBN -- confirm whether -ed/-ing were intended too.
    verb_tags = ('VBD', 'VBG', 'VBN', 'VBZ')
    inflectional = []

    for words, tags in tqdm(list(zip(dataset['text'], dataset['POS']))):
        total_words = 0
        inf_end = 0
        for i, word in enumerate(words):
            total_words += 1
            if tags[i] in verb_tags and re.search('s$', word):
                inf_end += 1

        inflectional.append(inf_end / total_words if total_words else 0.0)

    return inflectional

#### Check for complex consonants

In [None]:
# One rule per complex-consonant feature:
# (spelling pattern or None, IPA pattern or None, must the IPA pattern match?)
# A rule fires when the spelling pattern (if any) is found in the word and the
# IPA pattern (if any) is found / not found as requested.
_COMPLEX_CONS_RULES = (
    ('g', 'g', False),      # "g" written but no "g" in the IPA
    ('^w', 'w', False),     # leading "w" written but no "w" in the IPA
    ('c', 'k', False),      # "c" written but no "k" in the IPA
    ('k', 'k', False),      # "k" written but no "k" in the IPA
    (None, 'dʒ$', True),    # IPA ends in "dʒ"
    ('se$', 'z$', True),    # "-se" spelling with IPA ending in "z"
    ('b$', 'b', False),     # final "b" written but no "b" in the IPA
    ('ce$', 's$', True),    # "-ce" spelling with IPA ending in "s"
    ('[btkzrsjmfndlwpvhg]ch$', None, True),   # consonant + "ch" ending
    ('[btkzrsjmfndlwpvhg]ge$', None, True),   # consonant + "ge" ending
)


def _complex_cons_hits(spelling, phon):
    """Count how many rules in ``_COMPLEX_CONS_RULES`` fire for one word."""
    hits = 0
    for text_pat, ipa_pat, ipa_should_match in _COMPLEX_CONS_RULES:
        if text_pat is not None and not re.search(text_pat, spelling):
            continue
        if ipa_pat is not None and bool(re.search(ipa_pat, phon)) != ipa_should_match:
            continue
        hits += 1
    return hits


def check_complex_cons(dataset):
    """Return, per row, complex-consonant features divided by word count.

    ``dataset['text']`` and ``dataset['ipa']`` are walked in parallel; the IPA
    entry at position ``i`` is taken to belong to the word at position ``i``.
    Only words for which ``magic_e`` reports exactly one syllable are
    inspected, and a single word may contribute several features.

    Returns a list with one value (features / words in the row) per row.  A
    row without any words yields 0.0 instead of raising ZeroDivisionError.
    """
    # NOTE(review): if dataset['ipa'] holds the ' ' placeholder that ipa()
    # emits for untranslated words, every "letter written but not pronounced"
    # rule fires for those words -- confirm this is intended.
    _complex = []

    for words, phons in tqdm(list(zip(dataset['text'], dataset['ipa']))):
        total_words = 0
        com_cons = 0
        for i, word in enumerate(words):
            total_words += 1
            if magic_e(word) == 1:
                com_cons += _complex_cons_hits(word, phons[i])

        _complex.append(com_cons / total_words if total_words else 0.0)

    return _complex

## Syllables & Affixes Stage

#### Check type of syllable juncture

In [None]:
def VV_word(row):
    """Map each IPA word in ``row`` to whether it shows a V/V pattern.

    A word is flagged when it contains
    * three vowel symbols in a row, or
    * a vowel symbol directly followed by "ɫ" or "ɝ", or
    * a pair of adjacent vowel symbols that is not one of ``DIPHTHONGS``.

    The flag is evaluated independently for every word (it used to stay True
    for all words following the first hit).

    Returns a dict ``{word: bool}``.
    """
    DIPHTHONGS = ['aɪ', 'eɪ', 'ɪə', 'ɔɪ', 'aʊ', 'oʊ', 'ʊə', 'eə']

    vv_dict = {}
    for word in row:
        if re.search('[ɪɑæəʊɛiuɔaoe][ɪɑæəʊɛiuɔaoe][ɪɑæəʊɛiuɔaoe]', word):
            vv = True
        elif re.search('[ɪɑæəʊɛiuɔaoe][ɫɝ]', word):
            vv = True
        else:
            # Non-overlapping vowel pairs; any pair outside the diphthong list
            # marks the word.  No pairs at all leaves the word unflagged.
            pairs = re.findall('[ɪɑæəʊɛiuɔaoe][ɪɑæəʊɛiuɔaoe]', word)
            vv = any(pair not in DIPHTHONGS for pair in pairs)
        vv_dict[word] = vv

    return vv_dict
    
    
def VCCV_doublet_word(row):
    """Map each multi-syllable word in ``row`` to its VCCV-doublet flag.

    A word is flagged when its spelling contains vowel-consonant-consonant-
    vowel and, in the sequence returned by ``magic_e_result`` (presumably the
    word's syllables -- defined elsewhere), the first item ends with a letter
    from ``CONSONANTS_TEXT`` that is also the first letter of the second item.

    Words for which ``magic_e`` reports fewer than two syllables are skipped
    and do not appear in the result; they no longer end the scan of the row.
    The flag is evaluated independently for every word.

    Returns a dict ``{word: bool}``.
    """
    vccv_doublet_dict = {}
    for word in row:
        if magic_e(word) < 2:
            continue

        vccv = False
        if re.search('[aeiou][btkzrsjmfndlwpvhg][btkzrsjmfndlwpvhg][aeiou]', word):
            result = magic_e_result(word)
            con = result[0][-1]
            # Doublet: the same consonant closes one item and opens the next.
            if con in CONSONANTS_TEXT and con == result[1][0]:
                vccv = True
        vccv_doublet_dict[word] = vccv

    return vccv_doublet_dict
    

def VCCV_word(row):
    """Map each multi-syllable word in ``row`` to its (non-doublet) VCCV flag.

    A word is flagged when its spelling contains vowel-consonant-consonant-
    vowel and, in the sequence returned by ``magic_e_result`` (presumably the
    word's syllables -- defined elsewhere), the first item ends with a letter
    from ``CONSONANTS_TEXT`` that differs from the first letter of the second
    item.

    Words for which ``magic_e`` reports fewer than two syllables are skipped
    and do not appear in the result; they no longer end the scan of the row.
    The flag is evaluated independently for every word.

    Returns a dict ``{word: bool}``.
    """
    vccv_dict = {}
    for word in row:
        if magic_e(word) < 2:
            continue

        vccv = False
        if re.search('[aeiou][btkzrsjmfndlwpvhg][btkzrsjmfndlwpvhg][aeiou]', word):
            result = magic_e_result(word)
            con = result[0][-1]
            # Two different consonants meet at the boundary.
            if con in CONSONANTS_TEXT and con != result[1][0]:
                vccv = True
        vccv_dict[word] = vccv

    return vccv_dict


def VCCCV_word(row):
    """Map each word in ``row`` to whether it shows a VCCCV pattern.

    A word is flagged when ``magic_e`` reports exactly two syllables and its
    spelling contains a vowel, three consonants and another vowel in a row.
    The flag is evaluated independently for every word (it used to stay True
    for all words following the first hit).

    Returns a dict ``{word: bool}``.
    """
    vcccv_list = {}
    for word in row:
        vcccv_list[word] = magic_e(word) == 2 and bool(
            re.search('[aeiou][btkzrsjmfndlwpvhg][btkzrsjmfndlwpvhg][btkzrsjmfndlwpvhg][aeiou]', word)
        )

    return vcccv_list


def VVCV_word(row):
    """Map each word in ``row`` to whether it shows a VVCV pattern.

    A word is flagged when ``magic_e`` reports exactly two syllables and its
    spelling contains two vowels, one consonant and another vowel in a row.
    The flag is evaluated independently for every word (it used to stay True
    for all words following the first hit).

    Returns a dict ``{word: bool}``.
    """
    vvcv_list = {}
    for word in row:
        vvcv_list[word] = magic_e(word) == 2 and bool(
            re.search('[aeiou][aeiou][btkzrsjmfndlwpvhg][aeiou]', word)
        )

    return vvcv_list

## Derivational Relations Stage

#### Check for advanced suffixes

In [None]:
def check_adv_suffix_word(row):
    """Return the words of ``row`` whose IPA form carries an advanced suffix.

    For every word the POS tag (``get_POS``) and the IPA form (``ipa_word``,
    defined elsewhere) are obtained.  A word is reported when its IPA form is
    non-empty, ``magic_e`` reports more than one syllable, and
    * its tag is an adjective/noun tag and one of ``adj_n_suffix`` is found in
      the IPA form, or
    * its tag is a verb tag and one of ``v_suffix`` is found in the IPA form.

    The word itself is reported (previously it was recovered through an
    IPA -> word dictionary, which returned the wrong word whenever two words
    of the row shared an IPA form).

    Returns a list of the matching words without duplicates, in order of first
    appearance.
    """
    adjs_nouns = ['JJR', 'JJS', 'JJ', 'NN', 'NNP', 'NNS']
    verbs = ['VB', 'VBD', 'VBG', 'VBN', 'VBZ', 'VBP']

    # NOTE(review): 'ʒən' is listed twice and several patterns (here and in
    # v_suffix) carry no '$' anchor, so they match anywhere in the word --
    # confirm against the intended suffix list.
    adj_n_suffix = ['ɛɹi$', 'ɔɹi$', 'ənsi$', 'əns$', 'ʒən', 'ʒən', 'ʃən', 'əbəɫ$', 'əbɫi$']
    v_suffix = ['aɪz', 'ɪfaɪ', 'əfaɪ']

    pos = get_POS(row)
    retWords = []
    for word, tag in zip(row, pos):
        phon = ipa_word(word)
        if len(phon) == 0:
            continue
        if magic_e(word) <= 1:
            continue

        if tag in adjs_nouns:
            suffixes = adj_n_suffix
        elif tag in verbs:
            suffixes = v_suffix
        else:
            continue

        if word not in retWords and any(re.search(suf, phon) for suf in suffixes):
            retWords.append(word)

    return retWords

#### Check for assimilated prefixes

In [None]:
def check_assimilated_row(row):
    """Return the words of ``row`` that start with an assimilated prefix.

    A word qualifies when ``magic_e_result`` (defined elsewhere) returns more
    than one item for it and its spelling starts with one of the prefixes
    below.  Words starting with "app" are excluded when they contain "apples".

    Returns a list of the qualifying words without duplicates, in order of
    first appearance.
    """
    prefixes = (
        '^ill', '^imm[aeiou]', '^imp', '^irr[aeiou]',
        '^suff', '^supp', '^succ', '^surr',
        '^coll', '^corr',
        '^att', '^aff', '^agg', '^all', '^ann', '^ass', '^arr',
        '^diff', '^eff',
        '^opp', '^off', '^occ',
    )

    retWords = []
    for word in row:
        if len(magic_e_result(word)) <= 1:
            continue
        if word in retWords:
            continue

        has_prefix = any(re.search(prefix, word) for prefix in prefixes)
        # "app..." counts as well, unless the word contains "apples".
        if not has_prefix and re.search('^app', word):
            has_prefix = not re.search('apples', word)

        if has_prefix:
            retWords.append(word)

    return retWords