In [33]:
import re
import pandas as pd

In [34]:
def open_and_read(infile_path_and_name):
    """
    Opens a text file and returns its whole content as one long string with
    every newline character removed.

    The file is decoded explicitly as UTF-8. The transcriptions contain IPA
    symbols and combining diacritics, which the platform default encoding
    (e.g. cp1252 on Windows) would either reject or silently garble.

    Note that lines are joined with nothing in between, so a word separator
    that is split across a line break will not survive the join.

    :infile_path_and_name: string of the path/path/txtFileName.txt from the current directory

    :returns: the file content without newline characters
    :rtype: str

    """
    with open(infile_path_and_name, 'r', encoding='utf-8') as file:
        return file.read().replace('\n', '')

In [35]:
def split_sentence(sentence: str) -> list:
    """
    Takes a sentence in IPA and parses it to individual words by breaking according to
    the " # " IPA string pattern.
    
    :sentence: sentence to parse
    
    :returns: list of individual words
    :rtype: list
    
    """
    words = sentence.split(" # ")
    return words

In [36]:
def rm_stress(word_list):
    """
    Takes a list of strings in IPA that contain prosodic accent marks and
    removes those marks to clean the data.

    The accent mark removed is the apostrophe character ('); syllable hyphens
    and everything else are left untouched.

    :word_list: list of strings

    :returns: new list of strings without prosodic accent marks (the input
              list is not modified)
    :rtype: list of strings

    """
    # Compile once instead of once per word.
    stress_mark = re.compile(r"'")
    return [stress_mark.sub("", word) for word in word_list]

In [37]:
def syllabize_further(word: str) -> list:
    """
    Cuts a hyphenated word form into its syllables.

    :word: str: word form with hyphens marking syllable boundaries

    :returns: list of individual syllables, in order
    :rtype: list

    """
    syllable_boundary = "-"
    return word.split(syllable_boundary)

In [38]:
# Vowel inventories used by vowel_lists_append. Order matters: the first entry
# of an inventory that occurs in a syllable wins, so each sequence is also a
# priority list.
_PURE_VOWELS = ('a', 'e', 'i', 'o', 'u', 'ɛ', 'æ', 'ə', 'ʌ', 'ɪ', 'ɔ', 'ɑ', 'ʊ')
# Only these five vowels are recognised as plain vowels on the prescriptive side.
_PRESCRIPTIVE_VOWELS = ('a', 'e', 'i', 'o', 'u')
# Length is transcribed with an ASCII colon: 'a:', 'e:', ...
_LONG_VOWELS = tuple(vowel + ':' for vowel in _PURE_VOWELS)
# Vowel + off-glide (u̯ first, then i̯), followed by on-glide + vowel
# (i̯ first, then u̯).
_DIPHTHONGS = (
    tuple(vowel + 'u̯' for vowel in _PURE_VOWELS)
    + tuple(vowel + 'i̯' for vowel in _PURE_VOWELS)
    + tuple('i̯' + vowel for vowel in _PURE_VOWELS)
    + tuple('u̯' + vowel for vowel in _PURE_VOWELS)
)


def _first_vowel_match(syllable, *inventories):
    """Return the first inventory entry contained in syllable, or None if there is none."""
    for inventory in inventories:
        for candidate in inventory:
            if candidate in syllable:
                return candidate
    return None


def vowel_lists_append(prescrip_string, descrip_string, prescrip_vowel_ls, descrip_vowel_ls):
    """
    Finds the vowel of a prescriptive syllable and of the matching student
    syllable and appends each one to its running list of vowels.

    Exactly one item is appended to each list per call, so the two lists stay
    aligned with each other and with the caller's per-syllable bookkeeping.

    Matching is by inventory order, not by position inside the syllable:
      * prescriptive syllable: diphthongs first, then a, e, i, o, u
      * student syllable: long vowels first, then diphthongs, then pure vowels

    If a syllable contains none of the known vowels, None is appended for it.
    (Earlier versions looped forever in that situation.)

    :prescrip_string: the syllable with the 'correct' vowel
    :descrip_string: the syllable with the student's pronunciation of the vowel
    :prescrip_vowel_ls: the list of all of the 'correct' vowel pronunciations (modified in place)
    :descrip_vowel_ls: the list with all of the student's pronunciations of the vowels (modified in place)

    :returns: nothing; the two lists passed in are extended in place
    :rtype: None

    """
    prescrip_vowel_ls.append(
        _first_vowel_match(prescrip_string, _DIPHTHONGS, _PRESCRIPTIVE_VOWELS)
    )
    descrip_vowel_ls.append(
        _first_vowel_match(descrip_string, _LONG_VOWELS, _DIPHTHONGS, _PURE_VOWELS)
    )

In [39]:
#descriptive_ls = ['a:']
#prescriptive_ls = ['a']

#descriptive = 'hei̯'
#prescriptive = 'hee'

In [40]:
def string_list_phoneme_compare(response,answer):
    """
    Takes two lists of (IPA) strings--the student response (response) and the correct answer (answer)--
    and lines them up syllable by syllable. For every syllable the result records the word, the
    syllable, the vowel expected by the answer and the vowel the student produced.

    Parameters
    ----------
    response : list of strings
        The student's words, with hyphens marking syllable boundaries.
    answer : list of strings
        The prescriptive words, in the same order and format.

    AS OF RIGHT NOW:
        (1) lists of strings must be of equal length; if they are not, nothing is compared
            and an empty table (with the usual columns) is returned without any message
        (2) the number of syllables per string must be of equal length; a word that breaks
            this rule is reported with a printed message and left out of the table

    :returns: a table with one row per compared syllable and the columns word_number,
              prescriptive_pronunciation, student_pronunciation, syllable_number,
              prescriptive_syllable, student_syllable, correct_allophone, student_allophone
    :rtype: dataframe

    """
    columns = ['word_number','prescriptive_pronunciation','student_pronunciation','syllable_number','prescriptive_syllable','student_syllable','correct_allophone','student_allophone']

    rows = []  # one (word_number, answer word, response word, syllable_number, p syllable, d syllable) per syllable
    prescriptive_allophones = []
    student_allophones = []

    if len(response) == len(answer):

        for word_number, (response_word, answer_word) in enumerate(zip(response, answer)):

            # break each word into a smaller list of individual syllables
            d_list_of_syllables = syllabize_further(response_word)
            p_list_of_syllables = syllabize_further(answer_word)

            if len(d_list_of_syllables) != len(p_list_of_syllables):
                print(f"Word {word_number} in the student response has the wrong number of syllables.")
                continue

            syllable_pairs = zip(p_list_of_syllables, d_list_of_syllables)
            for syllable_number, (prescriptive_syllable, descriptive_syllable) in enumerate(syllable_pairs):

                # appends the vowel of each syllable to the two allophone lists
                vowel_lists_append(prescriptive_syllable, descriptive_syllable,
                                   prescriptive_allophones, student_allophones)

                rows.append((word_number, answer_word, response_word, syllable_number,
                             prescriptive_syllable, descriptive_syllable))

    # Join the per-syllable bookkeeping with the allophones collected for it.
    records = [
        row + (correct_allophone, student_allophone)
        for row, correct_allophone, student_allophone
        in zip(rows, prescriptive_allophones, student_allophones)
    ]

    return pd.DataFrame(records, columns=columns)
                        
                    

In [41]:
# Read every transcription file into one long string (newlines removed by open_and_read).
# NOTE(review): there is no partic06 in this list -- confirm that this is intentional.
prescriptive_string = open_and_read("transcriptions/prescriptive_ls.txt")
partic01_string = open_and_read("transcriptions/partic01.txt")
partic02_string = open_and_read("transcriptions/partic02.txt")
partic03_string = open_and_read("transcriptions/partic03.txt")
partic04_string = open_and_read("transcriptions/partic04.txt")
partic05_string = open_and_read("transcriptions/partic05.txt")
partic07_string = open_and_read("transcriptions/partic07.txt")

In [42]:
# Cut each transcription string into a list of words at the " # " separators.
prescriptive_raw = split_sentence(prescriptive_string)
partic01_raw = split_sentence(partic01_string)
partic02_raw = split_sentence(partic02_string)
partic03_raw = split_sentence(partic03_string)
partic04_raw = split_sentence(partic04_string)
partic05_raw = split_sentence(partic05_string)
partic07_raw = split_sentence(partic07_string)

In [43]:
# Remove the ' accent marks from every word; the syllable hyphens are kept.
prescriptive = rm_stress(prescriptive_raw)
partic01 = rm_stress(partic01_raw)
partic02 = rm_stress(partic02_raw)
partic03 = rm_stress(partic03_raw)
partic04 = rm_stress(partic04_raw)
partic05 = rm_stress(partic05_raw)
partic07 = rm_stress(partic07_raw)

In [44]:
# All participant word lists, in participant order, for batch comparison against `prescriptive`.
data = [partic01,partic02,partic03,partic04,partic05,partic07]

In [45]:
# Number of words in the prescriptive transcription (displayed as the cell output).
len(prescriptive)

25

In [46]:
def datasheet_compile(list_of_lists, prescriptive_list):
    """
    Runs string_list_phoneme_compare once per participant and collects the resulting tables.

    Every word list in list_of_lists is compared against the same prescriptive list, and the
    dataframes come back in the same order as the word lists were given.

    :list_of_lists: a list of lists of strings; all lists must be the same length
    :prescriptive_list: a list of strings; prescriptive list must be same len as all lists in list_of_lists

    :returns: one comparison dataframe per list in list_of_lists
    :rtype: list of pandas DataFrames

    """
    return [
        string_list_phoneme_compare(participant_words, prescriptive_list)
        for participant_words in list_of_lists
    ]

In [48]:
# Load the per-word table from dictionary.csv; the bare name on the next line
# makes the notebook display it.
dictionary = pd.read_csv("dictionary.csv")
dictionary

Unnamed: 0,word,vowel_focus1,vowel_focus2,init_vowel,term_vowel,cognate
0,taza,a,,0,1,0
1,sabe,e,a,0,1,0
2,casi,i,a,0,1,0
3,tanto,o,a,0,1,0
4,su,u,,0,1,0
5,ayuda,a,u,1,1,0
6,educación,e,u,1,0,1
7,importante,i,e,1,1,1
8,oportunidad,o,i,1,0,1
9,usar,u,,1,0,1


In [57]:
# For each 0/1 flag column of the dictionary table: keep the rows where the
# flag is set, count them, and report the count.

# vowel-initial words
vowel_init = dictionary.loc[dictionary['init_vowel'].eq(1)]
vowel_init_count = vowel_init.shape[0]
print("Words in set that are vowel initial: " + str(vowel_init_count))

# words ending in a vowel
term_vowel = dictionary.loc[dictionary['term_vowel'].eq(1)]
term_vowel_count = term_vowel.shape[0]
print("Words in set that are terminal vowel: " + str(term_vowel_count))

# cognates
cognates = dictionary.loc[dictionary['cognate'].eq(1)]
cognates_count = cognates.shape[0]
print("Words in set that are cognates: " + str(cognates_count))

Words in set that are vowel initial: 15
Words in set that are terminal vowel: 17
Words in set that are cognates: 6


In [32]:
# Build one comparison table per participant, then display the one at index 1
# (the second entry of `data`, i.e. partic02).
dfs = datasheet_compile(data,prescriptive)
dfs[1]

Unnamed: 0,word_number,prescriptive_pronunciation,student_pronunciation,syllable_number,prescriptive_syllable,student_syllable,correct_allophone,student_allophone
0,0,i-se,i-se,0,i,i,i,i
1,0,i-se,i-se,1,se,se,e,e
2,1,kom-bi-na-si̯on,kom-bi-na-si̯on,0,kom,kom,o,o
3,1,kom-bi-na-si̯on,kom-bi-na-si̯on,1,bi,bi,i,i
4,1,kom-bi-na-si̯on,kom-bi-na-si̯on,2,na,na,a,a
5,1,kom-bi-na-si̯on,kom-bi-na-si̯on,3,si̯on,si̯on,i̯o,i̯o
6,2,i-ni-si̯o,i-ni-si̯o,0,i,i,i,i
7,2,i-ni-si̯o,i-ni-si̯o,1,ni,ni,i,i
8,2,i-ni-si̯o,i-ni-si̯o,2,si̯o,si̯o,i̯o,i̯o
9,3,e-ðu-ka-si̯on,e-du-ka-si̯on,0,e,e,e,e


In [15]:
# Direct single-participant comparison for partic02; should match dfs[1].
string_list_phoneme_compare(partic02,prescriptive)

Unnamed: 0,word_number,prescriptive_pronunciation,student_pronunciation,syllable_number,prescriptive_syllable,student_syllable,correct_allophone,student_allophone
0,0,i-se,i-se,0,i,i,i,i
1,0,i-se,i-se,1,se,se,e,e
2,1,kom-bi-na-si̯on,kom-bi-na-si̯on,0,kom,kom,o,o
3,1,kom-bi-na-si̯on,kom-bi-na-si̯on,1,bi,bi,i,i
4,1,kom-bi-na-si̯on,kom-bi-na-si̯on,2,na,na,a,a
5,1,kom-bi-na-si̯on,kom-bi-na-si̯on,3,si̯on,si̯on,i̯o,i̯o
6,2,i-ni-si̯o,i-ni-si̯o,0,i,i,i,i
7,2,i-ni-si̯o,i-ni-si̯o,1,ni,ni,i,i
8,2,i-ni-si̯o,i-ni-si̯o,2,si̯o,si̯o,i̯o,i̯o
9,3,e-ðu-ka-si̯on,e-du-ka-si̯on,0,e,e,e,e
