# Learnability Project: PFA Phonotactic Learner


In [213]:
from pyfoma import FST, State

fish = "⋉"

def data_iterator(path):
    """Yield each line of the corpus at ``path`` as a list of tokens.

    Every line is treated as one word; its tokens are the whitespace-separated
    fields returned by ``str.split()``, so a blank line yields an empty list.

    Args:
        path: Path to the corpus text file.

    Yields:
        list[str]: The tokens of the next line in the file.
    """
    # Decode explicitly as UTF-8 so symbols outside ASCII are read the same
    # way on every platform instead of depending on the locale default.
    # NOTE(review): assumes the corpus is stored as UTF-8 -- confirm.
    with open(path, "r", encoding="utf-8") as fin:
        # Iterate the file object directly so lines are streamed rather than
        # loaded into memory all at once.
        for line in fin:
            yield line.split()
            
def make_alphabet(path_to_corpus) -> list:
    """Collect the distinct symbols of a corpus in order of first appearance.

    Args:
        path_to_corpus: Path handed to ``data_iterator`` to read the words.

    Returns:
        A list holding each symbol seen in the corpus exactly once, ordered
        by where it first occurs.
    """
    # dict keys are unique and remember insertion order, which gives the same
    # "append if not seen yet" result as a manual membership check.
    first_seen = dict.fromkeys(
        sym for word in data_iterator(path_to_corpus) for sym in word
    )
    return list(first_seen)

def make_2_SL_dfa(alphabet: list) -> FST:
    """Build a 2-SL (bigram) automaton over ``alphabet`` with all weights set to 0.

    The machine has the initial state, named "q0", plus one state per distinct
    symbol, named after that symbol. From q0 and from every symbol state,
    reading a symbol leads to that symbol's state, so the current state always
    records the last symbol read. Every state is final.

    Args:
        alphabet: The symbols of the language. A symbol that occurs more than
            once only contributes one state.

    Returns:
        The constructed FST, with its ``states``, ``finalstates`` and
        ``alphabet`` attributes filled in.
    """
    fst = FST()
    q0 = fst.initialstate
    q0.finalweight = 0
    q0.name = "q0"

    # One state per distinct symbol. The symbol states are tracked in their
    # own list (rather than being recovered later by filtering on the name
    # "q0") so that a symbol that happens to be spelled "q0" keeps its state.
    symbol_states = []
    for symbol in dict.fromkeys(alphabet):
        q = State()
        q.name = symbol
        q.finalweight = 0
        # NOTE(review): the label is passed as a plain string; multi-character
        # symbols are then displayed as "a:b:c" by render. Check whether
        # pyfoma expects a tuple such as (symbol,) here.
        q0.add_transition(q, symbol, 0)
        symbol_states.append(q)

    # Connect every symbol state to every symbol state (itself included);
    # the transition is labelled with the symbol of the state it enters.
    for source in symbol_states:
        for target in symbol_states:
            source.add_transition(target, target.name, 0)

    states = {q0, *symbol_states}
    fst.states = states
    fst.finalstates = set(states)
    fst.alphabet = set(alphabet)
    return fst
    
    
    


# Read the symbol inventory from the learning data, then build the 2-SL
# machine over that inventory.
toy_alphabet = make_alphabet("data/LearningData.txt")
toy_2SL = make_2_SL_dfa(toy_alphabet)

In [228]:

# Run one word through the machine and print every result; with weights=True
# each result is printed as an (output, weight) pair, as in the recorded
# output below.
for word in toy_2SL.apply("NaNaura", weights=True):
    print(word)


('NaNaura', 0.0)


##  notes

MLE(dfa, corpus) = dfa' 

create_SL2_dfa(alphabet)

getalphabet(corpus) = set_of_letters 

Q = all strings of length less than k

I = "" 

F = Q

delta(q,a) = k-1 suffix of qa 

Example: delta(a,b) = the (k-1)-suffix of ab, which is b (k=2)

Example: Suppose k=4. Then delta(aaa,b) = the 3-suffix of aaab, which is aab

Also, if qa is of length less than k-1, then the (k-1)-suffix of qa is just qa

3 suffix of aa is aa

BIGRAM: all possible states given alphabet (given k)

42 SP machines, 1 SL machine

### NOTE:
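Multi-symbol labels get delimited by colons when displayed via `render`. So, "abc" -> "a:b:c". This is probably because pyfoma treats a transition label as a tuple with one element per tape, so a plain string is read character by character; passing the label as a one-element tuple, e.g. `("abc",)`, should avoid it (to be confirmed).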