# Learnability Project: PFA Phonotactic Learner


In [63]:
from pyfoma import FST, State
from collections import Counter

# Boundary-marker symbols (presumably the left and right word edges).
# Lfish is used as the name of the automaton's initial state in
# make_2_SL_dfa; Rfish is defined but not referenced in this section.
Lfish = "⋊"
Rfish = "⋉"


def data_iterator(path):
    """Yield each line of the file at ``path`` as a list of tokens.

    Tokens are the whitespace-separated fields of the line (``str.split()``
    with no arguments), so a blank line yields an empty list.

    The file is streamed line by line instead of being loaded in full, so
    the generator stays lazy regardless of corpus size.
    """
    # Explicit UTF-8: the corpus holds non-ASCII phonological symbols, and the
    # platform default encoding is not UTF-8 everywhere.
    with open(path, "r", encoding="utf-8") as fin:
        for line in fin:
            yield line.split()
            
def make_alphabet(path) -> list:
    """Return the distinct symbols found in the corpus at ``path``.

    Every token of every word yielded by ``data_iterator(path)`` is
    collected once, in order of first appearance.
    """
    # dict.fromkeys de-duplicates with constant-time lookups while keeping
    # insertion order, replacing a linear "not in list" scan per symbol.
    return list(
        dict.fromkeys(sym for word in data_iterator(path) for sym in word)
    )
    
def make_2_SL_dfa(alphabet: list) -> FST:
    """Build a 2-SL (bigram) automaton over ``alphabet`` with all weights 0.

    The machine has one initial state, named ``Lfish``, plus one state per
    distinct symbol, named after that symbol. From the initial state and from
    every symbol state there is a transition labelled ``s`` into the state
    for ``s``. No transition leads back into the initial state. Every state
    is final, and every transition weight and final weight starts at 0.

    Args:
        alphabet: The symbols of the language. Repeated entries are ignored
            when creating states so that each symbol gets exactly one state.

    Returns:
        The constructed ``FST``.
    """
    # Create the FST object and configure its initial state.
    dfa = FST()
    q0 = dfa.initialstate
    q0.name = Lfish
    q0.finalweight = 0

    # One state per distinct symbol, each reachable from q0. The symbol
    # states are kept in their own list so q0 is excluded by construction
    # rather than by comparing names (a symbol spelled like Lfish would
    # otherwise be filtered out of the loop below).
    symbol_states = []
    for symbol in dict.fromkeys(alphabet):
        q = State()
        q.name = symbol
        q.finalweight = 0
        # NOTE: labels are passed as plain strings; MLE compares
        # ``trans.label`` against tokens in exactly this form.
        q0.add_transition(q, symbol, 0)
        symbol_states.append(q)

    # Fully connect the symbol states (self-loops included): reading symbol
    # s from any symbol state moves to the state for s.
    for source in symbol_states:
        for target in symbol_states:
            source.add_transition(target, target.name, 0)

    states = {q0, *symbol_states}
    dfa.states = states
    # Separate set object so later changes to one collection cannot
    # silently alter the other.
    dfa.finalstates = set(states)
    dfa.alphabet = alphabet
    return dfa

        
def MLE(dfa: FST, path):
    """Accumulate transition and final-state counts on ``dfa`` from a corpus.

    Each word yielded by ``data_iterator(path)`` is traced from the initial
    state. For every token, the outgoing transition whose label equals the
    token has its weight incremented by 1 and is followed. When the word is
    exhausted, the final weight of the state reached is incremented by 1.
    An empty word therefore increments the initial state's final weight.

    ``dfa`` is modified in place. The resulting weights are raw counts; no
    normalisation is performed here. After the pass, each state's name and
    final weight are printed.

    Args:
        dfa: The automaton whose weights are updated.
        path: Path of the corpus file, passed to ``data_iterator``.

    Returns:
        The same ``dfa`` object, with updated weights.

    Raises:
        ValueError: If a token has no matching outgoing transition from the
            current state. Skipping it silently would attribute the next
            token to the wrong bigram.
    """
    for word in data_iterator(path):
        cs = dfa.initialstate  # every word starts again from q0
        for token in word:
            # Take the first transition carrying this token; stopping at the
            # first match keeps one token from being counted more than once.
            match = next(
                (trans for _, trans in cs.all_transitions()
                 if trans.label == token),
                None,
            )
            if match is None:
                raise ValueError(
                    f"no transition labelled {token!r} from state {cs.name!r}"
                )
            match.weight += 1
            cs = match.targetstate

        # Count the state in which the word ended.
        cs.finalweight += 1

    for state in dfa.states:
        print(f"STATE: {state.name}, WEIGHT: {state.finalweight}")

    return dfa


In [64]:
# Disabled alternative: the same pipeline run on data/LearningData.txt.
# toy_alphabet = make_alphabet("data/LearningData.txt")
# toy_2SL = make_2_SL_dfa(toy_alphabet)
# toy_MLE = MLE(toy_2SL, "data/LearningData.txt")

# Debugging run: collect the alphabet from the corpus, build the zero-weight
# 2-SL automaton over it, then count over the same corpus. MLE updates
# toy_2SL in place and prints each state's final weight.
toy_alphabet = make_alphabet("data/eric_debugging_data.txt")
toy_2SL = make_2_SL_dfa(toy_alphabet)
toy_MLE = MLE(toy_2SL, "data/eric_debugging_data.txt")



STATE: b, WEIGHT: 5
STATE: a, WEIGHT: 9
STATE: ⋊, WEIGHT: 0
STATE: c, WEIGHT: 1


##  notes

MLE(dfa, corpus) = dfa' 

create_SL2_dfa(alphabet) = dfa

getalphabet(corpus) = set_of_letters 

Q = all strings of length less than k

I = "" 

F = Q

delta(q,a) = k-1 suffix of qa 

Example: delta(a,b) = the (k-1)-suffix of ab, which is b (k=2)

Example: Suppose k=4. Then delta(aaa,b) = the 3-suffix of aaab, which is aab

Also, if qa is of length less than k-1, then the (k-1)-suffix of qa is just qa

3 suffix of aa is aa

BIGRAM: all possible states given alphabet (given k)

42 SP machines, 1 SL machine

### NOTE:
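Multi-symbol labels get delimited by colons when displayed via `render`. So, "abc" -> "a:b:c". Likely reason (to be confirmed against the pyfoma source): a transition label is treated as a sequence with one symbol per tape, so a plain string label is read character by character; passing a one-element tuple such as `("abc",)` should keep the symbol intact.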