In [2]:
import pywrapfst as fst
import graphviz

In [69]:
def create_plural_fst():
    front_vowels = ['e', 'i', 'ö', 'ü']
    back_vowels = ['a', 'ı', 'o', 'u']
    consonants = "bcçdfgğhjklmnprsştvyz"  # Add more if needed

    sym_table = fst.SymbolTable()
    sym_table.add_symbol('<eps>', 0)
    for char in front_vowels + back_vowels + list(consonants) + ['ler', 'lar', '.']:
        sym_table.add_symbol(char)

    plural_fst = fst.Fst()
    plural_fst.set_input_symbols(sym_table)
    plural_fst.set_output_symbols(sym_table)

    one = fst.Weight('tropical', 1.0)

    start_state = plural_fst.add_state()
    front_state = plural_fst.add_state()
    back_state = plural_fst.add_state()
    end_state = plural_fst.add_state()

    plural_fst.set_start(start_state)
    plural_fst.set_final(end_state)

    for vowel in front_vowels:
        plural_fst.add_arc(start_state, fst.Arc(sym_table.find(vowel), sym_table.find(vowel), one, front_state))
        plural_fst.add_arc(back_state, fst.Arc(sym_table.find(vowel), sym_table.find(vowel), one, front_state))
        plural_fst.add_arc(front_state, fst.Arc(sym_table.find(vowel), sym_table.find(vowel), one, front_state))
    for vowel in back_vowels:
        plural_fst.add_arc(start_state, fst.Arc(sym_table.find(vowel), sym_table.find(vowel), one, back_state))
        plural_fst.add_arc(front_state, fst.Arc(sym_table.find(vowel), sym_table.find(vowel), one, back_state))
        plural_fst.add_arc(back_state, fst.Arc(sym_table.find(vowel), sym_table.find(vowel), one, back_state))

    plural_fst.add_arc(front_state, fst.Arc(sym_table.find('.'), sym_table.find('ler'), one, end_state))
    plural_fst.add_arc(back_state, fst.Arc(sym_table.find('.'), sym_table.find('lar'), one, end_state))

    # Looping back for consonants and non-vowel characters
    for consonant in consonants:
        consonant_id = sym_table.find(consonant)
        plural_fst.add_arc(start_state, fst.Arc(consonant_id, consonant_id, one, start_state))
        plural_fst.add_arc(front_state, fst.Arc(consonant_id, consonant_id, one, front_state))
        plural_fst.add_arc(back_state, fst.Arc(consonant_id, consonant_id, one, back_state))
    
    return plural_fst

# Create and test the FST
plural_fst = create_plural_fst()

# Save and visualize the FST using external tools
plural_fst.write('plural.fst')
!fstdraw plural.fst | dot -Tpng > plural.png

In [70]:
def transduce_sequence_det(f, seq):
    eps = f.input_symbols().find('<eps>')
    curr_state = f.start()
    output = []

    for char in seq:
        label = f.input_symbols().find(char)
        if label == -1:
            print(f"Character '{char}' not found in FST's input symbols.")
            return []

        found = False
        for arc in f.arcs(curr_state):
            if arc.ilabel == label:
                output.append(arc.olabel)
                curr_state = arc.nextstate
                found = True
                break

        if not found:
            print(f"No transition for '{char}' in current state.")
            return []

    final_weight = float(f.final(curr_state))
    if final_weight != math.inf:  # if this is a final state
        out_seq = [f.output_symbols().find(w) for w in output if w != eps]
        return out_seq
    else:
        print("Reached a non-final state at the end of the sequence.")
        return []

In [74]:
import math
def pluralize_word(word, plural_fst):
    char_sequence = list(word)
    char_sequence = char_sequence + ['.']
    transduced_sequence = transduce_sequence_det(plural_fst, char_sequence)

    # Decode each byte string in the sequence to a regular string
    decoded_sequence = [symbol.decode('utf-8') if isinstance(symbol, bytes) else symbol for symbol in transduced_sequence]

    # Join the sequence to form the pluralized word
    pluralized_word = ''.join(decoded_sequence)
    return pluralized_word
    

# Example usage
plural_fst = create_plural_fst()  # Create your plural FST and ensure it's determinized
word = 'kalem'  # Example word in Turkish
pluralized_word = pluralize_word(word, plural_fst)
print(pluralized_word)  # Should output the plural form of 'elma'

kalemler


In [76]:
def possesive_fst():
    _table = fst.SymbolTable()
    eps = _table.add_symbol('<eps>', 0)
    
    _suffix_list = ['Hm', 'Hn', 'sH', 'H', 'mHz', 'HmHz', 'nHz', 'HnHz', 'lArH',
                    '+1ps', '+1pm', '+2ps', '+2pm', '+3ps', '+3pm']
    for char in _suffix_list:
        _table.add_symbol(char)
        
    _fst = fst.Fst()
    _fst.set_input_symbols(_table)
    _fst.set_output_symbols(_table)

    one = None #fst.Weight('tropical', 1.0)
    
    start_state = _fst.add_state()
    fps_state = _fst.add_state()
    fpm_state = _fst.add_state()
    sps_state = _fst.add_state()
    spm_state = _fst.add_state()
    tps_state = _fst.add_state()
    tpm_state = _fst.add_state()
    end_state = _fst.add_state()
    
    _fst.set_start(start_state)
    _fst.set_final(end_state)
    
    #plural_fst.add_arc(back_state, fst.Arc(sym_table.find(vowel), sym_table.find(vowel), one, front_state))
    
    _fst.add_arc(start_state, fst.Arc(0, 0, one, start_state))
    _fst.add_arc(start_state, fst.Arc(_table.find('+1ps'),_table.find('Hm'), one, fps_state))
    _fst.add_arc(start_state, fst.Arc(_table.find('+1pm'),_table.find('mHz'), one, fpm_state))
    _fst.add_arc(start_state, fst.Arc(_table.find('+1pm'),_table.find('HmHz'), one, fpm_state))
    _fst.add_arc(start_state, fst.Arc(_table.find('+2ps'),_table.find('Hn'), one, sps_state))
    _fst.add_arc(start_state, fst.Arc(_table.find('+2pm'),_table.find('nHz'), one, spm_state))
    _fst.add_arc(start_state, fst.Arc(_table.find('+2pm'),_table.find('HnHz'), one, spm_state))
    _fst.add_arc(start_state, fst.Arc(_table.find('+3ps'),_table.find('sH'), one, tps_state))
    _fst.add_arc(start_state, fst.Arc(_table.find('+3ps'),_table.find('H'), one, tps_state))
    _fst.add_arc(start_state, fst.Arc(_table.find('+3pm'),_table.find('lArH'), one, tpm_state))
    
    state_list = [fps_state, fpm_state, sps_state, spm_state, tps_state, tpm_state]
    
    for i in state_list:
        _fst.add_arc(i, fst.Arc(0, 0, one, end_state))
    
    
    return _fst

import math
def possessive_word(word, plural_fst, specifier):
    char_sequence = list(word)
    char_sequence = char_sequence + [specifier]
    transduced_sequence = transduce_sequence_det(plural_fst, char_sequence)

    # Decode each byte string in the sequence to a regular string
    decoded_sequence = [symbol.decode('utf-8') if isinstance(symbol, bytes) else symbol for symbol in transduced_sequence]

    # Join the sequence to form the pluralized word
    pluralized_word = ''.join(decoded_sequence)
    return pluralized_word

p_fst = possesive_fst()
word = 'kalem'  # Example word in Turkish
p_word = possessive_word(word, p_fst, '+1ps')
print(p_word)  # Should output the plural form of 'elma'

# Save and visualize the FST using external tools
p_fst.write('possessive.fst')
!fstdraw possessive.fst | dot -Tpng > possessive.png

Character 'k' not found in FST's input symbols.



In [46]:
def create_possessive_fst():
    # Create symbol tables
    sym_table = fst.SymbolTable()
    
    # Add a special symbol for epsilon transitions
    eps = sym_table.add_symbol('<eps>', 0)
        
    sym_table.add_symbol('eiöü')
    sym_table.add_symbol('aıou')

    # Add symbols for possessive suffixes
    sym_table.add_symbol('im')
    sym_table.add_symbol('ım')

    # Create a new FST
    possessive_fst = fst.Fst()

    # Set the symbol tables for the FST
    possessive_fst.set_input_symbols(sym_table)
    possessive_fst.set_output_symbols(sym_table)

    # Define the weight for the arcs
    one = fst.Weight('tropical', 1.0)

    # Add states
    start_state = possessive_fst.add_state()
    front_vowel_state = possessive_fst.add_state()
    back_vowel_state = possessive_fst.add_state()
    end_state = possessive_fst.add_state()

    # Set the start state
    possessive_fst.set_start(start_state)

    # Front vowel transitions
    vowel = 'eiöü'
    possessive_fst.add_arc(start_state, fst.Arc(sym_table.find(vowel), sym_table.find(vowel), one, front_vowel_state))

    # Back vowel transitions
    vowel = 'aıou'
    possessive_fst.add_arc(start_state, fst.Arc(sym_table.find(vowel), sym_table.find(vowel), one, back_vowel_state))

    # Transition to possessive suffixes
    possessive_fst.add_arc(front_vowel_state, fst.Arc(eps, sym_table.find('im'), one, end_state))
    possessive_fst.add_arc(back_vowel_state, fst.Arc(eps, sym_table.find('ım'), one, end_state))

    # Set the final state
    possessive_fst.set_final(end_state)

    return possessive_fst

# Function to add possessive suffix to the word
def add_possessive_suffix(word, possessive_fst):
    # Create a tropical weight of one for transitions
    one = fst.Weight('tropical', 1.0)

    # Convert the input word to an FST
    word_fst = fst.Fst()
    word_fst.set_input_symbols(possessive_fst.input_symbols())
    word_fst.set_output_symbols(possessive_fst.output_symbols())

    current_state = word_fst.add_state()
    word_fst.set_start(current_state)
    
    # Add the word's characters to the FST
    for char in word:
        next_state = word_fst.add_state()
        word_fst.add_arc(current_state, fst.Arc(possessive_fst.input_symbols().find(char), 
                                                 possessive_fst.output_symbols().find(char), 
                                                 one, 
                                                 next_state))
        current_state = next_state

    # Add an epsilon transition to allow for the possessive suffix
    word_fst.add_arc(current_state, fst.Arc(possessive_fst.input_symbols().find('<eps>'), 
                                             possessive_fst.output_symbols().find('<eps>'), 
                                             one, 
                                             current_state))

    word_fst.set_final(current_state)

    # Compose the word FST with the possessive FST
    result_fst = fst.compose(word_fst, possessive_fst)

    # Project on the output to get the final string
    result_fst.project(project_output=True)

    # Extract the resulting string
    result_string = ""
    for state in result_fst.states():
        for arc in result_fst.arcs(state):
            if arc.olabel != 0:
                result_string += possessive_fst.output_symbols().find(arc.olabel)

    return result_string

In [47]:
# Create the possessive FST
possessive_fst = create_possessive_fst()
# Test the FST with a word
word = 'kitap'
possessive_word = add_possessive_suffix(word, possessive_fst)
print("sdfsd" + possessive_word)  # Should output 'kitapım' or 'kitabım' depending on the rules defined

possessive_fst.write('possessive.fst')
!fstdraw possessive.fst | dot -Tpng > possessive.png

sdfsd
