In [2]:
import pywrapfst as fst
import graphviz

In [3]:
def create_plural_fst():
    front_vowels = ['e', 'i', 'ö', 'ü']
    back_vowels = ['a', 'ı', 'o', 'u']
    consonants = "bcçdfgğhjklmnprsştvyz"  # Add more if needed

    sym_table = fst.SymbolTable()
    sym_table.add_symbol('<eps>', 0)
    for char in front_vowels + back_vowels + list(consonants) + ['ler', 'lar', '+PL']:
        sym_table.add_symbol(char)

    plural_fst = fst.Fst()
    plural_fst.set_input_symbols(sym_table)
    plural_fst.set_output_symbols(sym_table)

    one = fst.Weight('tropical', 1.0)

    start_state = plural_fst.add_state()
    front_state = plural_fst.add_state()
    back_state = plural_fst.add_state()
    end_state = plural_fst.add_state()

    plural_fst.set_start(start_state)
    plural_fst.set_final(end_state)

    for vowel in front_vowels:
        plural_fst.add_arc(start_state, fst.Arc(sym_table.find(vowel), sym_table.find(vowel), one, front_state))
        plural_fst.add_arc(back_state, fst.Arc(sym_table.find(vowel), sym_table.find(vowel), one, front_state))
        plural_fst.add_arc(front_state, fst.Arc(sym_table.find(vowel), sym_table.find(vowel), one, front_state))
    for vowel in back_vowels:
        plural_fst.add_arc(start_state, fst.Arc(sym_table.find(vowel), sym_table.find(vowel), one, back_state))
        plural_fst.add_arc(front_state, fst.Arc(sym_table.find(vowel), sym_table.find(vowel), one, back_state))
        plural_fst.add_arc(back_state, fst.Arc(sym_table.find(vowel), sym_table.find(vowel), one, back_state))

    plural_fst.add_arc(front_state, fst.Arc(sym_table.find('+PL'), sym_table.find('ler'), one, end_state))
    plural_fst.add_arc(back_state, fst.Arc(sym_table.find('+PL'), sym_table.find('lar'), one, end_state))

    # Looping back for consonants and non-vowel characters
    for consonant in consonants:
        consonant_id = sym_table.find(consonant)
        plural_fst.add_arc(start_state, fst.Arc(consonant_id, consonant_id, one, start_state))
        plural_fst.add_arc(front_state, fst.Arc(consonant_id, consonant_id, one, front_state))
        plural_fst.add_arc(back_state, fst.Arc(consonant_id, consonant_id, one, back_state))
    
    return plural_fst

# Create and test the FST
plural_fst = create_plural_fst()

# Save and visualize the FST using external tools
plural_fst.write('plural.fst')
!fstdraw plural.fst | dot -Tpng > plural.png

In [4]:
def transduce_sequence_det(f, seq):
    eps = f.input_symbols().find('<eps>')
    curr_state = f.start()
    output = []

    for char in seq:
        label = f.input_symbols().find(char)
        if label == -1:
            print(f"Character '{char}' not found in FST's input symbols.")
            return []

        found = False
        for arc in f.arcs(curr_state):
            if arc.ilabel == label:
                output.append(arc.olabel)
                curr_state = arc.nextstate
                found = True
                break

        if not found:
            print(f"No transition for '{char}' in current state.")
            return []

    final_weight = float(f.final(curr_state))
    if final_weight != math.inf:  # if this is a final state
        out_seq = [f.output_symbols().find(w) for w in output]
        return out_seq
    else:
        print("Reached a non-final state at the end of the sequence.")
        return []

In [5]:
import math
def pluralize_word(word, plural_fst):
    char_sequence = list(word)
    char_sequence = char_sequence + ['+PL']
    transduced_sequence = transduce_sequence_det(plural_fst, char_sequence)

    # Decode each byte string in the sequence to a regular string
    decoded_sequence = [symbol.decode('utf-8') if isinstance(symbol, bytes) else symbol for symbol in transduced_sequence]

    # Join the sequence to form the pluralized word
    pluralized_word = ''.join(decoded_sequence)
    return pluralized_word
    

# Example usage
plural_fst = create_plural_fst()  # Create your plural FST and ensure it's determinized
word = 'kalem'  # Example word in Turkish
pluralized_word = pluralize_word(word, plural_fst)
print(pluralized_word)  # Should output the plural form of 'elma'

kalemler


In [9]:
def possessive_fst():
    _table = fst.SymbolTable()
    eps = _table.add_symbol('<eps>', 0)
    
    _suffix_list = ['ım', 'im', 'm', 'ın', 'in', 'n', 
                    'sı', 'si', 'ı', 'i', 'miz', 'mız', 
                    'ımız', 'imiz', 'nız', 'niz', 'ınız'
                    'iniz', 'leri', 'ları',
                    '+1ps', '+1pm', '+2ps', '+2pm', '+3ps', '+3pm', 
                    'BW', 'FW', 'C']
    
    front_vowels_ei = ['e', 'i']
    front_vowels_ou = ['ö', 'ü']
    back_vowels_ai = ['a', 'ı']
    back_vowels_ou = ['o', 'u']

    front_vowels = front_vowels_ei + front_vowels_ou
    back_vowels = back_vowels_ai + back_vowels_ou
    consonants = "bcçdfgğhjklmnprsştvyz"  # Add more if needed
    
    for char in _suffix_list + front_vowels + back_vowels + list(consonants):
        _table.add_symbol(char)
        
    _fst = fst.Fst()
    _fst.set_input_symbols(_table)
    _fst.set_output_symbols(_table)

    one = None #fst.Weight('tropical', 1.0)
    
    start_state = _fst.add_state()

    bw_ou_c_state = _fst.add_state()
    bw_ai_c_state = _fst.add_state()
    fw_ei_c_state = _fst.add_state()
    fw_ou_c_state = _fst.add_state()
    
    fw_ou_state = _fst.add_state()
    fw_ei_state = _fst.add_state()
    bw_ai_state = _fst.add_state()
    bw_ou_state = _fst.add_state()


    fps_state = _fst.add_state()
    fpm_state = _fst.add_state()
    sps_state = _fst.add_state()
    spm_state = _fst.add_state()
    tps_state = _fst.add_state()
    tpm_state = _fst.add_state()
    end_state = _fst.add_state()
    
    _fst.set_start(start_state)
    _fst.set_final(end_state)

    for vowel in front_vowels_ei:
        _fst.add_arc(start_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, fw_ei_state))
        _fst.add_arc(bw_ou_c_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, fw_ei_state))
        _fst.add_arc(bw_ai_c_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, fw_ei_state))
        _fst.add_arc(fw_ei_c_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, fw_ei_state))
        _fst.add_arc(fw_ou_c_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, fw_ei_state))
        _fst.add_arc(fw_ou_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, fw_ei_state))
        _fst.add_arc(fw_ei_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, fw_ei_state))

    for vowel in front_vowels_ou:
        _fst.add_arc(start_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, fw_ou_state))
        _fst.add_arc(bw_ou_c_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, fw_ou_state))
        _fst.add_arc(bw_ai_c_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, fw_ou_state))
        _fst.add_arc(fw_ei_c_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, fw_ou_state))
        _fst.add_arc(fw_ou_c_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, fw_ou_state))
        _fst.add_arc(fw_ou_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, fw_ou_state))
        _fst.add_arc(fw_ei_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, fw_ou_state))

    for vowel in back_vowels_ai:
        _fst.add_arc(start_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, bw_ai_state))
        _fst.add_arc(bw_ou_c_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, bw_ai_state))
        _fst.add_arc(bw_ai_c_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, bw_ai_state))
        _fst.add_arc(fw_ei_c_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, bw_ai_state))
        _fst.add_arc(fw_ou_c_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, bw_ai_state))
        _fst.add_arc(bw_ai_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, bw_ai_state))
        _fst.add_arc(bw_ou_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, bw_ai_state))

    for vowel in back_vowels_ou:
        _fst.add_arc(start_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, bw_ou_state))
        _fst.add_arc(bw_ou_c_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, bw_ou_state))
        _fst.add_arc(bw_ai_c_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, bw_ou_state))
        _fst.add_arc(fw_ei_c_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, bw_ou_state))
        _fst.add_arc(fw_ou_c_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, bw_ou_state))
        _fst.add_arc(bw_ai_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, bw_ou_state))
        _fst.add_arc(bw_ou_state, fst.Arc(_table.find(vowel), _table.find(vowel), one, bw_ou_state))

    #plural_fst.add_arc(back_state, fst.Arc(sym_table.find(vowel), sym_table.find(vowel), one, front_state))
    
    _fst.add_arc(fw_ei_state, fst.Arc(_table.find('+1ps'),_table.find('m'), one, end_state))
    _fst.add_arc(fw_ei_c_state, fst.Arc(_table.find('+1ps'),_table.find('im'), one, end_state))
    _fst.add_arc(fw_ei_state, fst.Arc(_table.find('+2ps'),_table.find('n'), one, end_state))
    _fst.add_arc(fw_ei_c_state, fst.Arc(_table.find('+2ps'),_table.find('in'), one, end_state))
    _fst.add_arc(fw_ei_state, fst.Arc(_table.find('+3ps'),_table.find('si'), one, end_state))
    _fst.add_arc(fw_ei_c_state, fst.Arc(_table.find('+3ps'),_table.find('i'), one, end_state))
    _fst.add_arc(fw_ei_state, fst.Arc(_table.find('+1pm'),_table.find('miz'), one, end_state))
    _fst.add_arc(fw_ei_c_state, fst.Arc(_table.find('+1pm'),_table.find('imiz'), one, end_state))
    _fst.add_arc(fw_ei_state, fst.Arc(_table.find('+2pm'),_table.find('niz'), one, end_state))
    _fst.add_arc(fw_ei_c_state, fst.Arc(_table.find('+2pm'),_table.find('iniz'), one, end_state))
    _fst.add_arc(fw_ei_state, fst.Arc(_table.find('+3pm'),_table.find('leri'), one, end_state))

    _fst.add_arc(fw_ou_state, fst.Arc(_table.find('+1ps'),_table.find('m'), one, end_state))
    _fst.add_arc(fw_ou_c_state, fst.Arc(_table.find('+1ps'),_table.find('üm'), one, end_state))
    _fst.add_arc(fw_ou_state, fst.Arc(_table.find('+2ps'),_table.find('n'), one, end_state))
    _fst.add_arc(fw_ou_c_state, fst.Arc(_table.find('+2ps'),_table.find('ün'), one, end_state))
    _fst.add_arc(fw_ou_state, fst.Arc(_table.find('+3ps'),_table.find('sü'), one, end_state))
    _fst.add_arc(fw_ou_c_state, fst.Arc(_table.find('+3ps'),_table.find('ü'), one, end_state))
    _fst.add_arc(fw_ou_state, fst.Arc(_table.find('+1pm'),_table.find('müz'), one, end_state))
    _fst.add_arc(fw_ou_c_state, fst.Arc(_table.find('+1pm'),_table.find('ümüz'), one, end_state))
    _fst.add_arc(fw_ou_state, fst.Arc(_table.find('+2pm'),_table.find('nüz'), one, end_state))
    _fst.add_arc(fw_ou_c_state, fst.Arc(_table.find('+2pm'),_table.find('ünüz'), one, end_state))
    _fst.add_arc(fw_ou_state, fst.Arc(_table.find('+3pm'),_table.find('leri'), one, end_state))

    _fst.add_arc(bw_ai_state, fst.Arc(_table.find('+1ps'),_table.find('m'), one, end_state))
    _fst.add_arc(bw_ai_c_state, fst.Arc(_table.find('+1ps'),_table.find('ım'), one, end_state))
    _fst.add_arc(bw_ai_state, fst.Arc(_table.find('+2ps'),_table.find('n'), one, end_state))
    _fst.add_arc(bw_ai_c_state, fst.Arc(_table.find('+2ps'),_table.find('ın'), one, end_state))
    _fst.add_arc(bw_ai_state, fst.Arc(_table.find('+3ps'),_table.find('sı'), one, end_state))
    _fst.add_arc(bw_ai_c_state, fst.Arc(_table.find('+3ps'),_table.find('ı'), one, end_state))
    _fst.add_arc(bw_ai_state, fst.Arc(_table.find('+1pm'),_table.find('mız'), one, end_state))
    _fst.add_arc(bw_ai_c_state, fst.Arc(_table.find('+1pm'),_table.find('ımız'), one, end_state))
    _fst.add_arc(bw_ai_state, fst.Arc(_table.find('+2pm'),_table.find('nız'), one, end_state))
    _fst.add_arc(bw_ai_c_state, fst.Arc(_table.find('+2pm'),_table.find('ınız'), one, end_state))
    _fst.add_arc(bw_ai_state, fst.Arc(_table.find('+3pm'),_table.find('ları'), one, end_state))

    _fst.add_arc(bw_ou_state, fst.Arc(_table.find('+1ps'),_table.find('m'), one, end_state))
    _fst.add_arc(bw_ou_c_state, fst.Arc(_table.find('+1ps'),_table.find('um'), one, end_state))
    _fst.add_arc(bw_ou_state, fst.Arc(_table.find('+2ps'),_table.find('n'), one, end_state))
    _fst.add_arc(bw_ou_c_state, fst.Arc(_table.find('+2ps'),_table.find('un'), one, end_state))
    _fst.add_arc(bw_ou_state, fst.Arc(_table.find('+3ps'),_table.find('su'), one, end_state))
    _fst.add_arc(bw_ou_c_state, fst.Arc(_table.find('+3ps'),_table.find('u'), one, end_state))
    _fst.add_arc(bw_ou_state, fst.Arc(_table.find('+1pm'),_table.find('mız'), one, end_state))
    _fst.add_arc(bw_ou_c_state, fst.Arc(_table.find('+1pm'),_table.find('umuz'), one, end_state))
    _fst.add_arc(bw_ou_state, fst.Arc(_table.find('+2pm'),_table.find('nız'), one, end_state))
    _fst.add_arc(bw_ou_c_state, fst.Arc(_table.find('+2pm'),_table.find('unuz'), one, end_state))
    _fst.add_arc(bw_ou_state, fst.Arc(_table.find('+3pm'),_table.find('ları'), one, end_state))
    
    
        
    
    # Looping back for consonants and non-vowel characters
    for consonant in consonants:
        consonant_id = _table.find(consonant)
        _fst.add_arc(start_state, fst.Arc(consonant_id, consonant_id, one, start_state))
        _fst.add_arc(bw_ai_state, fst.Arc(consonant_id, consonant_id, one, bw_ai_c_state))
        _fst.add_arc(bw_ou_state, fst.Arc(consonant_id, consonant_id, one, bw_ou_c_state))
        _fst.add_arc(fw_ei_state, fst.Arc(consonant_id, consonant_id, one, fw_ei_c_state))
        _fst.add_arc(fw_ou_state, fst.Arc(consonant_id, consonant_id, one, fw_ou_c_state))
        _fst.add_arc(bw_ai_c_state, fst.Arc(consonant_id, consonant_id, one, bw_ai_c_state))
        _fst.add_arc(bw_ou_c_state, fst.Arc(consonant_id, consonant_id, one, bw_ou_c_state))
        _fst.add_arc(fw_ei_c_state, fst.Arc(consonant_id, consonant_id, one, fw_ei_c_state))
        _fst.add_arc(fw_ou_c_state, fst.Arc(consonant_id, consonant_id, one, fw_ou_c_state))
    
    
    return _fst

p_fst = possessive_fst()
# Save and visualize the FST using external tools
p_fst.write('possessive.fst')
!fstdraw possessive.fst | dot -Tpng > possessive.png

FATAL: FstDrawer: Integer -1 is not mapped to any textual symbol, symbol table = <unspecified>, destination = stdout
Error: <stdin>: syntax error in line 48 scanning a quoted string (missing endquote? longer than 16384?)
String starting:"+1ps:


In [None]:
import math
def possessive_word(word, plural_fst, specifier):
    front_vowels = set('eiöü')
    back_vowels = set('aıou')

    # Convert word to FST-friendly format
    char_sequence = []
    for char in word:
        if char in front_vowels:
            char_sequence.append('FW')
        elif char in back_vowels:
            char_sequence.append('BW')
        else:
            char_sequence.append('<eps>')  # Epsilon for consonants

    char_sequence.append(specifier)

    # Transduce the sequence
    transduced_sequence = transduce_sequence_det(plural_fst, char_sequence)

    # Decode and reconstruct the word
    reconstructed_word = []
    original_word_index = 0
    for symbol in transduced_sequence:
        if isinstance(symbol, bytes):
            symbol = symbol.decode('utf-8')

        if symbol == 'FW' or symbol == 'BW':
            if original_word_index < len(word):
                reconstructed_word.append(word[original_word_index])
                original_word_index += 1
            else:
                # Handle case where the word has been fully traversed
                reconstructed_word.append(symbol)
        elif symbol != '<eps>':
            reconstructed_word.append(symbol)

    return ''.join(reconstructed_word)

p_fst = possesive_fst()
word = 'kalem'  # Example word in Turkish
p_word = possessive_word(word, p_fst, '+1ps')
print(p_word)  # Should output the plural form of 'elma'

# Save and visualize the FST using external tools
p_fst.write('possessive.fst')
!fstdraw possessive.fst | dot -Tpng > possessive.png

In [None]:
# Create the possessive FST
possessive_fst = create_possessive_fst()
# Test the FST with a word
word = 'kitap'
possessive_word = add_possessive_suffix(word, possessive_fst)
print("sdfsd" + possessive_word)  # Should output 'kitapım' or 'kitabım' depending on the rules defined

possessive_fst.write('possessive.fst')
!fstdraw possessive.fst | dot -Tpng > possessive.png