In [1]:
import json

In [23]:
f = open('affixes.json')
affixes = json.load(f)
f.close()

In [24]:
affixes['prefixes'][:3]

[{'realization': 'o',
  'meaning': 'neuter prefix',
  'environments': ['structural'],
  'argument': 'basic'},
 {'realization': 'ga',
  'meaning': 'neuter prefix',
  'environments': ['structural'],
  'argument': 'basic'},
 {'realization': 'ag',
  'meaning': 'my (possessive)',
  'environments': ['structural', 'possessive'],
  'argument': 'basic'}]

In [25]:
def flatten_orthography(word):
    new_word = ""
    for c in word:
        if c in "eęéę́e̱ę̱":
            new_word += 'e'
        elif c in "oǫóǫ́o̱ǫ̱":
            new_word += 'o'
        elif c in "aáa̱":
            new_word += 'a'
        elif c in "iíi̱":
            new_word += 'i'
        elif c == ':':
            continue
        else:
            new_word += c
    return new_word

def parse_options(word):
    possibilities = [""]
    i = 0
    while i < len(word):
        if word[i] == '(':
            end = word.find(')', i+1)
            optional = word[i+1:end]
            new_poss = []
            for poss in possibilities:
                new_poss.append(poss + optional)
                new_poss.append(poss)
            possibilities = new_poss
            i = end + 1
        else:
            possibilities = [poss + word[i] for poss in possibilities]
            i += 1
    return possibilities

In [26]:
def load_roots(filename):
    roots = []
    f = open(filename)
    contents = f.readlines()
    f.close()
    for line in contents:
        possibilities = [""]
        if len(line) < 3:
            continue
        root, definition = [part.strip() for part in line.split('>')]
        i = 0
        while i < len(root):
            if root[i] == '(':
                end = root.find(')', i+1)
                optional = root[i+1:end]
                new_poss = []
                for poss in possibilities:
                    new_poss.append(poss + optional)
                    new_poss.append(poss)
                possibilities = new_poss
                i = end + 1
            else:
                possibilities = [poss + root[i] for poss in possibilities]
                i += 1
        for poss in possibilities:
            roots.append((poss, definition))
    return roots

roots = load_roots('roots_simplified.txt')
roots[:5]

[("'ahdra", 'basket'),
 ("'ahdr", 'basket'),
 ('ahdra', 'basket'),
 ('ahdr', 'basket'),
 ("'ahdronih", 'witch hazel')]

In [27]:
root_dict = {'nouns' : [], 'verbs' : []}
for (realization, meaning) in roots:
    if meaning[-1] == 'V':
        root_dict['verbs'].append({
            'realization' : realization,
            'meaning' : meaning[:-1],
            'category' : 'transitive'
        })
    else:
        root_dict['nouns'].append({
            'realization' : realization,
            'meaning' : meaning,
            'category' : 'structural'
        })
f = open('roots.json', 'w')
f.write(json.dumps(root_dict, indent=2))
f.close()

In [28]:
f = open('roots.json')
roots = json.load(f)
f.close()

In [29]:
f = open('roots_exp.json')
roots = json.load(f)
f.close()

In [30]:
class ConstrainedQueue():
    def __init__(self, max_size, metric=max, default=float('inf')):
        self.max_size = max_size
        self.entries = [0] * max_size
        self.scores = [float('inf')] * max_size
        self.metric = metric
        self.worst = self.scores.index(metric(self.scores))
    
    def add(self, entry, value):
        # print(self.entries, self.scores, entry, value)
        if self.metric(value, self.scores[self.worst]) == value:
            return
        self.entries[self.worst] = entry
        self.scores[self.worst] = value
        self.worst = self.scores.index(self.metric(self.scores))

In [31]:
from Levenshtein import distance as ldist

similarity_metric = lambda gold, word : ldist(gold, word)

In [32]:
def best_match_ldist(gold, word):
    gold = flatten_orthography(gold)
    possibilities = parse_options(word)
    score = min([similarity_metric(gold, flatten_orthography(poss)) for poss in possibilities])
    return score

In [33]:
def find_affixes(word, affixes, roots):
    attributive_prefixes = [prefix for prefix in affixes['prefixes'] if prefix['argument'] == 'attributive']
    basic_prefixes = [prefix for prefix in affixes['prefixes'] if prefix['argument'] == 'basic']
    attributive_suffixes = [suffix for suffix in affixes['suffixes'] if suffix['argument'] == 'attributive']
    basic_suffixes = [suffix for suffix in affixes['suffixes'] if suffix['argument'] == 'basic']
    # later implementation for verbal prepronom, pronom, and aspect suffixes
    
    possibilities = [{
        'charleft' : word,
        'meanings' : []
    }]
    
    # print(possibilities)
    # print('finding attributive_prefixes')
    temp = possibilities.copy()
    for possibility in possibilities:
        word = possibility['charleft']
        for prefix in attributive_prefixes:
            if word.startswith(prefix['realization']):
                if prefix['realization'][-1] == 'o':
                    new_poss = {
                        'charleft' : 'a' + word[len(prefix['realization']):],
                        'meanings' : possibility['meanings'] + [prefix]
                    }
                    temp.append(new_poss)
                new_poss = {
                    'charleft' : word[len(prefix['realization']):],
                    'meanings' : possibility['meanings'] + [prefix]
                }
                temp.append(new_poss)
    possibilities = temp.copy()
    # print(possibilities)
    # print('finding basic_prefixes')
    for possibility in possibilities:
        word = possibility['charleft']
        for prefix in basic_prefixes:
            if word.startswith(prefix['realization']):
                new_poss = {
                    'charleft' : word[len(prefix['realization']):],
                    'meanings' : possibility['meanings'] + [prefix]
                }
                temp.append(new_poss)
    
    possibilities = temp.copy()
    # print(possibilities)
    # print('finding attributive_suffixes')
    for possibility in possibilities:
        word = possibility['charleft']
        for suffix in attributive_suffixes:
            if word.endswith(suffix['realization']):
                new_poss = {
                    'charleft' : word[:len(word) - len(suffix['realization'])],
                    'meanings' : possibility['meanings'] + [suffix]
                }
                temp.append(new_poss)
    possibilities = temp.copy()
    # print(possibilities)
    # print('finding basic_suffixes')
    for possibility in possibilities:
        word = possibility['charleft']
        for suffix in basic_suffixes:
            if word.endswith(suffix['realization']):
                new_poss = {
                    'charleft' : word[:len(word) - len(suffix['realization'])],
                    'meanings' : possibility['meanings'] + [suffix]
                }
                temp.append(new_poss)
    
    possibilities = temp.copy()
    # print(possibilities)
    # print('finding roots')
    for possibility in possibilities:
        word = possibility['charleft']
        root_possibilities = ConstrainedQueue(5)
        for root in roots['nouns']:
            root_possibilities.add(root, best_match_ldist(word, root['realization']))
        for root in roots['verbs']:
            realization = root['realization'] + 'hsr' if root['realization'][-1] in 'aeiou' else root['realization'] + 'r'
            root_possibilities.add(root, best_match_ldist(word, realization))
        
        for idx in range(root_possibilities.max_size):
            root = root_possibilities.entries[idx]
            similarity = root_possibilities.scores[idx]
            new_poss = {
                'similarity' : similarity,
                'meanings' : possibility['meanings'] + [root]
            }
            temp.append(new_poss)
    
    
    possibilities = temp.copy()
    return possibilities

In [34]:
def print_legible(possibilities):
    for poss in possibilities:
        # print(poss)
        breakup = []
        meanings = []
        if not 'similarity' in poss:
            continue
        for meaning in poss['meanings']:
            breakup.append(meaning['realization'])
            meanings.append(meaning['meaning'])
        # if len(poss['charleft']) > 0:
        #     breakup.append(poss['charleft'])
        #     meanings.append('<unmatched>')
        print(str(breakup) + ' ('+ str(poss['similarity']) + ')')
        print(meanings)
        print()

In [35]:
word = "ga'dai'e"
word = "onehe'"

In [36]:
possibilities = find_affixes(word, affixes, roots)

In [37]:
print_legible(possibilities)

['nÄ™h(Ä™)'] (5)
['corn']

['neh(wa)'] (3)
['skin, leather']

["(')nehs(a)"] (3)
['sand']

["(')nih"] (4)
['father']



TypeError: 'int' object is not subscriptable

In [42]:
def constrain_possibilities(possibilities):
    new_poss = []
    for poss in possibilities:
        if not 'similarity' in poss:
            continue
        envs = []
        # print(poss)
        for meaning in poss['meanings']:
            if 'environments' in meaning:
                envs += meaning['environments']
        root_category = poss['meanings'][-1]['category']
        # print('structural' in envs)
        # print(sum([meaning['argument'] == 'basic' for meaning in poss['meanings'] if 'argument' in meaning]))
        if root_category == 'structural' and sum([meaning['argument'] == 'basic' 
                for meaning in poss['meanings'] 
                if 'argument' in meaning]) < 2:
            continue # structural needs a basic prefix and suffix
        new_poss.append(poss)
    return new_poss

In [43]:
print_legible(constrain_possibilities(possibilities))

['o', "e'", 'neh'] (0)
['neuter prefix', 'noun suffix', 'corn']

['o', "e'", 'nih'] (1)
['neuter prefix', 'noun suffix', 'father']

['o', "e'", 'nehs'] (1)
['neuter prefix', 'noun suffix', 'sand']

['o', "e'", 'neh'] (0)
['neuter prefix', 'noun suffix', 'skin, leather']

['o', "e'", 'nehe'] (1)
['neuter prefix', 'noun suffix', 'corn']



In [38]:
possibilities

[{'charleft': "onehe'", 'meanings': []},
 {'charleft': "nehe'",
  'meanings': [{'realization': 'o',
    'meaning': 'neuter prefix',
    'environments': ['structural'],
    'argument': 'basic'}]},
 {'charleft': 'oneh',
  'meanings': [{'realization': "e'",
    'meaning': 'noun suffix',
    'environments': ['structural'],
    'argument': 'basic'}]},
 {'charleft': 'neh',
  'meanings': [{'realization': 'o',
    'meaning': 'neuter prefix',
    'environments': ['structural'],
    'argument': 'basic'},
   {'realization': "e'",
    'meaning': 'noun suffix',
    'environments': ['structural'],
    'argument': 'basic'}]},
 {'similarity': 5,
  'meanings': [{'realization': 'nÄ™h(Ä™)',
    'meaning': 'corn',
    'category': 'structural'}]},
 {'similarity': 3,
  'meanings': [{'realization': 'neh(wa)',
    'meaning': 'skin, leather',
    'category': 'structural'}]},
 {'similarity': 3,
  'meanings': [{'realization': "(')nehs(a)",
    'meaning': 'sand',
    'category': 'structural'}]},
 {'similarity': 4

In [41]:
roots

{'nouns': [{'realization': "'ahdra",
   'meaning': 'basket',
   'category': 'structural'},
  {'realization': "'ahdr", 'meaning': 'basket', 'category': 'structural'},
  {'realization': 'ahdra', 'meaning': 'basket', 'category': 'structural'},
  {'realization': 'ahdr', 'meaning': 'basket', 'category': 'structural'},
  {'realization': "'ahdronih",
   'meaning': 'witch hazel',
   'category': 'structural'},
  {'realization': 'ahdronih',
   'meaning': 'witch hazel',
   'category': 'structural'},
  {'realization': "'gehohkwa'",
   'meaning': 'ammunition',
   'category': 'structural'},
  {'realization': "gehohkwa'",
   'meaning': 'ammunition',
   'category': 'structural'},
  {'realization': "'akso'",
   'meaning': 'machine gun',
   'category': 'structural'},
  {'realization': "akso'", 'meaning': 'machine gun', 'category': 'structural'},
  {'realization': "'ahsaw",
   'meaning': 'chest, cough',
   'category': 'structural'},
  {'realization': "'ahsa",
   'meaning': 'chest, cough',
   'category': 