In [4]:
import Trie
import random
import matplotlib 

In [5]:
# Parameter grid for the experiment cells below.
# Morpheme lengths 1..5; not referenced by any of the cells shown here.
morpheme_sizes = range(1, 6)
# Number of morphemes requested per run: 2..49.
morpheme_count = range(2, 50)
# Minimum corpus length per run: 20, 40, ..., 980.
string_lengths = range(20, 1000, 20)

In [6]:
def random_string(length, alphabet):
    """Return a random string made of ``length`` symbols drawn from ``alphabet``.

    Every position is filled independently with ``random.choice(alphabet)``;
    each pick is passed through ``str`` so non-string symbols are accepted.
    A ``length`` of zero or less gives the empty string.
    """
    picks = (str(random.choice(alphabet)) for _ in range(length))
    return "".join(picks)

def generate_morphemes(symbols, count, min_length = 1, max_length = 6):
    """Generate a set of distinct random morphemes built from ``symbols``.

    The length of every candidate is drawn uniformly from
    ``range(min_length, max_length)``, so ``max_length`` itself is excluded
    (with the defaults, morphemes are 1 to 5 symbols long).

    Args:
        symbols: Sequence of symbols to draw from (a string or a list).
        count: Number of distinct morphemes requested.
        min_length: Smallest morpheme length (inclusive).
        max_length: Upper bound on morpheme length (exclusive).

    Returns:
        A set of distinct morphemes. It holds ``count`` items unless fewer
        distinct strings can be produced, in which case it holds every string
        that can be produced. It is empty when ``count`` is not positive, when
        ``symbols`` is empty, or when the length range is empty.
    """
    length_options = range(min_length, max_length)

    # Count how many distinct strings are actually reachable, so the loop below
    # cannot spin forever asking for more than exist. Negative lengths behave
    # like 0 in random_string, hence the clamp. Duplicate symbols are counted
    # once.
    # NOTE(review): the count assumes no two different symbol sequences render
    # to the same text (e.g. single-character symbols) - confirm for callers
    # that pass multi-character symbols.
    distinct_symbols = len({str(symbol) for symbol in symbols})
    effective_lengths = {max(option, 0) for option in length_options}
    if distinct_symbols:
        capacity = sum(distinct_symbols ** size for size in effective_lengths)
    else:
        capacity = 0
    target = min(count, capacity)

    morphemes = set()
    while len(morphemes) < target:
        morpheme_len = random.choice(length_options)
        # Adding a duplicate leaves the set unchanged, so collisions are
        # simply retried on the next iteration.
        morphemes.add(random_string(morpheme_len, symbols))
    return morphemes
        
def generate_corpus(string_length, count, alphabet = ["1","0"], min_length = 1, max_length = 6):
    """Build a random corpus by concatenating randomly chosen morphemes.

    A morpheme set is created with ``generate_morphemes`` and morphemes are
    appended, chosen uniformly at random, until the corpus is at least
    ``string_length`` characters long (it may overshoot by part of the last
    morpheme).

    Args:
        string_length: Minimum length of the corpus.
        count: Number of distinct morphemes to request.
        alphabet: Symbols the morphemes are built from.
        min_length: Forwarded to ``generate_morphemes``.
        max_length: Forwarded to ``generate_morphemes``.

    Returns:
        A ``(test, corpus)`` tuple. ``test`` is the distinct morphemes that
        actually occur in the corpus joined with "-", in set-iteration order
        (not in order of appearance). ``corpus`` is the concatenated string.

    Raises:
        ValueError: If every available morpheme is empty, so the corpus could
            never reach ``string_length``.
        IndexError: From ``random.choice`` if no morphemes are available and
            ``string_length`` is positive.
    """
    morphemes = generate_morphemes(alphabet, count, min_length, max_length)

    # Materialise the set once instead of on every iteration. The set is not
    # modified afterwards, so the list order (and therefore every random pick)
    # is the same as rebuilding it each time.
    pool = list(morphemes)
    if string_length > 0 and pool and not any(pool):
        raise ValueError("all morphemes are empty; the corpus can never reach string_length")

    used = set()
    pieces = []
    corpus_length = 0
    while corpus_length < string_length:
        morpheme = random.choice(pool)
        used.add(morpheme)
        pieces.append(morpheme)
        corpus_length += len(morpheme)

    test = "-".join(used)
    corpus = "".join(pieces)
    return test, corpus

# Quick sanity check: a tiny corpus (at least 4 characters) built from up to
# 3 morphemes of length 1-2, shown together with the morphemes it used.
test, corpus = generate_corpus(4, 3, max_length=3)
print(corpus)
print(test)

10000
00-1


In [7]:
def convergence_check(string1, string2):
    """Return the fraction of aligned positions where two strings agree.

    Only the first ``min(len(string1), len(string2))`` characters are
    compared, so extra trailing characters in the longer string are ignored
    and do not lower the score.

    Args:
        string1: First string.
        string2: Second string.

    Returns:
        A float between 0 and 1. If either string is empty there is nothing
        to compare and 0.0 is returned (rather than dividing by zero).
    """
    length = min(len(string1), len(string2))
    if length == 0:
        return 0.0
    # zip stops at the shorter string, matching ``length`` above.
    matches = sum(1 for left, right in zip(string1, string2) if left == right)
    return matches / length

In [8]:
# For every (morpheme count, corpus length) combination store:
#   corpora[run] - the generated corpus
#   truths[run]  - the "-"-separated morphemes returned alongside it
#   tests[run]   - the same morphemes with the "-" separators removed
corpora = {}
tests = {}
truths = {}
for count in morpheme_count:
    for string_length in string_lengths:
        run = (count, string_length)
        truth, corpus = generate_corpus(string_length, count)
        test = truth.replace("-", "")
        corpora[run] = corpus
        truths[run] = truth
        tests[run] = test

# Show one run as a spot check.
print(tests[2, 20])
print(truths[2, 20])
print(corpora[2, 20])

110110
11011-0
0011011110111101111011


In [10]:
def _rounds_to_converge(tree, corpus, test, truth, threshold=0.9, max_rounds=1000):
    """Train ``tree`` on ``corpus`` until its output on ``test`` matches ``truth``.

    After each training pass the tree is run on ``test`` and the output is
    scored against ``truth`` with ``convergence_check``. Training stops as
    soon as the score exceeds ``threshold``.

    Args:
        tree: Object exposing ``train(corpus)`` and ``run(test)``.
        corpus: Training string passed to ``tree.train``.
        test: Input string passed to ``tree.run``.
        truth: Expected output that ``tree.run(test)`` is compared with.
        threshold: Score that must be exceeded to count as converged.
        max_rounds: Upper limit on training passes.

    Returns:
        The number of training passes performed. This equals ``max_rounds``
        when the threshold was not exceeded on an earlier pass.
    """
    rounds = 0
    while rounds < max_rounds:
        rounds += 1
        tree.train(corpus)
        if convergence_check(tree.run(test), truth) > threshold:
            break
    return rounds


# Training passes needed per (morpheme count, corpus length) run, one table
# per Trie variant.
sigmoid_convergence_results = dict()
sigmoid_translated_convergence_results = dict()
simple_convergence_results = dict()

for count in morpheme_count:
    for string_length in string_lengths:
        run = (count, string_length)
        # NOTE(review): the constructor argument presumably selects the
        # variant of the Trie - confirm against the Trie module.
        sigmoid_tree = Trie.Trie("sigmoid")
        sigmoid_translated_tree = Trie.Trie("sigmoid_translated")
        simple_tree = Trie.Trie("simple")

        # Every model is counted from zero. Sharing a single counter across
        # the three models would add the earlier models' passes to the later
        # ones and skip their training entirely once the limit was reached.
        sigmoid_convergence_results[run] = _rounds_to_converge(
            sigmoid_tree, corpora[run], tests[run], truths[run])
        sigmoid_translated_convergence_results[run] = _rounds_to_converge(
            sigmoid_translated_tree, corpora[run], tests[run], truths[run])
        simple_convergence_results[run] = _rounds_to_converge(
            simple_tree, corpora[run], tests[run], truths[run])

KeyboardInterrupt: 