In [27]:
import numpy as np

# 10k words from https://www.mit.edu/~ecprice/wordlist.10000
# The file holds one word per line. The encoding is pinned so that loading does
# not depend on the platform default, and blank lines are skipped so the empty
# string never ends up as a candidate word.
with open("words.txt", encoding="utf-8") as file:
    words = [line.rstrip("\n") for line in file if line.rstrip("\n")]

In [44]:
def distance_letters(first, second):
    """Return the substitution cost between two letters.

    Costs:
        0    the letters are identical
        0.5  the pair (in either order) is listed as commonly confused
        1    both letters are vowels, or neither is
        3    exactly one of the two letters is a vowel
    """
    if first == second:
        return 0

    confusable_pairs = ("dt", "pb", "sz", "vw", "ae", "ou", "mn", "iy", "gk")
    # The list stores each pair in one order only, so test both orders.
    if any(pair in confusable_pairs for pair in (first + second, second + first)):
        return 0.5

    # Swapping within the same class (vowel or non-vowel) is the cheaper case.
    first_is_vowel = first in "aeiou"
    second_is_vowel = second in "aeiou"
    return 1 if first_is_vowel == second_is_vowel else 3

In [45]:
def distance_words(first, second, gap_penalty=2, letter_distance=None):
    """Return the weighted edit distance between two words (case-insensitive).

    The distance is the cost of the cheapest alignment of ``first`` and
    ``second``, computed by dynamic programming. Aligning two letters costs
    ``letter_distance(a, b)``; leaving a letter of either word unaligned costs
    ``gap_penalty``.

    Parameters
    ----------
    first, second : str
        The words to compare. Both are lower-cased before comparison.
    gap_penalty : number, optional
        Cost of skipping one letter in either word (default 2).
    letter_distance : callable, optional
        Function ``(a, b) -> cost`` for aligning two single letters.
        Defaults to ``distance_letters``.

    Returns
    -------
    numpy.float64
        The minimal total alignment cost; ``0.0`` for equal words.
    """
    if letter_distance is None:
        letter_distance = distance_letters

    first = first.lower()
    second = second.lower()

    n = len(first)
    m = len(second)

    # opt[i, j] is the cheapest alignment of first[:i] with second[:j].
    opt = np.zeros((n + 1, m + 1))

    # Second word exhausted, letters left in the first: only gaps remain.
    opt[:, 0] = gap_penalty * np.arange(n + 1)
    # First word exhausted, letters left in the second: only gaps remain.
    opt[0, :] = gap_penalty * np.arange(m + 1)

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            # Align the two current letters with each other ...
            match_both = letter_distance(first[i - 1], second[j - 1]) + opt[i - 1, j - 1]
            # ... or leave the current letter of one word unaligned.
            skip_first = gap_penalty + opt[i - 1, j]
            skip_second = gap_penalty + opt[i, j - 1]

            opt[i, j] = min(match_both, skip_first, skip_second)

    return opt[n, m]

In [46]:
def find_k_most_similar(word, words, k=10):
    """Return the ``k`` entries of ``words`` closest to ``word``.

    Every candidate is scored with ``distance_words``. The result is a list of
    ``(candidate, distance)`` tuples sorted by ascending distance, so the most
    similar word comes first. A candidate that occurs several times in
    ``words`` appears only once in the result.
    """
    scores = {candidate: distance_words(word, candidate) for candidate in words}

    # Ascending by distance, i.e. most similar first.
    ranked = list(scores.items())
    ranked.sort(key=lambda pair: pair[1])

    return ranked[:k]

In [47]:
# Example query: the nearest words to "fruit", using the default k=10.
find_k_most_similar("fruit", words)

[('fruit', 0.0),
 ('fluid', 1.5),
 ('fruits', 2.0),
 ('float', 2.5),
 ('floyd', 2.5),
 ('fraud', 2.5),
 ('trout', 2.5),
 ('brain', 3.0),
 ('broad', 3.0),
 ('craig', 3.0)]

In [48]:
# Example query: the nearest words to "hand", using the default k=10.
find_k_most_similar("hand", words)

[('hand', 0.0),
 ('band', 1.0),
 ('hang', 1.0),
 ('hans', 1.0),
 ('hard', 1.0),
 ('land', 1.0),
 ('rand', 1.0),
 ('sand', 1.0),
 ('bend', 1.5),
 ('cant', 1.5)]

In [49]:
# Example query: the nearest words to "crown", using the default k=10.
find_k_most_similar("crown", words)

[('crown', 0.0),
 ('brown', 1.0),
 ('crowd', 1.0),
 ('grown', 1.0),
 ('crops', 2.0),
 ('cross', 2.0),
 ('drawn', 2.0),
 ('grows', 2.0),
 ('known', 2.0),
 ('shown', 2.0)]