In [9]:
import numpy as np
import pandas as pd

In [10]:
def vectorize_terms(terms):
    terms = [term.lower() for term in terms]
    terms = [np.array(list(term)) for term in terms]
    terms = [np.array([ord(char) for char in term]) 
                for term in terms]
    return terms

In [11]:
root = 'Believe'
term1 = 'beleive'
term2 = 'bargain'
term3 = 'Elephant'    

terms = [root, term1, term2, term3]
terms

['Believe', 'beleive', 'bargain', 'Elephant']

In [12]:
# Character vectorization
term_vectors = vectorize_terms(terms)

# show vector representations
vec_df = pd.DataFrame(term_vectors, index=terms)
print(vec_df)

            0    1    2    3    4    5    6      7
Believe    98  101  108  105  101  118  101    NaN
beleive    98  101  108  101  105  118  101    NaN
bargain    98   97  114  103   97  105  110    NaN
Elephant  101  108  101  112  104   97  110  116.0


Explain

In [7]:
terms = 'bargain'
terms = [term.lower() for term in terms]
print(terms)
terms = [np.array(list(term)) for term in terms]
print(terms)
terms = [np.array([ord(char) for char in term]) 
            for term in terms]
print(terms)

['b', 'a', 'r', 'g', 'a', 'i', 'n']
[array(['b'], dtype='<U1'), array(['a'], dtype='<U1'), array(['r'], dtype='<U1'), array(['g'], dtype='<U1'), array(['a'], dtype='<U1'), array(['i'], dtype='<U1'), array(['n'], dtype='<U1')]
[array([98]), array([97]), array([114]), array([103]), array([97]), array([105]), array([110])]


End

In [13]:
root_term = root
other_terms = [term1, term2, term3]

root_term_vec = vec_df[vec_df.index == root_term].dropna(axis=1).values[0]
other_term_vecs = [vec_df[vec_df.index == term].dropna(axis=1).values[0]
                      for term in other_terms]

In [15]:
root_term_vec

array([ 98, 101, 108, 105, 101, 118, 101])

In [14]:
other_term_vecs

[array([ 98, 101, 108, 101, 105, 118, 101]),
 array([ 98,  97, 114, 103,  97, 105, 110]),
 array([101., 108., 101., 112., 104.,  97., 110., 116.])]

In [16]:
def hamming_distance(u, v, norm=False):
    if u.shape != v.shape:
        raise ValueError('The vectors must have equal lengths.')
    return (u != v).sum() if not norm else (u != v).mean()

In [17]:
for term, term_vector in zip(other_terms, other_term_vecs):
    print('Hamming distance between root: {} and term: {} is {}'.format(root_term,
                                                                        term,
                                                                        hamming_distance(root_term_vec, 
                                                                                         term_vector, norm=False)))

Hamming distance between root: Believe and term: beleive is 2
Hamming distance between root: Believe and term: bargain is 6


ValueError: ignored

Explain

In [36]:
u=root_term_vec #Believe
v=other_term_vecs[0] #believe
print(u)
print(v)
print((u != v).sum()) # 2 ký tự khác nhau
print((u != v).mean())

[ 98 101 108 105 101 118 101]
[ 98 101 108 101 105 118 101]
2
0.2857142857142857


In [37]:
u=root_term_vec #Believe
v=other_term_vecs[1] #bargain
print(u)
print(v)
print((u != v).sum()) # 6 ký tự khác nhau
print((u != v).mean())

[ 98 101 108 105 101 118 101]
[ 98  97 114 103  97 105 110]
6
0.8571428571428571


End

In [18]:
for term, term_vector in zip(other_terms, other_term_vecs):
    print('Normalized Hamming distance between root: {} and term: {} is {}'.format(root_term,
                                                                                   term,
                                                                                   round(hamming_distance(root_term_vec, 
                                                                                         term_vector, norm=True),
                                                                                         2)
                                                                                   ))

Normalized Hamming distance between root: Believe and term: beleive is 0.29
Normalized Hamming distance between root: Believe and term: bargain is 0.86


ValueError: ignored

In [19]:
def manhattan_distance(u, v, norm=False):
    if u.shape != v.shape:
        raise ValueError('The vectors must have equal lengths.')
    return abs(u - v).sum() if not norm else abs(u - v).mean()

In [20]:
for term, term_vector in zip(other_terms, other_term_vecs):
    print('Manhattan distance between root: {} and term: {} is {}'.format(root_term,
                                                                          term,
                                                                          manhattan_distance(root_term_vec, 
                                                                                             term_vector, norm=False)))

Manhattan distance between root: Believe and term: beleive is 8
Manhattan distance between root: Believe and term: bargain is 38


ValueError: ignored

In [21]:
for term, term_vector in zip(other_terms, other_term_vecs):
    print('Normalized Manhattan distance between root: {} and term: {} is {}'.format(root_term,
                                                                                     term,
                                                                                     round(manhattan_distance(root_term_vec, 
                                                                                           term_vector, norm=True),
                                                                                           2)
                                                                                     ))

Normalized Manhattan distance between root: Believe and term: beleive is 1.14
Normalized Manhattan distance between root: Believe and term: bargain is 5.43


ValueError: ignored

In [22]:
def euclidean_distance(u,v):
    if u.shape != v.shape:
        raise ValueError('The vectors must have equal lengths.')
    distance = np.sqrt(np.sum(np.square(u - v)))
    return distance

In [23]:
for term, term_vector in zip(other_terms, other_term_vecs):
    print('Euclidean distance between root: {} and term: {} is {}'.format(root_term,
                                                                          term,
                                                                          round(euclidean_distance(root_term_vec, 
                                                                                                   term_vector),
                                                                                2)
                                                                          ))

Euclidean distance between root: Believe and term: beleive is 5.66
Euclidean distance between root: Believe and term: bargain is 17.94


ValueError: ignored

In [24]:
import copy
import pandas as pd

def levenshtein_edit_distance(u, v):
    # convert to lower case
    u = u.lower()
    v = v.lower()
    # base cases
    if u == v: return 0
    elif len(u) == 0: return len(v)
    elif len(v) == 0: return len(u)
    # initialize edit distance matrix
    edit_matrix = []
    # initialize two distance matrices 
    du = [0] * (len(v) + 1)
    dv = [0] * (len(v) + 1)
    # du: the previous row of edit distances
    for i in range(len(du)):
        du[i] = i
    # dv : the current row of edit distances    
    for i in range(len(u)):
        dv[0] = i + 1
        # compute cost as per algorithm
        for j in range(len(v)):
            cost = 0 if u[i] == v[j] else 1
            dv[j + 1] = min(dv[j] + 1, du[j + 1] + 1, du[j] + cost)
        # assign dv to du for next iteration
        for j in range(len(du)):
            du[j] = dv[j]
        # copy dv to the edit matrix
        edit_matrix.append(copy.copy(dv))
    # compute the final edit distance and edit matrix    
    distance = dv[len(v)]
    edit_matrix = np.array(edit_matrix)
    edit_matrix = edit_matrix.T
    edit_matrix = edit_matrix[1:,]
    edit_matrix = pd.DataFrame(data=edit_matrix,
                               index=list(v),
                               columns=list(u))
    return distance, edit_matrix

In [25]:
for term in other_terms:
    edit_d, edit_m = levenshtein_edit_distance(root_term, term)
    print('Computing distance between root: {} and term: {}'.format(root_term,
                                                                    term))
    print('Levenshtein edit distance is {}'.format(edit_d))
    print('The complete edit distance matrix is depicted below')
    print(edit_m)
    print('-'*30)

Computing distance between root: Believe and term: beleive
Levenshtein edit distance is 2
The complete edit distance matrix is depicted below
   b  e  l  i  e  v  e
b  0  1  2  3  4  5  6
e  1  0  1  2  3  4  5
l  2  1  0  1  2  3  4
e  3  2  1  1  1  2  3
i  4  3  2  1  2  2  3
v  5  4  3  2  2  2  3
e  6  5  4  3  2  3  2
------------------------------
Computing distance between root: Believe and term: bargain
Levenshtein edit distance is 6
The complete edit distance matrix is depicted below
   b  e  l  i  e  v  e
b  0  1  2  3  4  5  6
a  1  1  2  3  4  5  6
r  2  2  2  3  4  5  6
g  3  3  3  3  4  5  6
a  4  4  4  4  4  5  6
i  5  5  5  4  5  5  6
n  6  6  6  5  5  6  6
------------------------------
Computing distance between root: Believe and term: Elephant
Levenshtein edit distance is 7
The complete edit distance matrix is depicted below
   b  e  l  i  e  v  e
e  1  1  2  3  4  5  6
l  2  2  1  2  3  4  5
e  3  2  2  2  2  3  4
p  4  3  3  3  3  3  4
h  5  4  4  4  4  4  4
a  6 

In [26]:
def boc_term_vectors(word_list):
    word_list = [word.lower() for word in word_list]
    unique_chars = np.unique(
                        np.hstack([list(word) 
                        for word in word_list]))
    word_list_term_counts = [{char: count 
                                  for char, count in np.stack(
                                                         np.unique(list(word), 
                                                                   return_counts=True),
                                                         axis=1)}
                                 for word in word_list]
    
    boc_vectors = [np.array([int(word_term_counts.get(char, 0)) 
                            for char in unique_chars])
                   for word_term_counts in word_list_term_counts]
    return list(unique_chars), boc_vectors

In [27]:
# Bag of characters vectorization
import pandas as pd

feature_names, feature_vectors = boc_term_vectors(terms)
boc_df = pd.DataFrame(feature_vectors, columns=feature_names, index=terms)
print(boc_df)

          a  b  e  g  h  i  l  n  p  r  t  v
Believe   0  1  3  0  0  1  1  0  0  0  0  1
beleive   0  1  3  0  0  1  1  0  0  0  0  1
bargain   2  1  0  1  0  1  0  1  0  1  0  0
Elephant  1  0  2  0  1  0  1  1  1  0  1  0


In [28]:
root_term_boc = boc_df[vec_df.index == root_term].values[0]
other_term_bocs = [boc_df[vec_df.index == term].values[0]
                      for term in other_terms]

In [29]:
def cosine_distance(u, v):
    distance = 1.0 - (np.dot(u, v) / 
                        (np.sqrt(sum(np.square(u))) * np.sqrt(sum(np.square(v))))
                     )
    return distance

In [30]:
for term, boc_term in zip(other_terms, other_term_bocs):
    print('Analyzing similarity between root: {} and term: {}'.format(root_term,
                                                                      term))
    distance = round(cosine_distance(root_term_boc, boc_term), 2)
    similarity = round(1 - distance, 2)                                                           
    print('Cosine distance  is {}'.format(distance))
    print('Cosine similarity  is {}'.format(similarity))
    print('-'*40)

Analyzing similarity between root: Believe and term: beleive
Cosine distance  is -0.0
Cosine similarity  is 1.0
----------------------------------------
Analyzing similarity between root: Believe and term: bargain
Cosine distance  is 0.82
Cosine similarity  is 0.18
----------------------------------------
Analyzing similarity between root: Believe and term: Elephant
Cosine distance  is 0.39
Cosine similarity  is 0.61
----------------------------------------
