Copyright 2021 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

# Character to Vector Example

## Function: char2vec
This function: <br>
1) Counts the characters in the word <br>
2) Computes a set of unique characters <br>
3) Computes the length (Euclidean norm) of the character-count vector in "character space"

In [None]:
def char2vec(word):
    """Describe ``word`` as a vector of character counts.

    Returns:
        A 3-tuple ``(counts, chars, norm)`` where ``counts`` is a Counter
        mapping each character to its number of occurrences, ``chars`` is
        the set of distinct characters, and ``norm`` is the Euclidean
        length of the count vector.
    """
    from collections import Counter
    from math import sqrt

    # Tally how often every character appears.
    char_counts = Counter(word)

    # The distinct characters are exactly the keys of the tally.
    unique_chars = set(char_counts.keys())

    # Euclidean norm: square root of the sum of squared counts.
    squared_total = 0
    for occurrences in char_counts.values():
        squared_total += occurrences * occurrences
    vector_length = sqrt(squared_total)

    return (char_counts, unique_chars, vector_length)

## Function: cosdis
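This function: Computes the cosine similarity in character-space between two char2vec vectors (this notebook labels the score "cosine distance"). A score near 1 means the two words have nearly the same character makeup; a score of 0 means they share no characters.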

In [None]:
def cosdis(v1, v2):
    """Return the cosine similarity of two character-count vectors.

    Despite the name, the value is a similarity: the dot product of the
    two count vectors divided by the product of their norms, so a larger
    score means the inputs are more alike.

    Args:
        v1: Indexable whose item 0 maps characters to counts, item 1 is the
            set of characters present, and item 2 is the vector norm.
        v2: Second vector, same layout as ``v1``.

    Returns:
        The similarity score, or ``0.0`` when either vector has zero norm
        (e.g. it was built from an empty word), where the ratio is
        otherwise undefined.
    """
    counts1, chars1, norm1 = v1[0], v1[1], v1[2]
    counts2, chars2, norm2 = v2[0], v2[1], v2[2]

    # A zero-length vector has no direction; report "no similarity"
    # instead of raising ZeroDivisionError.
    if not norm1 or not norm2:
        return 0.0

    # Only characters present in both words contribute to the dot product.
    common = chars1.intersection(chars2)
    return sum(counts1[ch] * counts2[ch] for ch in common) / norm1 / norm2

## Display Code

In [None]:
# Three sample "words" to compare
a = 'TheBrownFoxJumpsOverTheLazyDog'
b = 'TheWhiteFoxJumpsOverTheLazyDog'
c = 'SupermanJumpsOverTheTallBuilding'
wordlist = [a, b, c]

# Build and print a description of the characters contained in each word.
# A dictionary stands in for a sparse matrix here; the concept is identical.
char2vecdict = {}
for word in wordlist:
    vector = char2vec(word)
    char2vecdict[word] = vector
    print(word, vector, sep='\n', end='\n\n')

# Score every pairing of the vectors created above (plus one word against
# itself). Similar sentences receive higher scores, ranging from 0 to 1.
comparisons = [
    ("\n\nCosine Distance between", a, b),
    ("\nCosine Distance between", b, c),
    ("\nCosine Distance between", c, a),
    ("\nCosine Distance between", a, a),
]
for label, first, second in comparisons:
    score = cosdis(char2vecdict[first], char2vecdict[second])
    print(label, first, "and", second, "=", score)


TheBrownFoxJumpsOverTheLazyDog
(Counter({'e': 3, 'o': 3, 'T': 2, 'h': 2, 'r': 2, 'B': 1, 'w': 1, 'n': 1, 'F': 1, 'x': 1, 'J': 1, 'u': 1, 'm': 1, 'p': 1, 's': 1, 'O': 1, 'v': 1, 'L': 1, 'a': 1, 'z': 1, 'y': 1, 'D': 1, 'g': 1}), {'n', 'T', 'v', 'w', 'h', 'a', 'z', 'F', 's', 'e', 'D', 'B', 'y', 'o', 'r', 'L', 'x', 'm', 'g', 'u', 'O', 'p', 'J'}, 6.928203230275509)

TheWhiteFoxJumpsOverTheLazyDog
(Counter({'e': 4, 'h': 3, 'T': 2, 'o': 2, 'W': 1, 'i': 1, 't': 1, 'F': 1, 'x': 1, 'J': 1, 'u': 1, 'm': 1, 'p': 1, 's': 1, 'O': 1, 'v': 1, 'r': 1, 'L': 1, 'a': 1, 'z': 1, 'y': 1, 'D': 1, 'g': 1}), {'T', 'W', 'v', 'h', 'a', 'z', 'F', 's', 'e', 'D', 'y', 'o', 'r', 'L', 'x', 'm', 'g', 'u', 'O', 'i', 't', 'p', 'J'}, 7.211102550927978)

SupermanJumpsOverTheTallBuilding
(Counter({'u': 3, 'e': 3, 'l': 3, 'p': 2, 'r': 2, 'm': 2, 'a': 2, 'n': 2, 'T': 2, 'i': 2, 'S': 1, 'J': 1, 's': 1, 'O': 1, 'v': 1, 'h': 1, 'B': 1, 'd': 1, 'g': 1}), {'r', 'a', 'n', 'T', 'm', 's', 'e', 'u', 'O', 'v', 'B', 'i', 'd', 'S', 'g',