In [741]:
#DingDong

import string
import math
import pandas as pd
import numpy as np

# Toy corpus: four short documents that the search string is compared against.
corpus = [
    "this is the first document. The first document is a great story and it is not boring",
    "This is the second document. The second document is also a great story but the first is better ",
    "this was the third document",
    "Rag-beast document document document document",
]

In [766]:
def _count_words(text):
    """Count how often each word occurs in ``text``.

    The text is split on whitespace; every token is stripped of leading and
    trailing punctuation and lower-cased. Tokens that consist of punctuation
    only (e.g. a lone "-") are skipped instead of being counted as the
    empty word.

    Returns:
        dict mapping word -> number of occurrences.
    """
    counts = {}
    for token in text.split():
        word = token.strip(string.punctuation).lower()
        if not word:
            continue
        counts[word] = counts.get(word, 0) + 1
    return counts


def build_vectors(corpus: list, searchstring: str):
    """Build bag-of-words count vectors for a corpus and a search string.

    Args:
        corpus: list of document strings.
        searchstring: the query text to compare against the documents.

    Returns:
        A tuple ``(counters, combined_dict, vector_list)``:

        * counters: list of ``{word: count}`` dicts, one per document, with
          the counter of the search string appended as the LAST element.
        * combined_dict: set with every distinct word found in the documents
          and the search string (the vocabulary).
        * vector_list: one list of counts per entry in ``counters``. All
          vectors are built by iterating ``combined_dict``, so position k
          refers to the same word in every vector. The order itself is the
          set's iteration order and is therefore arbitrary.
    """
    # Word counters for the documents, followed by the one for the search
    # string, which is kept last so the similarity functions can use vl[-1].
    counters = [_count_words(document) for document in corpus]
    counters.append(_count_words(searchstring))

    # Vocabulary: union of the keys of all counters.
    combined_dict = set().union(*counters)

    # Term-frequency vectors; a word missing from a counter contributes 0.
    # For binary (present / absent) vectors use `1 if word in counter else 0`.
    vector_list = [
        [counter.get(word, 0) for word in combined_dict]
        for counter in counters
    ]

    return counters, combined_dict, vector_list




In [None]:
#Function for finding dot product, taking vector list as parameter
def dotproduct(vl):
    """Dot product of every document vector with the search vector.

    Args:
        vl: list of equal-length numeric vectors; the last element is the
            search vector, all preceding elements are document vectors.

    Returns:
        dict mapping 'Doc1', 'Doc2', ... (1-based position in ``vl``) to the
        dot product of that document vector with the search vector. An empty
        ``vl`` yields an empty dict.
    """
    if not vl:
        return {}
    query = vl[-1]
    # Label by position (enumerate) rather than vl.index(vector): index()
    # returns the first equal vector, so identical documents would share one
    # label and overwrite each other.
    return {
        'Doc' + str(position): sum(n1 * n2 for n1, n2 in zip(vector, query))
        for position, vector in enumerate(vl[:-1], start=1)
    }


This function takes a list of vectors and computes the dot product between the search document vector (the last vector in the list) and each of the corpus document vectors. 

$d$ = components of a corpus document vector,

$s$ = components of the search document vector,

Algebraic definition:

$$ d \cdot s = \displaystyle\sum_{i=1}^{n} d_i s_i = d_1 s_1 + d_2 s_2 + ... + d_n s_n $$


In [None]:
def euclideandistance(vl):
    """Euclidean distance between every document vector and the search vector.

    Args:
        vl: list of equal-length numeric vectors; the last element is the
            search vector, all preceding elements are document vectors.

    Returns:
        dict mapping 'Doc1', 'Doc2', ... (1-based position in ``vl``) to the
        Euclidean distance between that document vector and the search
        vector. An empty ``vl`` yields an empty dict.
    """
    if not vl:
        return {}
    query = vl[-1]
    # Label by position (enumerate) rather than vl.index(vector): index()
    # returns the first equal vector, so identical documents would share one
    # label and overwrite each other.
    return {
        'Doc' + str(position): math.sqrt(
            sum((n1 - n2) ** 2 for n1, n2 in zip(vector, query))
        )
        for position, vector in enumerate(vl[:-1], start=1)
    }

The Euclidean distance function takes a list of vectors and computes the distance between the endpoint of each corpus document vector and the endpoint of the search document vector (the last vector in the list). 


$d$ = components of a corpus document vector,

$s$ = components of the search document vector,


$$ \text{distance}(d,s) = \sqrt{(d_1-s_1)^2+(d_2-s_2)^2+\dots+(d_n-s_n)^2} = \sqrt{\sum_{i=1}^{n}(d_i-s_i)^2} $$

In [None]:
def cosinesimilarity(vl):
    """Cosine similarity between every document vector and the search vector.

    Args:
        vl: list of equal-length numeric vectors; the last element is the
            search vector, all preceding elements are document vectors.

    Returns:
        dict mapping 'Doc1', 'Doc2', ... (1-based position in ``vl``) to
        ``dot(d, s) / (|d| * |s|)``. When either vector has zero length the
        angle is undefined and the similarity is reported as 0.0 instead of
        raising ZeroDivisionError. An empty ``vl`` yields an empty dict.
    """
    if not vl:
        return {}
    query = vl[-1]
    # The search vector's norm is the same for every document: compute it once.
    query_norm = math.sqrt(sum(n ** 2 for n in query))

    cs_dict = {}
    # Label by position (enumerate) rather than vl.index(vector): index()
    # returns the first equal vector, so identical documents would share one
    # label and overwrite each other.
    for position, vector in enumerate(vl[:-1], start=1):
        denominator = math.sqrt(sum(n ** 2 for n in vector)) * query_norm
        if denominator == 0:
            # Empty document or empty search string -> all-zero vector.
            cosine_similarity = 0.0
        else:
            cosine_similarity = sum(n1 * n2 for n1, n2 in zip(vector, query)) / denominator
        cs_dict['Doc' + str(position)] = cosine_similarity
    return cs_dict

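Cosine similarity takes two vectors and computes the cosine of the angle between them. A value of 1 means that the vectors point in the same direction (the documents use the same words in the same proportions), whereas a value of 0 means the vectors are orthogonal (the documents share no words). 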

In [767]:
# Vectorise the corpus together with the search string
counters, combined_dict, vector_list = build_vectors(
    corpus=corpus,
    searchstring='this was the third document',
)

# Wrap every result in a DataFrame so it renders as a table
df_counters = pd.DataFrame(data=counters)
df_vectors = pd.DataFrame(data=vector_list)

# The vocabulary is wrapped in a list so it is shown as a single row
combined_dict_list = [combined_dict]
df_combined = pd.DataFrame(data=combined_dict_list)

In [768]:
# Word counts per counter returned by build_vectors (NaN = word absent)
df_counters

Unnamed: 0,a,also,and,better,boring,but,document,first,great,is,it,not,rag-beast,second,story,the,third,this,was
0,1.0,,1.0,,1.0,,2,2.0,1.0,3.0,1.0,1.0,,,1.0,2.0,,1.0,
1,1.0,1.0,,1.0,,1.0,2,1.0,1.0,3.0,,,,2.0,1.0,3.0,,1.0,
2,,,,,,,1,,,,,,,,,1.0,1.0,1.0,1.0
3,,,,,,,4,,,,,,1.0,,,,,,
4,,,,,,,3,,,,,,,,,3.0,3.0,3.0,3.0


In [769]:
# Combined vocabulary returned by build_vectors, shown as a single row
df_combined

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,boring,is,also,a,it,and,the,document,first,third,was,second,this,but,not,better,rag-beast,great,story


In [770]:
# Count vectors returned by build_vectors, one row per vector
df_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,1,3,0,1,1,1,2,2,2,0,0,0,1,0,1,0,0,1,1
1,0,3,1,1,0,0,3,2,1,0,0,2,1,1,0,1,0,1,1
2,0,0,0,0,0,0,1,1,0,1,1,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,3,3,0,3,3,0,3,0,0,0,0,0,0


In [771]:
# One-row table: dot product per document label
df_dotproduct = pd.DataFrame(data=[dotproduct(vector_list)])
df_dotproduct

Unnamed: 0,Doc1,Doc2,Doc3,Doc4
0,15,18,15,12


In [772]:
# One-row table: Euclidean distance per document label
df_euclideandistance = pd.DataFrame(data=[euclideandistance(vector_list)])
df_euclideandistance

Unnamed: 0,Doc1,Doc2,Doc3,Doc4
0,6.63325,6.557439,4.472136,6.164414


In [773]:
# One-row table: cosine similarity per document label
df_cosinesimilarity = pd.DataFrame(data=[cosinesimilarity(vector_list)])
df_cosinesimilarity

Unnamed: 0,Doc1,Doc2,Doc3,Doc4
0,0.415227,0.460179,1.0,0.433861


In [774]:
# Validate the hand-written measures with numpy (third document vs. search string).
# The results get an `np_` prefix: assigning them to `dotproduct`,
# `euclideandistance` and `cosinesimilarity` would overwrite the functions of
# the same name, and the cells calling those functions could not be re-run.
doc1 = np.array(vector_list[0])
doc2 = np.array(vector_list[1])
doc3 = np.array(vector_list[2])
doc4 = np.array(vector_list[3])
sstr = np.array(vector_list[-1])  # the search string vector is the last one

np_dotproduct = np.dot(doc3, sstr)
np_euclideandistance = np.linalg.norm(doc3 - sstr)
np_cosinesimilarity = np_dotproduct / (np.linalg.norm(doc3) * np.linalg.norm(sstr))

print(np_euclideandistance)
print(np_dotproduct)
print(np_cosinesimilarity)

4.47213595499958
15
0.9999999999999999
