## Document Similarity

###### Build function

In [2]:
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
import pandas as pd


def convert_tag(tag):
    """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets"""
   
    tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
    try:
        return tag_dict[tag[0]]
    except KeyError:
        return None


def doc_to_synsets(doc:str):
    """
    Returns a list of synsets in document.
    Example:
        doc_to_synsets('Fish are nvqjp friends.')
        Out: [Synset('fish.n.01'), Synset('be.v.01'), Synset('friend.n.01')]
    """
    
    wordTokens=nltk.word_tokenize(doc)
    wordPOStags=nltk.pos_tag(wordTokens)
    temp=[wn.synsets(token, convert_tag(wordPOStag)) for token,wordPOStag in wordPOStags]
    result=[]
    for i in range (len(temp)):
        if temp[i]:
            result+=[temp[i][0]]
    return result


def similarity_score(s1, s2):
    """
    Calculate the normalized similarity score of s1 onto s2

    Example:
        synsets1 = doc_to_synsets('I like cats')
        synsets2 = doc_to_synsets('I like dogs')
        similarity_score(synsets1, synsets2)
        Out: 0.73333333333333339
    """  
    s=[]
    for i in s1:
        scores=[x for x in [i.path_similarity(j) for j in s2]if x is not None]
        if scores:
            s.append(max(scores))
    
    return sum(s)/len(s)


def document_path_similarity(doc1, doc2):
    """Finds the symmetrical similarity between doc1 and doc2"""

    synsets1 = doc_to_synsets(doc1)
    synsets2 = doc_to_synsets(doc2)

    return (similarity_score(synsets1, synsets2) + similarity_score(synsets2, synsets1)) / 2

###### Test function

In [4]:
txt=''
for message in pd.read_csv('../data/4.Tweet data after clear spams.csv').text: txt+=str(message)
message_to_compare='you should buy Amazon stock'
print('The similarity of all messages and "'+message_to_compare+'" is:', document_path_similarity(txt,message_to_compare))

The similarity of all messages and "you should buy Amazon stock" is: 0.5724405856426191
