# In [1]:
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
import pandas as pd

# In [9]:
def convert_tag(tag):
    """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets.

    Only the first character of the tag is inspected: 'N' -> 'n', 'J' -> 'a',
    'R' -> 'r' and 'V' -> 'v'.

    Args:
        tag: part-of-speech tag string (e.g. 'NN', 'VBZ').

    Returns:
        The matching one-letter WordNet tag, or None when the tag is empty,
        None, or starts with a character that has no mapping.
    """
    tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}

    # An empty/None tag has no first character; treat it like any other
    # unmapped tag instead of raising.
    if not tag:
        return None
    return tag_dict.get(tag[0])

# In [21]:
def doc_to_synsets(doc):
    """
    Returns a list of synsets in document.

    Tokenizes and tags the words in the document doc.
    Then finds the first synset for each word/tag combination.
    If a synset is not found for that combination it is skipped.

    Args:
        doc: the document, either a raw string (tokenized here with
            nltk.word_tokenize) or an iterable of already-tokenized words.

    Returns:
        list of synsets, in document order

    Example:
        doc_to_synsets('Fish are nvqjp friends.')
        Out: [Synset('fish.n.01'), Synset('be.v.01'), Synset('friend.n.01')]
    """
    # nltk.pos_tag works on a sequence of tokens, so a raw string has to be
    # tokenized first; anything else is taken to be tokens already.
    if isinstance(doc, str):
        tokens = nltk.word_tokenize(doc)
    else:
        tokens = list(doc)

    synsets = []
    for word, tag in nltk.pos_tag(tokens):
        # Query WordNet once per token; convert_tag may return None, which is
        # passed straight through as the part-of-speech argument.
        candidates = wn.synsets(word, convert_tag(tag))
        if candidates:
            synsets.append(candidates[0])

    return synsets

# In [10]:
def similarity_score(s1, s2):
    """
    Calculate the normalized similarity score of s1 onto s2

    For each synset in s1, finds the synset in s2 with the largest similarity value.
    Sum of all of the largest similarity values and normalize this value by dividing it by the
    number of largest similarity values found.

    Args:
        s1, s2: list of synsets from doc_to_synsets

    Returns:
        normalized similarity score of s1 onto s2, or 0.0 when no similarity
        value could be found at all (s1 or s2 empty, or every pair compared
        to None)

    Example:
        synsets1 = doc_to_synsets('I like cats')
        synsets2 = doc_to_synsets('I like dogs')
        similarity_score(synsets1, synsets2)
        Out: 0.73333333333333339
    """
    best_scores = []
    for synset in s1:
        # None results from path_similarity are ignored; default=None covers
        # the case where nothing in s2 yields a usable score.
        best = max(
            (
                score
                for score in (synset.path_similarity(other) for other in s2)
                if score is not None
            ),
            default=None,
        )
        if best is not None:
            best_scores.append(best)

    # Nothing comparable: report zero similarity rather than dividing by zero.
    if not best_scores:
        return 0.0

    return sum(best_scores) / len(best_scores)

# In [11]:
def document_path_similarity(doc1, doc2):
    """Compute the symmetric path similarity of two documents.

    Both documents are converted with doc_to_synsets, similarity_score is
    evaluated in each direction, and the two directional scores are averaged.

    Args:
        doc1, doc2: documents in whatever form doc_to_synsets accepts

    Returns:
        mean of similarity_score(doc1 -> doc2) and similarity_score(doc2 -> doc1)
    """
    first = doc_to_synsets(doc1)
    second = doc_to_synsets(doc2)

    forward = similarity_score(first, second)
    backward = similarity_score(second, first)

    return (forward + backward) / 2

# In [17]:
# First document: the 'austen-emma.txt' file from NLTK's Gutenberg corpus.
# NOTE(review): presumably .words() yields word tokens, not a raw string — confirm.
doc1 = nltk.corpus.gutenberg.words('austen-emma.txt')

# In [18]:
# Second document: the 'austen-sense.txt' file from NLTK's Gutenberg corpus.
# NOTE(review): presumably .words() yields word tokens, not a raw string — confirm.
doc2 = nltk.corpus.gutenberg.words('austen-sense.txt')

# In [None]:
# Symmetric similarity between the two corpus documents.
# NOTE(review): similarity_score compares each synset of one document with
# every synset of the other, so this may be very slow on large inputs — confirm.
document_path_similarity(doc1, doc2)

# In [None]:
import gensim
from sklearn.feature_extraction.text import CountVectorizer