## Text Similarity

In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
# Import spacy
import spacy

## Text Similarity using spaCy

In [15]:
# Load the 'en_core_web_lg' English pipeline
nlp = spacy.load('en_core_web_lg')

# Run both sentences through the pipeline to obtain the documents to compare.
# NOTE(review): the second sentence has no trailing period, unlike the first
# one -- confirm this difference is intended.
doc1, doc2 = nlp(u'I love pets.'), nlp(u'I hate pets')

# Score the first document against the second and show the result
similarity_score = doc1.similarity(doc2)
print(similarity_score)

0.9041243947777828


## Jaccard Similarity

In [16]:
def jaccard_similarity(sent1, sent2):
    """Compute the Jaccard similarity between two sentences.

    Each sentence is split on whitespace and reduced to its set of unique
    tokens; the score is ``|A & B| / |A | B|``. Tokens are compared exactly,
    so case and attached punctuation matter (``'pets.'`` and ``'pets'`` are
    different tokens).

    Args:
        sent1: First sentence (str).
        sent2: Second sentence (str).

    Returns:
        float: Similarity in ``[0.0, 1.0]``. When neither sentence contains
        any token (both empty or whitespace-only) the sentences are treated
        as identical and ``1.0`` is returned.
    """
    # Tokenize sentences into sets of unique whitespace-separated tokens
    tokens1 = set(sent1.split())
    tokens2 = set(sent2.split())

    union_size = len(tokens1 | tokens2)

    # Two token-less sentences would otherwise divide 0 by 0.
    if union_size == 0:
        return 1.0

    # True division already yields a float; no extra float() needed.
    return len(tokens1 & tokens2) / union_size

In [17]:
# Score the two example sentences; the bare expression is the cell's output
jaccard_similarity(sent1='I love pets.', sent2='I hate pets.')

0.5

## Cosine Similarity

In [18]:
# Import TfidfVectorizer for TF-IDF text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
# Import the cosine_similarity metric
from sklearn.metrics.pairwise import cosine_similarity

# The two example sentences to compare
docs = [
    'I love pets.',
    'I hate pets.',
]

# Create the TF-IDF vectorizer with its default settings
tfidf = TfidfVectorizer()

# Fit the vectorizer on the sentences and turn them into TF-IDF vectors
tfidf_vector = tfidf.fit_transform(docs)

# Pairwise cosine similarity: every sentence is scored against every sentence,
# so the result is a square matrix.
cos_sim = cosine_similarity(X=tfidf_vector, Y=tfidf_vector)
print(cos_sim)

[[1.         0.33609693]
 [0.33609693 1.        ]]
