<a href="https://colab.research.google.com/github/damzC/nlp/blob/main/Count_Based_Word_Representation_for_Document_Similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook implements **count-based** word representations and document similarity. It demonstrates the **Bag of Words**, **Count Vectorizer** and **TF-IDF** techniques for representing text, which lie at the core of NLP, and shows how they can be used to measure *similarity between documents*.

# **Bag of Words**

In [None]:
# Three short movie reviews used as the toy corpus for the examples below.
# Note that review_1 starts with a capitalised 'The', while the other
# reviews use lowercase 'the'.
review_1 = 'The movie was good and we really like it'
review_2 = 'the movie was good but the ending was boring'
review_3 = 'we did not like the movie as it was too lengthy'

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize

In [None]:
# Split each review into word tokens. Casing is preserved, so 'The' and
# 'the' stay distinct tokens.
# NOTE(review): word_tokenize relies on NLTK tokenizer data; a fresh
# environment may need an nltk.download(...) call first — confirm.
review_1_tokens, review_2_tokens, review_3_tokens = (
    word_tokenize(review) for review in (review_1, review_2, review_3)
)

In [None]:
# Vocabulary: every distinct token that occurs in at least one review.
review_tokens = set(review_1_tokens) | set(review_2_tokens) | set(review_3_tokens)

In [None]:
# Vocabulary size across the three reviews ('The' and 'the' count separately).
len(review_tokens)

18

In [None]:
# Inspect the vocabulary set.
review_tokens

{'The',
 'and',
 'as',
 'boring',
 'but',
 'did',
 'ending',
 'good',
 'it',
 'lengthy',
 'like',
 'movie',
 'not',
 'really',
 'the',
 'too',
 'was',
 'we'}

In [None]:
# Start review 1's count table with a zero entry for every vocabulary token.
review1_dict = {vocab_token: 0 for vocab_token in review_tokens}

In [None]:
# Inspect the zero-initialised count dictionary.
review1_dict

{'good': 0,
 'we': 0,
 'The': 0,
 'ending': 0,
 'but': 0,
 'was': 0,
 'lengthy': 0,
 'it': 0,
 'really': 0,
 'movie': 0,
 'boring': 0,
 'did': 0,
 'and': 0,
 'like': 0,
 'as': 0,
 'the': 0,
 'not': 0,
 'too': 0}

In [None]:
# One independent zero-filled count table per review, all keyed by the
# shared vocabulary so the three tables have the same keys.
review1_dict = {vocab_token: 0 for vocab_token in review_tokens}
review2_dict = {vocab_token: 0 for vocab_token in review_tokens}
review3_dict = {vocab_token: 0 for vocab_token in review_tokens}

In [None]:
# Tally how many times each token occurs in review 1.
# Re-running this cell without resetting review1_dict adds to the counts again.
for token in review_1_tokens:
    review1_dict[token] = review1_dict[token] + 1

In [None]:
# Inspect the token counts for review 1.
review1_dict

{'good': 1,
 'we': 1,
 'The': 1,
 'ending': 0,
 'but': 0,
 'was': 1,
 'lengthy': 0,
 'it': 1,
 'really': 1,
 'movie': 1,
 'boring': 0,
 'did': 0,
 'and': 1,
 'like': 1,
 'as': 0,
 'the': 0,
 'not': 0,
 'too': 0}

In [None]:
# Tally token occurrences for reviews 2 and 3, each into its own count table.
for tokens, counts in (
    (review_2_tokens, review2_dict),
    (review_3_tokens, review3_dict),
):
    for token in tokens:
        counts[token] += 1

In [None]:
# Stack the per-review count tables into one table:
# one row per review, one column per vocabulary token.
reviews_Dict_DF = pd.DataFrame(
    data=[review1_dict, review2_dict, review3_dict],
)

In [None]:
# Display the count table (one row per review).
reviews_Dict_DF

Unnamed: 0,The,and,as,boring,but,did,ending,good,it,lengthy,like,movie,not,really,the,too,was,we
0,1,1,0,0,0,0,0,1,1,0,1,1,0,1,0,0,1,1
1,0,0,0,1,1,0,1,1,0,0,0,1,0,0,2,0,2,0
2,0,0,1,0,0,1,0,0,1,1,1,1,1,0,1,1,1,1


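This is the Term Frequency Matrix (TFM): each row is a review and each column counts how often a vocabulary token occurs in that review. Because the tokens are case-sensitive here, `The` and `the` appear as separate columns.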

# **Count Vectorizer**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Corpus passed to the vectorizers: one string per review.
review_list = [review_1,review_2,review_3]

In [None]:
# Inspect the corpus list.
review_list

['The movie was good and we really like it',
 'the movie was good but the ending was boring',
 'we did not like the movie as it was too lengthy']

In [None]:
# Count vectorizer created with its default settings.
count_vect = CountVectorizer()

In [None]:
# Learn the vocabulary from the reviews and build the document-term count matrix.
X_counts = count_vect.fit_transform(review_list)

In [None]:
# Convert the sparse result to a dense array so the counts can be displayed.
X_counts.toarray()

array([[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1],
       [0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 2, 0, 2, 0],
       [0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1]], dtype=int64)

In [None]:
# Check which container type fit_transform returned.
type(X_counts)

scipy.sparse.csr.csr_matrix

CSR stands for Compressed Sparse Row, a sparse-matrix format that stores only the non-zero entries, organised row by row.

In [None]:
# Vocabulary learned by the vectorizer; used below as the column labels
# for the count matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so it raises AttributeError on current versions. Its replacement,
# `get_feature_names_out()`, returns a NumPy array; `.tolist()` converts it
# back to the plain list of strings the original call produced.
X_names = count_vect.get_feature_names_out().tolist()
X_names

['and',
 'as',
 'boring',
 'but',
 'did',
 'ending',
 'good',
 'it',
 'lengthy',
 'like',
 'movie',
 'not',
 'really',
 'the',
 'too',
 'was',
 'we']

In [None]:
# Label the dense count matrix with the vocabulary terms and display it
# (one row per review, one column per term).
a = pd.DataFrame(X_counts.toarray(),columns=X_names)
a

Unnamed: 0,and,as,boring,but,did,ending,good,it,lengthy,like,movie,not,really,the,too,was,we
0,1,0,0,0,0,0,1,1,0,1,1,0,1,1,0,1,1
1,0,0,1,1,0,1,1,0,0,0,1,0,0,2,0,2,0
2,0,1,0,0,1,0,0,1,1,1,1,1,0,1,1,1,1


# **TF-IDF**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer: lowercases the text, drops English stop words, and
# keeps every term that appears in at least one document (min_df=1).
tf_vect = TfidfVectorizer(min_df=1, lowercase=True, stop_words='english')

In [None]:
# Fit the vectorizer on the reviews and compute their TF-IDF weights,
# then show the result as a dense array.
tf_matrix = tf_vect.fit_transform(review_list)
tf_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.4804584 , 0.        ,
        0.4804584 , 0.37311881, 0.63174505],
       [0.5844829 , 0.        , 0.5844829 , 0.44451431, 0.        ,
        0.        , 0.34520502, 0.        ],
       [0.        , 0.5844829 , 0.        , 0.        , 0.5844829 ,
        0.44451431, 0.34520502, 0.        ]])

In [None]:
# Terms kept by the TF-IDF vectorizer; used below as the column labels
# for the TF-IDF matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so it raises AttributeError on current versions. Its replacement,
# `get_feature_names_out()`, returns a NumPy array; `.tolist()` converts it
# back to the plain list of strings the original call produced.
tf_names = tf_vect.get_feature_names_out().tolist()
tf_names

['boring', 'did', 'ending', 'good', 'lengthy', 'like', 'movie', 'really']

In [None]:
# Check which container type fit_transform returned.
type(tf_matrix)

scipy.sparse.csr.csr_matrix

In [None]:
# Shape is (number of reviews, number of retained terms).
tf_matrix.shape

(3, 8)

In [None]:
# Label the dense TF-IDF matrix with its terms (one row per review).
tf_df = pd.DataFrame(tf_matrix.toarray(),columns=tf_names)

In [None]:
# Display the labelled TF-IDF table.
tf_df

Unnamed: 0,boring,did,ending,good,lengthy,like,movie,really
0,0.0,0.0,0.0,0.480458,0.0,0.480458,0.373119,0.631745
1,0.584483,0.0,0.584483,0.444514,0.0,0.0,0.345205,0.0
2,0.0,0.584483,0.0,0.0,0.584483,0.444514,0.345205,0.0


# **Document Similarity Estimation using TF-IDF**

In [None]:
# Two documents on different topics: natural language processing and cricket.
doc1 = 'Natural Language Processing is the study of making a machine understand and generate languages like humans'
doc2 = 'Cricket is a sports played with a bat and a ball. It is not played in many countries'

In [None]:
# Two more documents: one about languages, one about football.
doc3 = 'Languages are the cornerstone of human evolution. Making a machine study languages is not easy'
doc4 = 'Football is a sport played in almost all countries of the world. It is played by kicking a ball'

In [None]:
# Corpus for the similarity experiment; list index i holds doc(i+1).
documents = [doc1, doc2, doc3, doc4]

In [None]:
# Separate TF-IDF vectorizer for the documents, configured like tf_vect above:
# lowercasing, English stop-word removal, min_df=1.
tf_vect_docs = TfidfVectorizer(min_df=1, lowercase=True, stop_words='english')

In [None]:
# Fit on the four documents and compute their TF-IDF matrix (one row per document).
tf_matrix_docs = tf_vect_docs.fit_transform(documents)

## **Now calculate the document similarities**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity as c_sim

In [None]:
# Cosine similarity between doc1 and doc3 (rows 0 and 2), the two
# language-related documents. Each row is selected with a one-row slice.
c_sim(tf_matrix_docs[0:1], tf_matrix_docs[2:3])

array([[0.34918271]])

In [None]:
# Cosine similarity between doc2 and doc4 (rows 1 and 3), the two
# sports-related documents.
c_sim(tf_matrix_docs[1:2], tf_matrix_docs[3:4])

array([[0.51711443]])

In [None]:
# Cosine similarity between doc2 (cricket) and doc3 (languages), rows 1 and 2.
c_sim(tf_matrix_docs[1:2], tf_matrix_docs[2:3])

array([[0.]])

In [None]:
# Cosine similarity between doc1 (NLP) and doc4 (football), rows 0 and 3.
c_sim(tf_matrix_docs[0:1], tf_matrix_docs[3:4])

array([[0.]])