# Word Representations and Text/Document Similarity (Count Based)

## Bag of Words

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize

# word_tokenize needs the Punkt sentence-tokenizer data at call time.
# Newer NLTK releases (3.9 and later) look up the 'punkt_tab' resource instead
# of the pickled 'punkt' models, so fetch both to keep the notebook runnable
# on old and new installations alike.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Toy corpus: three one-sentence movie reviews reused by every section below.
review_1, review_2, review_3 = (
    'The movie was good and we really like it',
    'the movie was good but the ending was boring',
    'we did not like the movie as it was too lengthy',
)

In [None]:
# Split each review into word-level tokens with NLTK's word_tokenize.
review_1_tokens, review_2_tokens, review_3_tokens = (
    word_tokenize(text) for text in (review_1, review_2, review_3)
)

In [None]:
# Token list (repeats kept) followed by the distinct tokens of review 2.
print(review_2_tokens, set(review_2_tokens), sep='\n')

['the', 'movie', 'was', 'good', 'but', 'the', 'ending', 'was', 'boring']
{'good', 'movie', 'ending', 'was', 'boring', 'the', 'but'}


In [None]:
# Vocabulary: every distinct token that occurs in any of the three reviews.
review_tokens = set(review_1_tokens) | set(review_2_tokens) | set(review_3_tokens)

In [None]:
# Vocabulary size on the first line, its members on the second.
print(len(review_tokens), review_tokens, sep='\n')

18
{'good', 'ending', 'boring', 'we', 'like', 'did', 'too', 'really', 'movie', 'and', 'lengthy', 'was', 'The', 'the', 'but', 'as', 'not', 'it'}


In [None]:
# One zeroed counter per vocabulary token; filled with review 1's counts later.
review1_dict = {token: 0 for token in review_tokens}

In [None]:
# Inspect review 1's count dictionary (vocabulary token -> count).
review1_dict

In [None]:
# Zeroed vocabulary templates for reviews 2 and 3 (separate dict objects).
review2_dict = {token: 0 for token in review_tokens}
review3_dict = {token: 0 for token in review_tokens}
print(review2_dict, review3_dict, sep='\n')

{'good': 0, 'ending': 0, 'boring': 0, 'we': 0, 'like': 0, 'did': 0, 'too': 0, 'really': 0, 'movie': 0, 'and': 0, 'lengthy': 0, 'was': 0, 'The': 0, 'the': 0, 'but': 0, 'as': 0, 'not': 0, 'it': 0}
{'good': 0, 'ending': 0, 'boring': 0, 'we': 0, 'like': 0, 'did': 0, 'too': 0, 'really': 0, 'movie': 0, 'and': 0, 'lengthy': 0, 'was': 0, 'The': 0, 'the': 0, 'but': 0, 'as': 0, 'not': 0, 'it': 0}


In [None]:
# review_1 -> The movie was good and we really like it
# Rebuild the zeroed template first so this cell is idempotent: without the
# reset, re-running it would add review 1's counts on top of the previous run.
review1_dict = dict.fromkeys(review_tokens, 0)
for token in review_1_tokens:
    review1_dict[token] += 1
print(review1_dict)

{'good': 1, 'ending': 0, 'boring': 0, 'we': 1, 'like': 1, 'did': 0, 'too': 0, 'really': 1, 'movie': 1, 'and': 1, 'lengthy': 0, 'was': 1, 'The': 1, 'the': 0, 'but': 0, 'as': 0, 'not': 0, 'it': 1}


In [None]:
# Inspect review 1's count dictionary (vocabulary token -> count).
review1_dict

In [None]:
# Rebuild both zeroed templates before counting so the cell is idempotent:
# without the reset, every re-run would increment the existing counts again.
review2_dict = dict.fromkeys(review_tokens, 0)
review3_dict = dict.fromkeys(review_tokens, 0)

# Term frequencies of review 2.
for token in review_2_tokens:
    review2_dict[token] += 1

# Term frequencies of review 3.
for token in review_3_tokens:
    review3_dict[token] += 1

In [None]:
# Per-review term counts, one dictionary per line.
print(review1_dict, review2_dict, review3_dict, sep='\n')

{'good': 1, 'ending': 0, 'boring': 0, 'we': 1, 'like': 1, 'did': 0, 'too': 0, 'really': 1, 'movie': 1, 'and': 1, 'lengthy': 0, 'was': 1, 'The': 1, 'the': 0, 'but': 0, 'as': 0, 'not': 0, 'it': 1}
{'good': 1, 'ending': 1, 'boring': 1, 'we': 0, 'like': 0, 'did': 0, 'too': 0, 'really': 0, 'movie': 1, 'and': 0, 'lengthy': 0, 'was': 2, 'The': 0, 'the': 2, 'but': 1, 'as': 0, 'not': 0, 'it': 0}
{'good': 0, 'ending': 0, 'boring': 0, 'we': 1, 'like': 1, 'did': 1, 'too': 1, 'really': 0, 'movie': 1, 'and': 0, 'lengthy': 1, 'was': 1, 'The': 0, 'the': 1, 'but': 0, 'as': 1, 'not': 1, 'it': 1}


In [None]:
# review_1 = 'The movie was good and we really like it'
# review_2 = 'the movie was good but the ending was boring'
# review_3 = 'we did not like the movie as it was too lengthy'

In [None]:
# Term-frequency matrix: one row per review, one column per vocabulary token.
reviews_Dict_DF = pd.DataFrame(
    data=[review1_dict, review2_dict, review3_dict],
)

In [None]:
# Display the hand-built term-frequency matrix (rows = reviews, columns = tokens).
reviews_Dict_DF

Unnamed: 0,good,ending,boring,we,like,did,too,really,movie,and,lengthy,was,The,the,but,as,not,it
0,1,0,0,1,1,0,0,1,1,1,0,1,1,0,0,0,0,1
1,1,1,1,0,0,0,0,0,1,0,0,2,0,2,1,0,0,0
2,0,0,0,1,1,1,1,0,1,0,1,1,0,1,0,1,1,1


# Count Vectorizer
scikit-learn's `CountVectorizer` builds the Term Frequency Matrix (TFM) of a Bag-of-Words model directly from raw text.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# The corpus as a list of raw strings, the input shape the vectorizers are fitted on.
review_list = [
    review_1,
    review_2,
    review_3,
]

In [None]:
# Show the three raw review strings that make up the corpus.
print(review_list)

['The movie was good and we really like it', 'the movie was good but the ending was boring', 'we did not like the movie as it was too lengthy']


In [None]:
# Alternative that keeps every token, stop words included:
# count_vect = CountVectorizer()
# Active choice: filter out words on scikit-learn's built-in English stop-word list.
count_vect = CountVectorizer(stop_words='english')

In [None]:
# Learn the vocabulary from the corpus and build its document-term count matrix in one step.
X_counts = count_vect.fit_transform(review_list)

In [None]:
# Check the container type returned by fit_transform.
type(X_counts)

scipy.sparse.csr.csr_matrix

CSR stands for Compressed Sparse Row: a row-oriented storage format for sparse matrices
(matrices in which most entries are zero) that keeps only the non-zero values.

In [None]:
# Convert the sparse matrix to a dense NumPy array so the counts can be read directly.
X_counts.toarray()

array([[0, 0, 0, 1, 0, 1, 1, 1],
       [1, 0, 1, 1, 0, 0, 1, 0],
       [0, 1, 0, 0, 1, 1, 1, 0]])

In [None]:
# Column labels of X_counts, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() where available and fall back on older releases.
# .tolist() turns the returned array into a plain list of Python strings.
if hasattr(count_vect, 'get_feature_names_out'):
    X_names = count_vect.get_feature_names_out().tolist()
else:
    X_names = count_vect.get_feature_names()
X_names

['boring', 'did', 'ending', 'good', 'lengthy', 'like', 'movie', 'really']

In [None]:
# Labelled view of the CountVectorizer output: one row per review, one column per term.
a = pd.DataFrame(
    data=X_counts.toarray(),
    columns=X_names,
)
a

Unnamed: 0,boring,did,ending,good,lengthy,like,movie,really
0,0,0,0,1,0,1,1,1
1,1,0,1,1,0,0,1,0
2,0,1,0,0,1,1,1,0


In [None]:
# review_1 = 'The movie was good and we really like it'
# review_2 = 'the movie was good but the ending was boring'
# review_3 = 'we did not like the movie as it was too lengthy'

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer: lowercases the text, drops English stop words and keeps
# every term that appears in at least one document (min_df=1).
tf_vect = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    min_df=1,
)

In [None]:
# Reminder of the corpus the TF-IDF vectorizer is fitted on.
review_list

['The movie was good and we really like it',
 'the movie was good but the ending was boring',
 'we did not like the movie as it was too lengthy']

In [None]:
# Learn the vocabulary and IDF weights from the corpus and return its TF-IDF matrix.
tf_matrix = tf_vect.fit_transform(review_list)

In [None]:
# Check the container type returned by fit_transform.
type(tf_matrix)

scipy.sparse.csr.csr_matrix

In [None]:
# (number of documents, number of vocabulary terms)
tf_matrix.shape

(3, 8)

In [None]:
# Column labels of tf_matrix, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() where available and fall back on older releases.
# .tolist() turns the returned array into a plain list of Python strings.
if hasattr(tf_vect, 'get_feature_names_out'):
    tf_names = tf_vect.get_feature_names_out().tolist()
else:
    tf_names = tf_vect.get_feature_names()

In [None]:
# Display the terms that label the TF-IDF matrix columns.
tf_names

['boring', 'did', 'ending', 'good', 'lengthy', 'like', 'movie', 'really']

In [None]:
# Labelled view of the TF-IDF weights: one row per review, one column per term.
tf_df = pd.DataFrame(
    data=tf_matrix.toarray(),
    columns=tf_names,
)

In [None]:
# Display the TF-IDF weights per review and term.
tf_df

Unnamed: 0,boring,did,ending,good,lengthy,like,movie,really
0,0.0,0.0,0.0,0.480458,0.0,0.480458,0.373119,0.631745
1,0.584483,0.0,0.584483,0.444514,0.0,0.0,0.345205,0.0
2,0.0,0.584483,0.0,0.0,0.584483,0.444514,0.345205,0.0


In [None]:
# For comparison with the TF-IDF weights: the raw term counts produced
# earlier by CountVectorizer (DataFrame `a`).
a

Unnamed: 0,boring,did,ending,good,lengthy,like,movie,really
0,0,0,0,1,0,1,1,1
1,1,0,1,1,0,0,1,0
2,0,1,0,0,1,1,1,0
