## Word Similarity

In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read the review CSV into a DataFrame; the trailing expression displays its shape.
amazon = pd.read_csv("amazon_reviews_big.csv")
amazon.shape

(100000, 8)

In [4]:
# Normalise the raw review text: missing reviews become empty strings and
# everything is lower-cased so the character filter below only needs a-z.
docs = amazon['reviewText'].fillna("").str.lower()
# Strip every character that is not a lowercase letter or a space.
# regex=True must be explicit: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the text untouched.
docs = docs.str.replace('[^a-z ]', '', regex=True)

# English stop words plus a few extra words to ignore. The empty string is
# included so that the empty tokens produced by splitting on consecutive
# spaces are discarded as well.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['','work','like','one','use'])
stemmer = nltk.stem.PorterStemmer()

def clean_sentence(text):
    """Return `text` with stop words removed and the remaining words stemmed.

    The text is split on single spaces. A word is dropped when it is a stop
    word before stemming, and a stem is dropped when it coincides with a stop
    word after stemming. The surviving stems are joined with single spaces.

    Relies on the module-level `stopwords` collection and `stemmer` object.
    """
    # Set lookups are O(1); scanning the stop-word list twice for every token
    # of every review is needlessly slow.
    stop_set = frozenset(stopwords)
    stems = (stemmer.stem(word) for word in text.split(' ') if word not in stop_set)
    return ' '.join(stem for stem in stems if stem not in stop_set)
# Run the cleaning function over every review, one element at a time.
docs_clean = docs.map(clean_sentence)

In [5]:
# Build the document-term count matrix, keeping only terms that occur in at
# least 5 documents (min_df=5).
vectorizer = CountVectorizer(min_df=5)
dtm = vectorizer.fit_transform(docs_clean)

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name on older releases.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
# NOTE(review): `toarray()` materialises the full dense documents-x-terms
# matrix, which may need many GB for a corpus this size - confirm memory is
# available, or work on the sparse `dtm` directly.
df_dtm = pd.DataFrame(dtm.toarray(), columns=feature_names)

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Pairwise cosine similarity of the 'camera' and 'quality' columns: each row
# handed to cosine_similarity is one term's counts across all documents.
term_vectors = df_dtm[['camera', 'quality']].values.T
cosine_similarity(term_vectors)

array([[1.        , 0.02874162],
       [0.02874162, 1.        ]])

In [8]:
# The off-diagonal entry of the 2x2 matrix is the similarity between the two terms.
pair_matrix = cosine_similarity(df_dtm[['camera', 'quality']].values.T)
pair_matrix[0, 1]

0.028741624476454487

In [9]:
# Cosine similarity between `word1` and every other term in the vocabulary.
# Everything is computed in a single call on the sparse term-document matrix
# instead of a per-term loop that appends rows: `DataFrame.append` was removed
# in pandas 2.0, and growing a frame row by row is quadratic.
word1 = 'camera'
term_doc = dtm.T.tocsr()                      # rows = terms, columns = documents
word1_pos = df_dtm.columns.get_loc(word1)     # df_dtm columns align with dtm columns
# Slice (rather than integer-index) so the query stays a 2-D, single-row matrix.
word1_sims = cosine_similarity(term_doc[word1_pos:word1_pos + 1], term_doc).ravel()

cos_values = pd.DataFrame({'cos': word1_sims, 'word1': word1, 'word2': df_dtm.columns})
# Drop the self-comparison and renumber the rows 0..n-1.
cos_values = cos_values[cos_values['word2'] != word1].reset_index(drop=True)

In [10]:
# Re-assigns the same query word already set in the previous cell; this has no
# effect when the cells are run in order.
word1 = 'camera'

In [11]:
# Show the five terms with the highest cosine similarity.
(
    cos_values
    .sort_values(by='cos', ascending=False)
    .iloc[:5]
)

Unnamed: 0,cos,word1,word2
13322,0.468569,camera,shoot
14752,0.448427,camera,take
17470,0.44013,camera,zoom
13345,0.425506,camera,shot
10990,0.409816,camera,pictur
