## Importing Necesseties

In [1]:
import pandas as pd
import numpy as np

import re
import string

from gensim.models import Word2Vec

from sklearn.metrics.pairwise import cosine_similarity

import json

## Reading Data

In [2]:
df = pd.read_csv('../assets/review_sentences_with_sentiment.csv')

## Text Cleaning

In [3]:
'''
Make text lowercase, remove text in square brackets, remove links, remove HTML tags,
remove punctuation, remove words containing numbers, remove all single characters, 
and substitute multiple spaces with single space.
'''
def clean_text(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    return text

In [4]:
df['cleaned'] = df['sentence'].apply(lambda x:clean_text(x))

# delete row with missing values
df['cleaned'].replace('', np.nan, inplace=True)
df = df.dropna(axis=0)

df.head(5)

Unnamed: 0,sentence,sentiment,cleaned
0,"It's a pretty good, inexpensive casual or busi...",1,its pretty good inexpensive casual or business...
1,"For example, it has no side pleats in back and...",0,for example it has no side pleats in back and ...
2,If you do you'll be disappointed,2,if you do youll be disappointed
3,This is only noticeable to me and only because...,0,this is only noticeable to me and only because...
4,"Also, there are lots of loose threads from pro...",2,also there are lots of loose threads from prod...


In [5]:
corpus = []
for words in df['cleaned']:
    corpus.append(words.split())

## Word Embedding

In [6]:
word2vec_model = Word2Vec(corpus, size = 64, min_count = 2)

## Finding Mean of Word Vector 

In [7]:
def vectors(document_list):
    document_embedding_list = []

    for line in document_list:
        doc2vec = None
        count = 0
        for word in line.split():
            if word in word2vec_model.wv.vocab:
                count += 1
                # Add vector values of all words in the document
                if doc2vec is None:
                    doc2vec = word2vec_model[word]
                else:
                    doc2vec = doc2vec + word2vec_model[word]

        if doc2vec is not None:
            # Divide the vector of all the word vectors by the length of the document
            doc2vec = doc2vec / count
            document_embedding_list.append(doc2vec)

    # Returns a list of document vectors for each document
    return document_embedding_list

In [8]:
# List of keywords you want to analyze in detail
keywords = ['fit', 'color', 'material']

In [9]:
keyword_embedding_list = vectors(keywords)
document_embedding_list = vectors(df['cleaned'])

  doc2vec = word2vec_model[word]
  doc2vec = doc2vec + word2vec_model[word]


## Finding the Most Relevant Reviews by Cosine Similarity

In [10]:
cosine_similarities = cosine_similarity(keyword_embedding_list, document_embedding_list)

In [11]:
def most_relevant_review(review_num):
    for keyword_index in range(len(keywords)):
        sim_scores = list(enumerate(cosine_similarities[keyword_index]))
        sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
        sim_scores = sim_scores[0:review_num]

        # indices of most relevant reviews
        indices = [i[0] for i in sim_scores]
        
        keyword = keywords[keyword_index]
        keyword_reviews = []
        for index in indices:
            review = dict()
            review['id'] = index
            review['sentence'] = df['sentence'].iloc[index]
            review['sentiment'] = df['sentiment'].iloc[index]
            keyword_reviews.append(review)
            
        relevant_reviews[keyword] = keyword_reviews

In [12]:
relevant_reviews = dict()
most_relevant_review(5)

In [13]:
print(relevant_reviews)

{'fit': [{'id': 9469, 'sentence': 'He is 16 and 6 foot 3, so if you are ordering for someone tall be cautious', 'sentiment': '0'}, {'id': 9946, 'sentence': 'Fit great', 'sentiment': '1'}, {'id': 10509, 'sentence': 'The shirt fits Perfect', 'sentiment': '1'}, {'id': 10882, 'sentence': 'remember to size up to make a shirt not feel skin tight 2-3" for tighter fit, 4 - 5 regular fit, 6+ baggy', 'sentiment': '0'}, {'id': 14050, 'sentence': 'but i have never bought a shirt that fits this well', 'sentiment': '1'}], 'color': [{'id': 1427, 'sentence': 'Great shirt - bought it for my husband for a wedding, it wasn_ see-thru and a great color', 'sentiment': '1'}, {'id': 5970, 'sentence': 'Great purchase, materials great, husband is very happy', 'sentiment': '1'}, {'id': 6786, 'sentence': "It's a little tight in the chest", 'sentiment': '2'}, {'id': 252, 'sentence': 'This is a really nice looking shirt', 'sentiment': '1'}, {'id': 10930, 'sentence': "It's a regular fabric dressy shirt with a denim 

In [14]:
# save into json
with open('./relevant_reviews.json', 'w', encoding='utf-8') as make_file:
    json.dump(relevant_reviews, make_file, indent="\t")