In [32]:
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing
from unidecode import unidecode

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors
from sklearn.cluster import KMeans

from time import time 
from collections import defaultdict


## Data Processing

In [12]:
# Load the raw reviews table from the local `boston` folder.
reviews = pd.read_csv(filepath_or_buffer='boston/reviews.csv')

In [3]:
# Preview the first five rows of the raw table.
reviews.head(n=5)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,1178162,4724140,2013-05-21,4298113,Olivier,My stay at islam's place was really cool! Good...
1,1178162,4869189,2013-05-29,6452964,Charlotte,Great location for both airport and city - gre...
2,1178162,5003196,2013-06-06,6449554,Sebastian,We really enjoyed our stay at Islams house. Fr...
3,1178162,5150351,2013-06-15,2215611,Marine,The room was nice and clean and so were the co...
4,1178162,5171140,2013-06-16,6848427,Andrew,Great location. Just 5 mins walk from the Airp...


In [5]:
def text_to_word_list(text):
    '''
    Lower-case a raw review and split it into a list of tokens.

    text - the review text; non-string values are converted with str(),
           except missing values (None / float NaN), which yield no tokens.

    Returns a list of strings. Letters, digits, "/" and "^" are kept inside
    tokens, "+" becomes the word "plus", "!" and "?" become standalone
    tokens, and every other character acts as a separator.
    '''
    # A missing review must not be stringified: str(nan) would otherwise
    # produce the bogus token "nan" (and str(None) the token "none").
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).lower()

    # Anything outside the allowed character set (including ":") becomes a space.
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"\+", " plus ", text)
    # Commas, full stops and apostrophes are plain separators.
    text = sub(r"[,.']", " ", text)
    # Keep "!" and "?" as tokens of their own.
    text = sub(r"([!?])", r" \1 ", text)

    # split() with no argument already collapses runs of whitespace.
    return text.split()

In [20]:
# Remove the reviewer name and date columns from the working table.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises a KeyError.
reviews = reviews.drop(columns=['reviewer_name', 'date'], errors='ignore')

In [16]:
# Replace every raw comment with its token list.
reviews['comments'] = reviews['comments'].apply(text_to_word_list)

## Word2Vec

In [23]:
# Work on an independent copy so the tokenised `reviews` frame stays untouched.
reviews_model = reviews.copy(deep=True)

In [25]:
# Keep only reviews whose token list has more than one element.
reviews_model = reviews_model.loc[reviews_model['comments'].str.len().gt(1)]

In [26]:
# Token lists of all retained reviews, in frame order.
sent = reviews_model.comments.tolist()

# Fit the phrase detector on the token lists, then wrap it in a Phraser
# before applying it to the corpus.
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]

# Inspect one transformed review.
sentences[1]

['great',
 'location',
 'for',
 'both',
 'airport',
 'and',
 'city',
 'great',
 'amenities',
 'in',
 'the',
 'house',
 'plus',
 'islam',
 'was',
 'always',
 'very',
 'helpful',
 'even_though',
 'he',
 'was',
 'away']

In [29]:
# Configure the Word2Vec model. One core is left free for the notebook itself,
# but the worker count is clamped to at least 1: on a single-core machine
# cpu_count() - 1 would otherwise request zero training workers.
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     size=300,
                     sample=1e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=max(1, multiprocessing.cpu_count() - 1))
start = time()

# Scan the phrase-transformed corpus once to build the vocabulary.
w2v_model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

Time to build vocab: 0.23 mins


In [30]:
start = time()

# Train for 30 epochs over the phrase-transformed corpus.
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)

elapsed_mins = round((time() - start) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_mins))

# Called with replace=True after training is finished.
w2v_model.init_sims(replace=True)

Time to train the model: 6.75 mins


In [31]:
# Persist the trained model as "word2vec.model" in the working directory so the
# clustering section can reload it.
w2v_model.save("word2vec.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## K-means Clustering

In [33]:
# Reload the saved model and keep only its `.wv` attribute (the word vectors).
word_vectors = Word2Vec.load('word2vec.model').wv

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [34]:
# Split the word vectors into two clusters.
# `random_state` is a seed, not a switch: the previous `True` only worked
# because bool is an integer subtype and therefore acted as seed 1. Spelling
# the seed out as an integer keeps the same value and makes the intent explicit.
model = KMeans(
    n_clusters=2,
    max_iter=1000,
    random_state=1,
    n_init=50,
).fit(X=word_vectors.vectors)

In [35]:
# Ten vocabulary entries most similar to the first cluster centre.
word_vectors.similar_by_vector(model.cluster_centers_[0], topn=10)

[('serious_cleaning', 0.8879114985466003),
 ('touristy_spots', 0.8815414309501648),
 ('square_footage', 0.8778349161148071),
 ('maintained_daily', 0.8775303363800049),
 ('jazmyne', 0.8744564056396484),
 ('margarita', 0.8717365264892578),
 ('becouse', 0.8645039200782776),
 ('aiming', 0.8637367486953735),
 ('go_freely', 0.8623305559158325),
 ('quick_stopover', 0.8611665964126587)]

In [36]:
# Cluster 0 is treated as the positive cluster, cluster 1 as the negative one.
positive_cluster_center, negative_cluster_center = (
    model.cluster_centers_[0],
    model.cluster_centers_[1],
)

In [37]:
# One row per vocabulary entry.
words = pd.DataFrame({'words': list(word_vectors.vocab.keys())})

# `word_vectors` is already the model's `.wv` object, so index it directly;
# going through `word_vectors.wv` again is a deprecated self-reference.
words['vectors'] = words.words.apply(lambda w: word_vectors[w])

# Assign every word to a cluster with a single batched predict() call instead
# of one call per word.
words['cluster'] = model.predict(np.vstack(words.vectors.values))

  This is separate from the ipykernel package so we can avoid doing imports until


In [38]:
# Sign of the score: +1 for words in cluster 0, -1 for every other cluster.
words['cluster_value'] = words.cluster.map(lambda label: 1 if label == 0 else -1)

# Magnitude of the score: inverse of the distance to the nearest cluster centre.
words['closeness_score'] = words.apply(
    lambda row: 1 / (model.transform([row.vectors]).min()),
    axis=1,
)

# Signed sentiment coefficient per word.
words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']

In [40]:
# Export the word -> sentiment coefficient table (without the frame index).
words.loc[:, ['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

In [85]:
# Reload the exported dictionary.
# The default NA parsing of read_csv would turn vocabulary entries such as
# "nan", "null" or "n/a" into float NaN, silently dropping their scores from
# the lookup. Default NA handling is therefore disabled, the `words` column is
# forced to str, and only an empty `sentiment_coeff` field counts as missing.
sentiment_map = pd.read_csv(
    'sentiment_dictionary.csv',
    dtype={'words': str},
    keep_default_na=False,
    na_values={'sentiment_coeff': ['']},
)

# word -> signed sentiment coefficient
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

In [47]:
# Turn each token list back into one space-separated string, after applying
# the bigram phraser to it.
reviews_output = reviews_model.copy()
reviews_output['comments'] = reviews_output['comments'].map(
    lambda tokens: ' '.join(bigram[tokens])
)

In [48]:
# Preview the first five phrase-joined reviews.
reviews_output.head(n=5)

Unnamed: 0,listing_id,id,reviewer_id,comments
0,1178162,4724140,4298113,my stay at islam s place was really cool ! goo...
1,1178162,4869189,6452964,great location for both airport and city great...
2,1178162,5003196,6449554,we really_enjoyed our stay at_islams house fro...
3,1178162,5150351,2215611,the room was nice and clean and so were the co...
4,1178162,5171140,6848427,great location just 5_mins walk from the airpo...


In [107]:
# Write the cleaned reviews to disk without the frame index.
reviews_output.to_csv(path_or_buf='review_last.csv', index=False)

## Prediction

In [108]:
# Read the cleaned reviews back; the result gets a fresh default index.
final_file = pd.read_csv(filepath_or_buffer='review_last.csv')

In [109]:
# Whitespace tokenisation keeps the phrase tokens (e.g. "even_though") intact;
# norm=None leaves the tf-idf rows un-normalised.
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)

# Learn the vocabulary / idf weights and transform the corpus in one pass.
transformed = tfidf.fit_transform(final_file.comments)

# `get_feature_names` was removed from newer scikit-learn releases in favour of
# `get_feature_names_out`; prefer the new accessor and fall back to the old one
# so the cell runs on either version.
_feature_name_getter = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
features = pd.Series(_feature_name_getter())

In [111]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for each input sentence x, where each word has assigned its tfidf score

    inspired  by function from this wonderful article:
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

    x - row of dataframe, containing sentences, and their indexes;
        x.name is used as the row position inside transformed_file
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    returns a dict mapping each word present in the sentence to its tfidf score
    '''
    row_coo = transformed_file[x.name].tocoo()
    # Look the column indices up in `features` instead of overwriting the sparse
    # matrix's integer `col` attribute with strings: that attribute is part of
    # the matrix index and is not meant to hold labels.
    words_in_row = features.iloc[row_coo.col].values
    return dict(zip(words_in_row, row_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    '''
    swap every word of a sentence for its tfidf score

    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    returns a list of scores, one per whitespace-separated word of x.comments
    '''
    scores_by_word = create_tfidf_dictionary(x, transformed_file, features)
    return [scores_by_word[f'{token}'] for token in x.comments.split()]

In [112]:

# Row-wise pass over every review; this step takes around 3-4 minutes to calculate.
replaced_tfidf_scores = final_file.apply(
    lambda row: replace_tfidf_words(row, transformed, features),
    axis=1,
)

In [113]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    look up the sentiment score of a word in sentiment_dict;
    words missing from the dictionary score 0
    '''
    return sentiment_dict.get(word, 0)

In [114]:
# For every review, the list of sentiment scores of its whitespace-separated words.
replaced_closeness_scores = final_file.comments.apply(
    lambda comment: [
        replace_sentiment_words(token, sentiment_dict)
        for token in comment.split()
    ]
)

In [121]:
# Stack the id columns, the text and both per-word score lists as rows, then
# transpose so that each review becomes one row.
replacement_df = pd.DataFrame(
    data=[
        final_file.listing_id,
        final_file.id,
        final_file.reviewer_id,
        final_file.comments,
        replaced_closeness_scores,
        replaced_tfidf_scores,
    ]
).T
replacement_df.columns = [
    'listing_id',
    'id',
    'reviewed_id',
    'comments',
    'sentiment_coeff',
    'tfidf_scores',
]

# Review score = dot product of its per-word sentiment and tf-idf lists.
replacement_df['sentiment_rate'] = replacement_df.apply(
    lambda row: np.array(row.loc['sentiment_coeff']) @ np.array(row.loc['tfidf_scores']),
    axis=1,
)

# 1 when the review score is strictly positive, otherwise 0.
replacement_df['prediction'] = replacement_df.sentiment_rate.gt(0).astype('int8')

In [127]:
# Preview the first five scored reviews.
replacement_df.head(n=5)

Unnamed: 0,listing_id,id,reviewed_id,comments,sentiment_coeff,tfidf_scores,sentiment_rate,prediction
0,1178162,4724140,4298113,my stay at islam s place was really cool ! goo...,"[1.0340865918003272, 1.081973735252718, 1.0439...","[2.739815972569313, 1.9027477633752095, 2.3949...",221.908462,1
1,1178162,4869189,6452964,great location for both airport and city great...,"[1.053914608905532, 1.046935095678005, 1.04800...","[3.663483429427597, 2.1612853767894076, 1.8429...",78.089494,1
2,1178162,5003196,6449554,we really_enjoyed our stay at_islams house fro...,"[1.0380807097110565, 1.0655643933965595, 1.029...","[3.856259860314039, 4.848497120190661, 4.93764...",423.442734,1
3,1178162,5150351,2215611,the room was nice and clean and so were the co...,"[1.045809937026322, 1.0431377329714395, 1.0693...","[3.69634184583942, 2.524888895273744, 1.361725...",142.581635,1
4,1178162,5171140,6848427,great location just 5_mins walk from the airpo...,"[1.053914608905532, 1.046935095678005, 1.02714...","[1.8317417147137984, 2.1612853767894076, 6.154...",78.078184,1


In [126]:
# Save the scored reviews (frame index included) to the final results file.
replacement_df.to_csv(path_or_buf='UnsupervisedSentimentAnalysis.csv')