In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the preprocessed English reviews (one row per review) from JSON.
english_reviews_df = pd.read_json(
    path_or_buf='preprocessed_english_reviews_in_setences.json'
)
# Preview the first five rows to sanity-check the available columns.
english_reviews_df.head(5)

Unnamed: 0,comments,comments_cleaned,comments_l,comments_token,comments_token_str,date,id,index,language,listing_id,probability,review_length,reviewer_id,reviewer_name,sentence_length,sentences,sentences_cleaned,sentences_normalised
0,Daniel is really cool. The place was nice and ...,daniel really cool place nice clean quiet neig...,daniel is really cool. the place was nice and ...,"[daniel, realli, cool, place, nice, clean, qui...",daniel realli cool place nice clean quiet neig...,2009-03-30,1191,0,en,2818,0.992321,250,10952,Lam,46,"[Daniel is really cool., The place was nice an...","[[daniel, really, cool], [place, nice, clean],...","[[daniel, realli, cool], [place, nice, clean],..."
1,Daniel is the most amazing host! His place is ...,daniel amazing host place extremely clean prov...,daniel is the most amazing host! his place is ...,"[daniel, amaz, host, place, extrem, clean, pro...",daniel amaz host place extrem clean provid eve...,2009-04-24,1771,1,en,2818,0.991255,334,12798,Alice,58,"[Daniel is the most amazing host!, His place i...","[[daniel, amazing, host], [place, extremely, c...","[[daniel, amaz, host], [place, extrem, clean, ..."
2,We had such a great time in Amsterdam. Daniel ...,great time amsterdam daniel excellent host fri...,we had such a great time in amsterdam. daniel ...,"[great, time, amsterdam, daniel, excel, host, ...",great time amsterdam daniel excel host friend ...,2009-05-03,1989,2,en,2818,0.995635,400,11869,Natalja,66,"[We had such a great time in Amsterdam., Danie...","[[great, time, amsterdam], [daniel, excellent,...","[[great, time, amsterdam], [daniel, excel, hos..."
3,Very professional operation. Room is very clea...,professional operation room clean comfortable ...,very professional operation. room is very clea...,"[profession, oper, room, clean, comfort, locat...",profession oper room clean comfort locat close...,2009-05-18,2797,3,en,2818,0.99071,203,14064,Enrique,38,"[Very professional operation., Room is very cl...","[[professional, operation], [room, clean, comf...","[[profession, oper], [room, clean, comfort], [..."
4,Daniel is highly recommended. He provided all...,daniel highly recommended provided necessities...,daniel is highly recommended. he provided all...,"[daniel, high, recommend, provid, necess, actu...",daniel high recommend provid necess actual wen...,2009-05-25,3151,4,en,2818,0.967161,277,17977,Sherwin,52,"[Daniel is highly recommended., He provided al...","[[daniel, highly, recommended], [provided, nec...","[[daniel, high, recommend], [provid, necess, a..."


## Remove cancellation messages

In [6]:
# Rows whose review text contains the automated cancellation notice.
# regex=False matches the phrase literally instead of compiling it as a
# regular expression; na=False treats a missing comment as "no match" so the
# boolean mask never contains NaN (which would make the indexing raise).
automated_message = english_reviews_df[
    english_reviews_df.comments.str.contains(
        "The reservation was canceled", regex=False, na=False
    )
]
# Shape of the full frame next to the shape of the automated-message subset.
english_reviews_df.shape, automated_message.shape

((355296, 18), (639, 18))

In [18]:
# Keep every review that does NOT contain the automated cancellation notice.
# regex=False matches the phrase literally; na=False makes a missing comment
# count as "not automated", so such rows are kept and the negated mask stays
# purely boolean (a NaN in the mask would make the indexing raise).
removed_automated_message = english_reviews_df[
    ~english_reviews_df.comments.str.contains(
        "The reservation was canceled", regex=False, na=False
    )
]
# Shape of the filtered frame, for comparison with the unfiltered one.
removed_automated_message.shape

(354657, 18)

## TF-IDF

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE: despite its name, `count_vectorizer` is a TF-IDF vectorizer, so
# `bag_of_words` holds TF-IDF weights rather than raw term counts.
count_vectorizer = TfidfVectorizer(
    # --- tokenisation ---
    analyzer='word',
    ngram_range=(1, 1),
    preprocessor=None,
    strip_accents='unicode',
    # --- vocabulary pruning ---
    min_df=10,            # drop terms that appear in fewer than 10 documents
    max_features=5000,    # cap the vocabulary size
    # --- weighting ---
    use_idf=True,
    smooth_idf=True,
)

# Learn the vocabulary from the pre-tokenised review strings and build the
# document-term matrix in a single pass.
bag_of_words = count_vectorizer.fit_transform(
    removed_automated_message['comments_token_str']
)
print(bag_of_words.shape)

(354657, 5000)


In [20]:
# Peek at the first five terms of the fitted vectorizer's vocabulary.
count_vectorizer.get_feature_names()[:5]

['aafk', 'aart', 'abil', 'abit', 'abl']

In [21]:
# Echo the vectorizer so the notebook records its full parameter set,
# including the values that were left at their defaults.
count_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=10,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [22]:
# Percentage of cells in the document-term matrix that hold a positive value.
# The count is taken directly on the sparse matrix: converting it to a dense
# array first would allocate rows x columns floats only to count the
# non-zero entries. (No dense copy, `dense_bow`, is created any more.)
n_rows, n_cols = bag_of_words.shape
positive_cells = (bag_of_words > 0).count_nonzero()
print("Sparsicity: ", (positive_cells / (n_rows * n_cols)) * 100, "%")

Sparsicity:  0.48863933321490904 %


## LDA

In [23]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Tweak the two parameters below
number_topics = 5   # number of topics the model extracts
number_words = 10   # number of top terms shown per topic

# Select the tokenised reviews of listing 2818 and vectorize them with the
# vocabulary that was already fitted on the whole corpus.
listing_2818 = removed_automated_message.loc[
    removed_automated_message['listing_id'] == 2818, 'comments_token_str'
]
bag_of_words_2818 = count_vectorizer.transform(listing_2818)

# NOTE(review): the model is fitted on TF-IDF weights, not raw term counts;
# confirm this is intentional for LDA.
lda = LDA(
    n_components=number_topics,
    learning_method='online',
    n_jobs=-1,
    random_state=19,
)

# Fit on this single listing; as the last expression it also displays the
# fitted estimator's parameters.
lda.fit(bag_of_words_2818)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=5, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=19, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [13]:
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted vocabulary terms of every topic in a model.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        ``components_`` is iterated row by row; each row holds one weight per
        vocabulary term and must support ``argsort()``.
    count_vectorizer : fitted vectorizer
        Supplies the vocabulary through ``get_feature_names_out()`` when it
        exists, otherwise through the legacy ``get_feature_names()``.
    n_top_words : int
        Number of terms to print for each topic.

    Returns
    -------
    None
        The topics are written to standard output.
    """
    # Newer scikit-learn releases only provide get_feature_names_out();
    # older ones only provide get_feature_names(). Support both.
    get_names = getattr(count_vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names
    # Normalise to plain `str` so the printed list looks the same whether the
    # vocabulary arrives as a Python list or as a NumPy array.
    words = [str(word) for word in get_names()]
    for topic_idx, topic in enumerate(model.components_):
        print(f"\nTopic #{topic_idx+1}:")
        # argsort() is ascending; the reversed slice walks the last
        # n_top_words indices backwards, i.e. highest weight first.
        print([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]])

In [24]:
# Show the top `number_words` terms of each topic in the LDA model that was
# fitted on the reviews of listing 2818.
print_topics(lda, count_vectorizer, number_words)


Topic #1:
['daniel', 'provid', 'map', 'came', 'stay', 'amsterdam', 'travel', 'took', 'late', 'would']

Topic #2:
['daniel', 'everyth', 'good', 'kind', 'well', 'part', 'clean', 'comfort', 'also', 'guest']

Topic #3:
['daniel', 'host', 'stay', 'room', 'clean', 'place', 'great', 'help', 'amsterdam', 'map']

Topic #4:
['daniel', 'jouri', 'worthi', 'chang', 'session', 'vacat', 'overal', 'weed', 'scare', 'classi']

Topic #5:
['daniel', 'stay', 'get', 'room', 'host', 'amsterdam', 'provid', 'apart', 'also', 'come']


In [12]:
# NOTE(review): `lda` is fitted on listing 2818 only, while both metrics below
# are computed on the full-corpus matrix `bag_of_words` - confirm that
# evaluating on the whole corpus is intended.

# Log-likelihood: the higher, the better.
log_likelihood = lda.score(bag_of_words)
print("Log Likelihood: ", log_likelihood)

# Perplexity: the lower, the better. Perplexity = exp(-1. * log-likelihood per word)
perplexity = lda.perplexity(bag_of_words)
print("Perplexity: ", perplexity)

Log Likelihood:  -11094041.80061268
Perplexity:  1684.9470294917076
