# AirBnB Reviews Topic Modelling: Full Review Tokens

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

## Load Data

In [2]:
def load_review_data(directory):
    """Load Review Data"""
    reviews_df = pd.read_csv(directory + 'interim/review_wrangled.csv', sep=';', lineterminator='\n').drop(columns=['Unnamed: 0'])
    
    return reviews_df

In [3]:
# Select City
country = 'united-states'
city = 'san-francisco'

# Directory
directory = '../data/' + country + '/' + city + '/'

# Load Data
reviews_df = load_review_data(directory)

In [4]:
reviews_df.head(3)

Unnamed: 0,listing_id,id,date,comments,tokens,tokens_count,name_entities,name_entities_count,comments_no_ne,no_ne_tokens,no_ne_tokens_count,nouns,nouns_counts,verbs,verbs_counts,adjectives,adjectives_counts
0,958,5977,2009-07-23,"Our experience was, without a doubt, a five st...","['experience', 'without', 'doubt', 'five', 'st...",47,"['David', 'Haight', 'Castro', 'Golden Gate Par...",5,"Our experience was, without a doubt, a five st...","['experience', 'without', 'doubt', 'five', 'st...",39,"['experience', 'doubt', 'star', 'experience', ...",30,"[u'be', u'be', 'accomodating', u'honor', u'be'...",10,"['consummate', 'full', 'perfect', 'full', 'clo...",6
1,958,6660,2009-08-03,Returning to San Francisco is a rejuvenating t...,"['returning', 'san', 'francisco', 'rejuvenatin...",36,"['San Francisco', 'Holly', 'David']",3,Returning to is a rejuvenating thrill but thi...,"['returning', 'rejuvenating', 'thrill', 'time'...",32,"['san', 'francisco', 'rejuvenating', 'thrill',...",19,"[u'return', u'be', u'be', u'enhance', u'renova...",14,"['great', 'local', 'such', 'amenable']",4
2,958,11519,2009-09-27,We were very pleased with the accommodations a...,"['pleased', u'accommodation', 'friendly', 'nei...",67,"['David', 'Haight Street', 'Castro Street']",3,We were very pleased with the accommodations a...,"['pleased', u'accommodation', 'friendly', 'nei...",62,"[u'accommodation', 'neighborhood', 'bed', 'fut...",41,"[u'be', u'be', 'make', u'be', u'have', u'be', ...",21,"['pleased', 'friendly', 'able', 'second', 'hel...",16


# Clean Data

In [5]:
import ast

"""String Lists to Lists"""
reviews_df['tokens'] = reviews_df['tokens'].map(lambda x: ast.literal_eval(x))
reviews_df['name_entities'] = reviews_df['name_entities'].map(lambda x: ast.literal_eval(x))
reviews_df['no_ne_tokens'] = reviews_df['no_ne_tokens'].map(lambda x: ast.literal_eval(x))
reviews_df['nouns'] = reviews_df['nouns'].map(lambda x: ast.literal_eval(x))
reviews_df['verbs'] = reviews_df['verbs'].map(lambda x: ast.literal_eval(x))
reviews_df['adjectives'] = reviews_df['adjectives'].map(lambda x: ast.literal_eval(x))

# Topic Modelling

In [6]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore
import time
ldam = LdaMulticore

# LDA Model Inputs
num_topics = 50
num_words = 10
passes = 50

# Get Review Tokens
token_texts = list(reviews_df['tokens'].values)

# Create a corpus from a list of texts
common_dictionary = Dictionary(token_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in token_texts]

# Get Start Time
start_time = time.time()

# LDA Model
ldam_model = ldam(common_corpus, num_topics=num_topics, id2word=common_dictionary, passes=passes)
model_end_time = time.time() # Model End Time

# LDA Results
results = ldam_model.print_topics(num_topics=num_topics, num_words=num_words)
result_time = time.time() # Results Time

  utils.PersistentlyDeprecated2018,


In [7]:
# Time For LDA Model
(model_end_time - start_time)/60/60

3.6543464689122307

## Save Model

In [8]:
# Save Model
ldam_model.save('../models/ldam_reviews_50topics_10words_50passes_full.model')

## Results

In [13]:
def display_results(results):
    for index, results in results:
        print(str(index) + ': ' + str(', '.join(results.split('"')[1::2])))
        
def display_results_no_duplicates(results):
    all_lists = []
    for index, result in results:
        all_lists = all_lists + result.split('"')[1::2]
    
    # Get Counts of each word
    counts = pd.Series(all_lists).value_counts()
    no_duplicates = counts[counts == 1].index
    
    for index, result in results:
        print(str(index) + ': ' + str(', '.join([word for word in result.split('"')[1::2] if word in no_duplicates])))

# Topic Results

In [7]:
display_results(results[0:50])

0: ben, tower, greg, debbie, vista, siempre, nikki, venue, coit, cocina
1: und, die, sehr, ist, wir, der, war, man, mit, das
2: n't, place, would, room, night, bit, stay, noise, one, nice
3: day, back, garden, morning, night, sunset, lovely, loved, one, wonderful
4: city, place, quiet, great, perfect, neighborhood, space, spot, studio, stay
5: 10/10, charm, barbara, lady, face, chip, recommand, address, conforme, painted
6: touch, breakfast, coffee, snack, thoughtful, wine, even, morning, left, provided
7: great, gave, local, tip, recommendation, host, city, area, helpful, provided
8: really, enjoyed, stay, thank, much, hospitality, cat, appreciated, thanks, staying
9: san, francisco, fran, visit, perfect, trip, time, visiting, fransisco, explore
10: room, bathroom, private, bedroom, clean, kitchen, living, space, bed, shared
11: house, people, place, n't, meet, friendly, really, get, time, great
12: per, casa, joyce, con, molto, zona, non, muito, una, com
13: detail, michelle, gem, hi

In [10]:
display_results_no_duplicates(results)

0: ben, tower, greg, debbie, vista, siempre, nikki, venue, coit, cocina
1: und, die, sehr, ist, wir, der, war, man, mit, das
2: bit, noise
3: garden, sunset, loved
4: quiet, neighborhood, studio
5: 10/10, charm, barbara, lady, face, chip, recommand, address, conforme
6: touch, breakfast, snack, thoughtful, wine, left
7: gave, local, tip, recommendation, area, helpful
8: enjoyed, thank, hospitality, cat, appreciated, thanks
9: san, francisco, fran, visiting, fransisco, explore
10: private, bedroom, living, shared
11: people, meet, friendly
12: per, joyce, molto, zona, non, muito, com
13: detail, michelle, gem, hidden, attention, rob, jenny, sfo, website
14: towel, shower
15: ..., accessible, easily, paul, bike, uber/lyft, app, readily, helen, whilst
16: light, jeff, wish, longer, decor, art, unique, style, amy, natural
17: late, early, arrived, accommodating, let, flight, last
18: muy, que, para, todo, del, los
19: gate, golden, beach, ocean, bridge, near
20: experience, first, went, wa

## Evaluate Topic Models

In [10]:
from gensim.models.coherencemodel import CoherenceModel

# Compute Perplexity
print('\nPerplexity: ' + str(ldam_model.log_perplexity(common_corpus)))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=ldam_model, texts=token_texts, dictionary=common_dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ' + str(coherence_lda))


Perplexity: -12.050553834102327

Coherence Score: 0.5659598978242838


# 30 Topics 10 Words 1 Passes

In [17]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore
import time
ldam = LdaMulticore

# LDA Model Inputs
num_topics = 30
num_words = 10
passes = 10

# Get Review Tokens
token_texts1 = list(reviews_df['tokens'].values)

# Create a corpus from a list of texts
common_dictionary1 = Dictionary(token_texts1)
common_corpus1 = [common_dictionary1.doc2bow(text) for text in token_texts1]

# Get Start Time
start_time = time.time()

# LDA Model
ldam_model1 = ldam(common_corpus1, num_topics=num_topics, id2word=common_dictionary1, passes=passes)
model_end_time = time.time() # Model End Time

# LDA Results
results1 = ldam_model1.print_topics(num_topics=num_topics, num_words=num_words)
result_time = time.time() # Results Time

In [18]:
# Time For LDA Model
(model_end_time - start_time)/60/60

0.22244854383998447

In [20]:
display_results(results1)

0: home, feel, like, felt, made, house, beautiful, staying, host, welcome
1: view, kitchen, apartment, well, hill, beautiful, equipped, garden, deck, day
2: und, die, sehr, ist, wir, war, der, mit, man, das
3: arrival, day, host, reservation, upon, posting, automated, canceled, anna, responds
4: parking, street, car, easy, find, spot, family, space, house, free
5: stay, host, clean, located, comfortable, wonderful, well, quiet, apartment, lovely
6: san, francisco, question, stay, quick, place, would, fran, respond, time
7: coffee, touch, breakfast, snack, even, provided, wine, water, morning, fridge
8: time, n't, night, day, room, even, check, would, late, early
9: place, could, n't, time, stay, back, better, everything, perfect, stayed
10: people, traveler, friendly, staff, brian, hostel, fun, mike, angela, community
11: public, city, transportation, close, easy, great, area, transport, location, access
12: great, restaurant, mission, place, close, shop, neighborhood, street, walk, qu