# This notebook runs four exploratory text analyses on the Airbnb reviews

I tie the reviews data to the listings data to preserve some information about location, then run:
  * __N-grams__ 
  * __TF-IDF vectorization__ to see what words get used most frequently by group
  * __KMeans clustering__ based on the TF-IDF vectorization, to see if any clusters emerge in the 480,000 reviews
  * __Latent Dirichlet allocation__ to model any common topics that may be in the reviews

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
from sklearn.externals import joblib
import seaborn
%matplotlib inline
pd.set_option('display.max_columns', 500)

# Read in the data

In [None]:
reviews = pd.read_csv('data/reviews.csv')

In [None]:
# Parse the whole column in one vectorized call instead of invoking
# pd.to_datetime once per row through apply().
reviews.date = pd.to_datetime(reviews.date)

In [None]:
print('{} rows'.format(len(reviews)))
reviews.head()

The review dates go back to 2008, but a look at InsideAirbnb shows those reviews are skewed overwhelmingly towards the last two years:

http://insideairbnb.com/new-york-city/#

In [None]:
reviews.date.describe()

In [None]:
reviews.dropna(inplace=True)
len(reviews)

Let's rename the columns to match `listings.csv`, then tie the review data to info about each listing

In [None]:
reviews.columns = ['id', 'review_id', 'date', 'reviewer_id', 'reviewer_name', 'comments']

In [None]:
listings = pd.read_csv('data/listing_info.csv')

In [None]:
r = pd.merge(reviews, listings, on='id', how = 'left')

In [None]:
r.dropna(inplace=True)

In [None]:
r = r.rename(columns={'id': 'listing_id', 'date':'review_date', 'name':'listing_name'})

In [None]:
# Function-call form of print works on both Python 2 and Python 3
print(len(r))

In [None]:
# Peek at the first remaining review by position: the row labelled 0 is not
# guaranteed to have survived the dropna() above, and a single positional
# lookup avoids the chained r.loc[...][...] indexing.
r.comments.iloc[0]

Take a subset of the dataset that filters out the reviews with automated postings. 

In [None]:
# Phrases that identify automated postings. They are joined with a bare '|'
# so that each regex alternative is the phrase itself; the previous pattern
# had spaces around the '|', which made every alternative also require a
# leading and/or trailing space.
auto_phrases = [
    'The host canceled this reservation',
    'days before arrival',
    'This is an automated posting',
]
auto = '|'.join(auto_phrases)

# Keep only the reviews that match none of the phrases. na=True treats a
# missing comment as a match, so such rows are filtered out as before.
r = r[~r.comments.str.contains(auto, na=True)]

In [None]:
len(r)

Let's look at the host with the most reviews during this period. 'Joe' in Harlem had 3 properties with 892 reviews over this period. 

In [None]:
# mode() returns a Series of the most frequent host_id value(s);
# its first entry is the host we look at.
bighost = r.host_id.mode()
top_host_id = bighost[0]

# All reviews that belong to that host's listings
joe = r.loc[r.host_id == top_host_id]
joe_listing_ids = joe.listing_id.unique()
print("3 properties: {}".format(joe_listing_ids))

# N-grams

In [None]:
import nltk
from nltk.util import ngrams
from textblob import TextBlob

from collections import defaultdict
from operator import itemgetter

from nltk.corpus import stopwords


In [None]:
# Start from NLTK's English stop-word list
stop = stopwords.words('english')
# Also treat these punctuation marks as stop tokens
stop += ['.', ',', '(', ')', "'", '"']
# Take 'not' off the stop list so it is kept in the text
# (presumably so that negations stay visible in the n-grams -- TODO confirm)
stop.remove('not')

In [None]:
def get_ngrams(col, n=2, verbose=False):
    """Count the most frequent word n-grams in a collection of review texts.

    Each review is split into words with TextBlob, words found in the
    module-level ``stop`` list are removed, and every run of ``n``
    consecutive remaining words is counted across all reviews.

    Parameters
    ----------
    col : iterable of str
        The review texts. Byte strings are decoded as UTF-8 first.
    n : int, default 2
        Number of words per n-gram.
    verbose : bool, default False
        When falsy the result is returned; when truthy it is printed.

    Returns
    -------
    list of (tuple, int) or None
        The 40 most frequent n-grams with their counts, most frequent
        first, when ``verbose`` is falsy; ``None`` when it is truthy.
    """
    from collections import Counter

    # Compare lower-cased words against the stop list so that capitalised
    # stop words ("The", "And", ...) are filtered as well; a set makes each
    # membership test constant time.
    stop_set = set(stop)
    counter = Counter()

    for rev in col:
        # Only raw byte strings need decoding; text that is already decoded
        # must not be passed through .decode() again.
        if isinstance(rev, bytes):
            rev = rev.decode('utf_8')
        words = [w for w in TextBlob(rev).words if w.lower() not in stop_set]
        counter.update(ngrams(words, n))

    top = counter.most_common(40)
    if not verbose:
        return top

    for gram, count in top:
        phrase = " ".join(gram)
        # Parenthesised single argument: valid on Python 2 and Python 3
        print('%20s %i' % (phrase, count))

Look at just the 'bad' reviews, i.e. the ones that contain certain negative words.

In [None]:
# Match case-insensitively so that capitalised occurrences such as a
# sentence-initial "Unfortunately" are picked up too.
bad = r[r["comments"].str.contains("rude|dirty|unfortunately", case=False)]

In [None]:
get_ngrams(bad.comments, n=3, verbose=True)

# TF-IDF

Let's include some functions to tokenize and stem our words.

In [None]:

def tokenize(text):
    """Split ``text`` into lower-cased word tokens.

    The text is first split into sentences and then into words, so that
    punctuation comes out as its own token. Every token is lower-cased, and
    lower-cased tokens that contain no ASCII letter (e.g. numeric tokens,
    raw punctuation) are discarded.

    Returns the remaining tokens as a list, in their original order.
    """
    contains_letter = re.compile('[a-zA-Z]').search
    lowered = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [tok for tok in lowered if contains_letter(tok)]

# load nltk's SnowballStemmer (English) as the variable 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stem of every word token.

    The text is split into sentences and then into words, so that
    punctuation becomes its own token. Tokens that contain no ASCII letter
    (e.g. numeric tokens, raw punctuation) are dropped, and each remaining
    token is passed through the module-level ``stemmer``.

    Returns the stems as a list, in the order the tokens appeared.
    """
    words = []
    for sentence in nltk.sent_tokenize(text):
        words.extend(nltk.word_tokenize(sentence))

    # keep only tokens with at least one letter, then stem them
    return [stemmer.stem(word) for word in words if re.search('[a-zA-Z]', word)]

In [None]:
# A custom function to tokenize the text using spaCy
# and convert to lemmas
import string
from spacy.en import English
parser = English()

# One entry per ASCII punctuation character, followed by a few
# multi-character symbols; used to strip such tokens from token lists.
SYMBOLS = list(string.punctuation) + ["-----", "---", "...", "“", "”", "'ve"]

def spacy_tokenize(sample):
    """Tokenize ``sample`` with spaCy and return its cleaned-up lemmas.

    Parameters
    ----------
    sample : str
        The text to tokenize; it is passed to the module-level ``parser``.

    Returns
    -------
    list of str
        One lower-cased, whitespace-stripped lemma per token, in order,
        excluding entries of the module-level ``stop`` and ``SYMBOLS``
        lists and excluding empty strings.
    """
    # lemmatize; tokens whose lemma is the "-PRON-" placeholder keep their
    # lower-cased surface form instead
    lemmas = [
        tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_
        for tok in parser(sample)
    ]

    # Sets give constant-time membership tests for the two stop lists
    stop_set = set(stop)
    symbol_set = set(SYMBOLS)

    # Drop stop words and symbols. Also drop empty strings: a token made up
    # only of whitespace becomes "" once its lemma is stripped, and would
    # otherwise be returned as a token.
    return [
        tok for tok in lemmas
        if tok and tok not in stop_set and tok not in symbol_set
    ]

In [None]:
# totalvocab_stemmed is only created further down (together with
# totalvocab_tokenized), so asking for its length here raised a NameError on
# a fresh kernel. Its size is inspected right after it has been built.

In [None]:
# Collect the spaCy lemmas of every review into one flat list
spacy_tokens = []
for comment in r.comments:
    # Decode only raw byte strings; text that is already decoded is used as is
    if isinstance(comment, bytes):
        comment = comment.decode('utf_8')
    spacy_tokens.extend(spacy_tokenize(comment))

In [None]:
# Build two flat lists over all reviews: the stemmed tokens and the plain
# (lower-cased) tokens.
totalvocab_stemmed = []
totalvocab_tokenized = []
for comment in r.comments:
    # Decode only raw byte strings; text that is already decoded is used as is
    if isinstance(comment, bytes):
        comment = comment.decode('utf_8')
    totalvocab_stemmed.extend(tokenize_and_stem(comment))
    totalvocab_tokenized.extend(tokenize(comment))

In [None]:
joblib.dump(totalvocab_stemmed,  'stemmed_vocab.pkl')
joblib.dump(totalvocab_tokenized, 'tokenized_vocab.pkl')

Using these two lists, I create a pandas DataFrame with the stemmed vocabulary as the index and the tokenized words as the column. The benefit of this is it provides an efficient way to look up a stem and return a full token. The downside here is that stems to tokens are one to many: the stem 'run' could be associated with 'ran', 'runs', 'running', etc. For my purposes this is fine--I'm perfectly happy returning the first token associated with the stem I need to look up.

In [None]:
len(totalvocab_stemmed)

In [None]:
# Padding that brings the stemmed list up to the length of the tokenized
# list, so the two can be combined in one DataFrame. The count is derived
# from the lists instead of a fixed 4, which keeps it correct if the data
# changes and yields no padding once the lengths already agree.
# NOTE(review): padding at the tail does not realign stems with their tokens
# after the first position where the two lists diverge -- confirm that is
# acceptable for the stem -> word lookup.
dummy = ['NA'] * max(0, len(totalvocab_tokenized) - len(totalvocab_stemmed))

In [None]:
# Append every padding entry to the end of the stemmed vocabulary
totalvocab_stemmed.extend(dummy)

In [None]:
len(totalvocab_tokenized)

In [None]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [None]:
vocab_frame2 = pd.DataFrame(({'words': totalvocab_tokenized, 'stems': totalvocab_stemmed}))

In [None]:
vocab_frame.to_csv('vocab.csv', encoding= 'utf-8')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=.99, min_df=0.005, stop_words=stop, 
                                 use_idf=True, ngram_range=(1,3))

In [None]:
%time tfidf_matrix = tfidf_vectorizer.fit_transform(list(r.comments))

print(tfidf_matrix.shape)

`review_terms` is just the list of features used in the TF-IDF matrix, i.e. its vocabulary.


In [None]:
review_terms = tfidf_vectorizer.get_feature_names()

In [None]:
len(review_terms)

In [None]:
tfidf_matrix.shape

# K Means clustering

Now, let's cluster the reviews into k=5 types. Just to see what happens.

In [None]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)



This is a big file, so dump and load this as needed. Then, save the clusters to a csv. 

In [None]:
# joblib.dump(km,  'doc_cluster.pkl')
# km = joblib.load('doc_cluster.pkl')


In [None]:
clusters = km.labels_.tolist()

In [None]:
r['clusters'] = clusters

In [None]:
r.to_csv('with_clusters.csv')

Here's a way to turn all the text in a corpus into tokens

Let's do that for all the words in all the reviews in our dataframe

In [None]:
from __future__ import print_function

def get_top_terms(model=km, review_terms=review_terms, num_clusters=num_clusters, n_terms=20):
    """Print the highest-weighted terms of each cluster centre.

    Parameters
    ----------
    model : fitted clustering model
        Must expose ``cluster_centers_`` with one row per cluster and one
        column per entry of ``review_terms``.
    review_terms : sequence of str
        Feature names, indexed by column of ``model.cluster_centers_``.
    num_clusters : int
        Number of clusters to print, starting from cluster 0.
    n_terms : int, default 20
        Number of terms to print per cluster.

    Returns
    -------
    None
        The terms are written to standard output.
    """
    print("Top terms per cluster:")
    print()

    # For every cluster, the feature indices ordered from the largest to the
    # smallest centroid weight. The centres are read from the model that was
    # passed in, so a model other than the global km is honoured.
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]

    for i in range(num_clusters):
        print("Cluster %d words:" % i, end='')

        for ind in order_centroids[i, :n_terms]:
            print(' %s' % review_terms[ind], end=',')
        print() #add whitespace
        print() #add whitespace

    print()
    print()

In [None]:
r.clusters.hist(bins =5, alpha=.7)

It looks like the word 'not' is only in cluster 0. So maybe the negative reviews are in there. Let's split those reviews into another subset and see how they look different

In [None]:
# Take an independent copy of the cluster-0 rows: a column is assigned to r0
# further down, and assigning into a bare slice of r triggers pandas'
# SettingWithCopyWarning.
r0 = r[r.clusters == 0].copy()

In [None]:
%time tfidf_matrix0 = tfidf_vectorizer.fit_transform(list(r0.comments))

print(tfidf_matrix0.shape)

In [None]:
review_terms0 = tfidf_vectorizer.get_feature_names()

In [None]:
len(review_terms0)

In [None]:
%time km.fit(tfidf_matrix0)

In [None]:
get_top_terms(model=km, review_terms=review_terms0, num_clusters=num_clusters)

In [None]:
subclusters = km.labels_.tolist()

In [None]:
r0['subcluster'] = subclusters

In [None]:
r0['subcluster'].hist(bins =5, alpha =.7)

# Latent Dirichlet Allocation (LDA)

In [None]:
import lda
import gensim
from gensim.models.ldamodel import LdaModel
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer


Let's re-vectorize all of the text into integer counts of the vocabulary, so I can perform an LDA and cluster by topic

In [None]:
vectorizer = CountVectorizer(ngram_range=(1,2), max_df=0.9, min_df=0.05, stop_words=stop)

In [None]:
vectorized_vocab = vectorizer.fit_transform(list(r.comments))

In [None]:
# NOTE(review): the vectorizer was already fitted on these same comments by
# the fit_transform() call in the previous cell; this repeats that fit.
vectorizer.fit(list(r.comments))

In [None]:
all_features_names = vectorizer.get_feature_names()

In [None]:
len(all_features_names)

In [None]:
vectorized_vocab.shape

Try a few LDA packages

In [None]:
#Using LDA package
# lda_model = lda.LDA(n_topics=20, n_iter=500, random_state=1)

#Using Gensim. Need a new format.
# lda_model = LdaModel(vectorized_vocab)

#Using sklearn (seeded so that the fitted topics are repeatable between runs):
lda_model = LatentDirichletAllocation(n_topics = 20, random_state=1)


In [None]:
lda_model.fit_transform(vectorized_vocab)

In [None]:
lda_model.get_params()

In [None]:
lda_model.score(vectorized_vocab)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted features of every topic in ``model``.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterating ``components_`` yields one weight array per topic.
    feature_names : sequence of str
        Feature names, indexed by position in each weight array.
    n_top_words : int
        How many features to list per topic.

    For each topic a "Topic #<index>:" header (preceded by a blank line) is
    printed, followed by one line with the feature names joined by ", ",
    ordered from the largest weight to the smallest.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # argsort is ascending: reverse it, then keep the leading entries
        strongest = weights.argsort()[::-1][:n_top_words]
        top_words = [feature_names[i] for i in strongest]
        print(", ".join(top_words))


In [None]:
print_top_words(lda_model, all_features_names, 20)

In [None]:
print_top_words(lda_model, all_features_names, 20)