In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
import nltk
nltk.download('words')
en_words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to /Users/nesara/nltk_data...
[nltk_data]   Package words is already up-to-date!


#### Extend stopwords

In [18]:
"""
(0,
  '0.016*"walk" + 0.012*"live" + 0.012*"area" + 0.010*"large" + 0.010*"great" '
  '+ 0.009*"flat" + 0.009*"floor" + 0.008*"away" + 0.008*"double" + '
  '0.008*"close"'),
 (1,
  '0.022*"private" + 0.017*"place" + 0.015*"access" + 0.015*"parking" + '
  '0.012*"space" + 0.012*"walk" + 0.010*"downtown" + 0.010*"location" + '
  '0.010*"close" + 0.009*"full"'),
 (2,
  '0.009*"check" + 0.008*"area" + 0.008*"large" + 0.007*"modern" + '
  '0.007*"rental" + 0.007*"build" + 0.007*"table" + 0.006*"dining" + '
  '0.006*"property" + 0.006*"living"'
"""

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(stopwords.words('french'))
stop_words.extend(stopwords.words('german'))
#stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'room', 'apartment', 'bedroom', 'home', 'bed', 'bathroom', 'kitchen',
#                  'NEIGHBORHOOD', 'neighborhood', 'apartment', "pie", "acce", "tre", "appartement", "commerce", "immeuble",
#                 ])

stop_words.extend(["room", "bedroom", "apartment", "bed", "bathroom", "floor", "kitchen", "living"])


#### Load dataset

In [4]:
columns = ['ID', 'Name', 'City', 'Summary', 'Space', 'Description', 'Neighborhood Overview', 'House Rules']
df = pd.read_csv("../../data/airbnb-data-science/airbnb-listings.csv", usecols=columns, sep=';')
df = df[columns]
#df = pd.read_csv("../../data/airbnb-data-science/airbnb-listings-random100nyc.csv")
#df.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
def get_vibe_text(x):
    #print(x)
    text = ''
    #if not pd.isnull(x["Space"]):
    #    text += x["Space"]
    #if not pd.isnull(x["Description"]):
    #    text += x["Description"]
    if not pd.isnull(x["Neighborhood Overview"]):
        text += x["Neighborhood Overview"]
    #if not pd.isnull(x["House Rules"]):
    #    text += x["House Rules"]
    text = " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in en_words or not w.isalpha())
    return text
#df["vibe_text"] = df.apply(get_vibe_text, axis=1)
#df

#### Extract listings description

In [19]:
# Convert to list
#data = df.sample(n=10000, random_state=1)["vibe_text"].values.tolist()
df_nyc = df[df["City"]=="New York"].copy()
df_nyc.reset_index(inplace=True)
del df_nyc["index"]
df_nyc.reset_index(inplace=True)
df_nyc.rename(columns={"index":"Document_No"}, inplace=True)
df_nyc.head(2)

Unnamed: 0,Document_No,ID,Name,City,Summary,Space,Description,Neighborhood Overview,House Rules
0,0,1940118,Apartment In Heart Of LA,New York,Adorable apartment in the heart of Los Angeles...,The restored Art Deco apartment is ground floo...,Adorable apartment in the heart of Los Angeles...,,
1,1,8844261,Large Sunny Brooklyn Room on first floor,New York,My place is close to The Jewish Children's Mus...,"The apartment is located on the first floor, s...",My place is close to The Jewish Children's Mus...,The neighborhood is a safe section of Brooklyn...,"- We live in a friendly, safe neighborhood, pl..."


In [20]:
data = df_nyc["Description"].values.tolist()
len(data)

19528

#### Tokenise words

In [21]:
def sent_to_words(sentences):
    i = 0
    for sentence in sentences:
        #if i%10000 == 0:
        #    print(i)
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
        i+=1

data_words = list(sent_to_words(data))

#### Create bigram and trigram models

In [22]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

#### Remove Stopwords, Make Bigrams and Lemmatize

In [23]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

#### Create the Dictionary and Corpus needed for Topic Modeling

In [24]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [108]:
idx = 0
data[idx]

'Adorable apartment in the heart of Los Angeles, located off the famous Melrose Boulevard and walking distance to the Walk Of Fame, Sunset Boulevard and The Grove.  Comfortably sleeps 4 - a queen size bed plus a full size pull-out. The restored Art Deco apartment is ground floor, with private entry. The building features a large courtyard, perfect for entertaining.'

In [117]:
print(data_words[idx])

['adorable', 'apartment', 'in', 'the', 'heart', 'of', 'los', 'angeles', 'located', 'off', 'the', 'famous', 'melrose', 'boulevard', 'and', 'walking', 'distance', 'to', 'the', 'walk', 'of', 'fame', 'sunset', 'boulevard', 'and', 'the', 'grove', 'comfortably', 'sleeps', 'queen', 'size', 'bed', 'plus', 'full', 'size', 'pull', 'out', 'the', 'restored', 'art', 'deco', 'apartment', 'is', 'ground', 'floor', 'with', 'private', 'entry', 'the', 'building', 'features', 'large', 'courtyard', 'perfect', 'for', 'entertaining']


In [109]:
print(data_words_nostops[idx])

['adorable', 'heart', 'los', 'angeles', 'located', 'famous', 'melrose', 'boulevard', 'walking', 'distance', 'walk', 'fame', 'sunset', 'boulevard', 'grove', 'comfortably', 'sleeps', 'queen', 'size', 'plus', 'full', 'size', 'pull', 'restored', 'art', 'deco', 'ground', 'private', 'entry', 'building', 'features', 'large', 'courtyard', 'perfect', 'entertaining']


In [110]:
print(data_words_bigrams[idx])

['adorable', 'heart', 'los', 'angeles', 'located', 'famous', 'melrose', 'boulevard', 'walking', 'distance', 'walk', 'fame', 'sunset', 'boulevard', 'grove', 'comfortably', 'sleeps', 'queen', 'size', 'plus', 'full', 'size', 'pull', 'restored', 'art_deco', 'ground', 'private', 'entry', 'building', 'features', 'large', 'courtyard', 'perfect', 'entertaining']


In [111]:
print(data_lemmatized[0])

['adorable', 'heart', 'locate', 'walk', 'distance', 'walk', 'comfortably', 'sleep', 'size', 'full', 'size', 'pull', 'restore', 'ground', 'private', 'entry', 'building', 'feature', 'large', 'courtyard', 'perfect', 'entertaining']


In [113]:
print(id2word.doc2idx(texts[idx]))

[0, 10, 12, 19, 4, 19, 2, 18, 17, 8, 17, 15, 16, 9, 14, 6, 1, 7, 11, 3, 13, 5]


[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 2)]


In [124]:
#print(corpus[idx])
print([[(id2word[id], freq) for id, freq in cp] for cp in [corpus[idx]]])

[[('adorable', 1), ('building', 1), ('comfortably', 1), ('courtyard', 1), ('distance', 1), ('entertaining', 1), ('entry', 1), ('feature', 1), ('full', 1), ('ground', 1), ('heart', 1), ('large', 1), ('locate', 1), ('perfect', 1), ('private', 1), ('pull', 1), ('restore', 1), ('size', 2), ('sleep', 1), ('walk', 2)]]


In [115]:
idword_mapping = ""
for i in range(20):
    # print(str(i) + " - " + id2word.get(i))
    idword_mapping += "(" + str(i) + ", " + id2word.get(i) + "), "
idword_mapping

'(0, adorable), (1, building), (2, comfortably), (3, courtyard), (4, distance), (5, entertaining), (6, entry), (7, feature), (8, full), (9, ground), (10, heart), (11, large), (12, locate), (13, perfect), (14, private), (15, pull), (16, restore), (17, size), (18, sleep), (19, walk), '

#### Building the Topic Model

In [25]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=3, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

#### View the topics in LDA model 

In [27]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
#doc_lda = lda_model[corpus]

[(0,
  '0.036*"walk" + 0.031*"restaurant" + 0.027*"block" + 0.025*"place" + '
  '0.025*"train" + 0.024*"away" + 0.024*"subway" + 0.022*"minute" + '
  '0.021*"close" + 0.019*"good"'),
 (1,
  '0.021*"guest" + 0.020*"stay" + 0.015*"space" + 0.015*"share" + '
  '0.014*"available" + 0.014*"home" + 0.012*"use" + 0.012*"private" + '
  '0.012*"access" + 0.011*"need"'),
 (2,
  '0.028*"full" + 0.022*"large" + 0.018*"size" + 0.014*"tv" + 0.014*"private" '
  '+ 0.013*"include" + 0.013*"fully" + 0.011*"space" + 0.011*"building" + '
  '0.011*"high"')]


#### Compute Model Perplexity and Coherence Score

In [28]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.709991536697921

Coherence Score:  0.5183751596913325


#### Visualize the topics-keywords

In [29]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [153]:
### Create gif of topics
import os
import imageio
images = []
#for filename in filenames:
for filename in os.listdir("images/"):
    images.append(imageio.imread("images/"+ filename))
imageio.mimsave('topics.gif', images, duration=2)

In [165]:
### Create gif of search screenshot
images = []
#for filename in filenames:
for filename in os.listdir("vibe_search_images/"):
    images.append(imageio.imread("vibe_search_images/"+ filename))
imageio.mimsave('vibe_search.gif', images, duration=2)

#### Find dominant topic in each document

In [61]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        #print(i)
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,2.0,0.6722,"full, large, size, tv, private, include, fully...",Adorable apartment in the heart of Los Angeles...
1,1,0.0,0.4023,"walk, restaurant, block, place, train, away, s...",My place is close to The Jewish Children's Mus...
2,2,2.0,0.5945,"full, large, size, tv, private, include, fully...",Private bedroom (75 sq ft) with full mattress ...
3,3,2.0,0.5678,"full, large, size, tv, private, include, fully...","Our huge 2 bedroom apartment with 18"" ceilings..."
4,4,1.0,0.4418,"guest, stay, space, share, available, home, us...",Recently renovated East Village Studio Apartme...
5,5,2.0,0.5049,"full, large, size, tv, private, include, fully...",Private 1 bedroom apartment in the heart of Ea...
6,6,1.0,0.5152,"guest, stay, space, share, available, home, us...",** Please message first before attempting to b...
7,7,0.0,0.4395,"walk, restaurant, block, place, train, away, s...","Come stay in this spacious, light-filled East ..."
8,8,0.0,0.4197,"walk, restaurant, block, place, train, away, s...",Perfect 1 to 2 bedroom apartment in the heart ...
9,9,2.0,0.7816,"full, large, size, tv, private, include, fully...",Perfect for families or groups of friends! Lo...


In [136]:
dftt = df_dominant_topic.sort_values("Topic_Perc_Contrib", ascending=False)
dftt[dftt["Dominant_Topic"]==1.0]

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
19076,19076,1.0,0.9228,"guest, stay, space, share, available, home, us...","Hi There, I'm Michelle and I am excited to sh..."
8550,8550,1.0,0.9139,"guest, stay, space, share, available, home, us...",Hi! Me and my roommates just moved out from th...
17252,17252,1.0,0.9139,"guest, stay, space, share, available, home, us...",Hi! Me and my roommates just moved out from th...
19475,19475,1.0,0.9139,"guest, stay, space, share, available, home, us...",Hi! Me and my roommates just moved out from th...
3308,3308,1.0,0.8912,"guest, stay, space, share, available, home, us...",Please DO NOT SUBMIT A BOOKING REQUEST WITHOUT...
15875,15875,1.0,0.8846,"guest, stay, space, share, available, home, us...",IMPORTANT: PLEASE DO NOT send us a reservation...
15874,15874,1.0,0.8838,"guest, stay, space, share, available, home, us...",IMPORTANT: PLEASE DO NOT send us a reservation...
4601,4601,1.0,0.8764,"guest, stay, space, share, available, home, us...",Please DO NOT SUBMIT A BOOKING REQUEST WITHOUT...
11236,11236,1.0,0.8743,"guest, stay, space, share, available, home, us...","Hello, I am looking for someone to share my fl..."
5999,5999,1.0,0.8689,"guest, stay, space, share, available, home, us...","THINGS TO NOTE: This is room 2 of 2, both of w..."


In [135]:
df_nyc[df_nyc["Document_No"]==10241]["Description"].values

array(['Enjoy NYC at its finest in an impeccable NYC artistic loft-- more than 2,000 square feet, immense lighting, 2 bedroom, 2 full baths.  It has all new and modern fixtures/finishes in kitchen and bathrooms.  Washer dryer and tons of storage space!   This is a gorgeous artist loft, rare and unique to NYC.  It is a huge spacious sunny space with a large living space and dining area.  Entire loft is finished with bright hardwood floors and the space has high ceilings with exposed brick and structural beams to add to the loft feel.  We have 2 spacious bedrooms and 2 full baths.  Master bedroom has a king sized bed and guest bedroom has a queen sized bed and The Master has an attached dressing area and attached full bathroom.  Large, south-facing windows allow for tremendous light in this beautiful loft.  Kitchen boasts all new appliances, a dishwasher, and an island.  There is a large dining table in the dining area.  Lliving room with hi-def smart TV 70" in size. Fuly cable attachmen

In [147]:
df_nyc[df_nyc["Document_No"]==16322]["Description"].values

array(["THINGS TO NOTE: This is room 1 of 2, both of which are on the lower floor of a multi-floor apartment space.  I and my co-host will be upstairs if you need anything, so you will practically have the entire space downstairs to yourselves IF YOU RENT BOTH ROOMS.  Otherwise, you will be sharing the space with us or another guest.  If you need anything, simply knock on our door upstairs! - There's a SECURITY CAMERA aimed at the entrance/exit to this unit as part of our business insurance. PermaGO® is focused on delivering a quality travel experience! For the first time ever, we are giving NON-members a chance to experience our exclusive network of Home Bridges®, or SHARED International Guesthouses in New York City. Learn more! THE SPACE You know that feeling you get when you stumble upon something simply amazing and you don't exactly know how to proceed?  By now, you've seen what our PermaGO Home Bridges look like: they're clean, modern, and amazingly comfortable. If you've done som

In [62]:
len(df_dominant_topic)

19528

In [164]:
#dff = dff.reset_index()
#dff.rename(columns={"index": "Document_No"}, inplace=True)
dff = df_nyc.copy()
dff = dff[["Document_No", "ID"]]
dff = dff.set_index("Document_No").join(df_dominant_topic[["Document_No", "Dominant_Topic", "Topic_Perc_Contrib"]].set_index("Document_No")).reset_index()
dff.dropna(inplace=True)
dff.to_csv("df_topic.csv", index=False)
dff

Unnamed: 0,Document_No,ID,Dominant_Topic,Topic_Perc_Contrib
0,0,1940118,2.0,0.6722
1,1,8844261,0.0,0.4023
2,2,16912472,2.0,0.5945
3,3,8411483,2.0,0.5678
4,4,3409820,1.0,0.4418
5,5,9024874,2.0,0.5049
6,6,144855,1.0,0.5152
7,7,7801169,0.0,0.4395
8,8,4530085,0.0,0.4197
9,9,860837,2.0,0.7816


#### Topic distribution across documents

In [65]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,2.0,"full, large, size, tv, private, include, fully...",9785.0,0.5011
1,0.0,"walk, restaurant, block, place, train, away, s...",3853.0,0.1973
2,2.0,"full, large, size, tv, private, include, fully...",5890.0,0.3016
3,2.0,"full, large, size, tv, private, include, fully...",,
4,1.0,"guest, stay, space, share, available, home, us...",,
5,2.0,"full, large, size, tv, private, include, fully...",,
6,1.0,"guest, stay, space, share, available, home, us...",,
7,0.0,"walk, restaurant, block, place, train, away, s...",,
8,0.0,"walk, restaurant, block, place, train, away, s...",,
9,2.0,"full, large, size, tv, private, include, fully...",,


### Find the most representative document for each topic

In [66]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head()

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0.0,0.9399,"walk, restaurant, block, place, train, away, s...",Located in the heart of the lower east side Ma...
1,1.0,0.9228,"guest, stay, space, share, available, home, us...","Hi There, I'm Michelle and I am excited to sh..."
2,2.0,0.952,"full, large, size, tv, private, include, fully...","Beautiful, classic pre-war Manhattan apartment..."


## A/B Testing