In [1]:
# Run in python console
import nltk; nltk.download('stopwords')
import spacy

import re
import os
import numpy as np
import pandas as pd
import zipfile

from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy
import en_core_web_sm

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from langdetect import detect

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)


import shutil
import glob
from tika import parser

from yellowbrick.features import rank2d
from yellowbrick.classifier import confusion_matrix
from yellowbrick.classifier import classification_report
from yellowbrick.regressor import prediction_error, ResidualsPlot

from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielacollaguazo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Importing data into DataFrame

In [2]:
# Directory holding the Airbnb data set; the CSV files written and re-read
# later in this notebook are joined onto this same path.
path = '../data/new-york-city-airbnb-open-data/'
listings_csv = os.path.join(path,'listings.csv')

# NOTE(review): low_memory=False is presumably here to avoid pandas'
# mixed-dtype guessing on this wide file — confirm.
listings_df =  pd.read_csv(listings_csv,low_memory=False)

In [None]:
# for col in listings_df.columns:
#     print(col)

In [None]:
# Percentage of listings whose `summary` field is missing.
listings_df['summary'].isna().sum() / len(listings_df) * 100

## Exploring DataFrames

In [None]:
# Looking for text in the data
listings_df.dtypes

In [3]:
def predict_lang(x):
    """Guess the language of a piece of text with ``langdetect.detect``.

    Parameters
    ----------
    x : str
        Text to classify. Any non-string value (``None``, ``NaN``, numbers)
        is treated as "no text".

    Returns
    -------
    str
        Whatever ``detect`` returns for the text, or ``''`` when the input
        is not a string, is 50 characters or shorter, or detection raises.
    """
    # Missing values arrive as None / float NaN when this is applied to an
    # uncleaned column; len() would raise TypeError on those.
    if not isinstance(x, str):
        return ''
    # Texts of 50 characters or fewer are not classified at all.
    if len(x) <= 50:
        return ''
    try:
        return detect(x)
    except Exception:
        # Deliberate best-effort: a failed detection means "unknown language".
        return ''

In [4]:
# Build one text field per listing. The space keeps the last word of the name
# from fusing with the first word of the description when the text is
# tokenized later. If either part is NaN the sum is NaN, and the row is
# dropped on the next line.
listings_df['content'] = listings_df['name'] + ' ' + listings_df['description']
listings_df.dropna(subset=['content'], how='any', axis=0, inplace=True)
# Language code per listing ('' when predict_lang could not decide).
listings_df['content_lang'] = listings_df['content'].apply(predict_lang)

In [5]:
# Keep only the listings whose detected language code is 'en'.
is_english = listings_df['content_lang'] == 'en'
listings_df = listings_df[is_english]

In [6]:
listings_df.shape

(49022, 108)

In [8]:
listings_df.to_csv(os.path.join(path,'content_en.csv'))

In [9]:
def generate_list_content(df):
    """Return the ``content`` column of ``df`` as a list of cleaned strings.

    Each text is processed in this order: tokens containing ``@`` are removed
    (together with one trailing whitespace character), every run of
    whitespace, newlines included, is collapsed to a single space, and all
    single quotes are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``content`` column holding strings.

    Returns
    -------
    list of str
        One cleaned string per row, in row order.
    """
    # Raw strings: "\S" and "\s" inside a plain literal are invalid escape
    # sequences, which newer Python versions warn about.
    email_re = re.compile(r'\S*@\S*\s?')
    whitespace_re = re.compile(r'\s+')

    cleaned = []
    for sent in df['content'].values.tolist():
        sent = email_re.sub('', sent)
        sent = whitespace_re.sub(' ', sent)
        cleaned.append(sent.replace("'", ""))
    return cleaned

In [10]:
lst_texts = generate_list_content(listings_df)
# listings_df['content']

In [11]:
len(lst_texts)

49022

## Tokenize words and clean up text

In [12]:
# Tokenize each document.
def content_to_words(lst_texts):
    """Lazily yield one token list per item of ``lst_texts``.

    Every item is coerced with ``str`` and handed to gensim's
    ``simple_preprocess`` with ``deacc=True``.
    """
    for raw_text in lst_texts:
        as_string = str(raw_text)
        yield gensim.utils.simple_preprocess(as_string, deacc=True)
        
# tokenized_content is a list in which each element is one tokenized document
# (the generator is materialised because the corpus is iterated several times below).
tokenized_content = list(content_to_words(lst_texts))

## Create bigram and trigrams

In [13]:
# Fit the phrase models: the bigram model on the tokenized documents, the
# trigram model on the bigram-transformed documents.
bigram = gensim.models.Phrases(tokenized_content, min_count=10, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[tokenized_content], threshold=100)  


# Wrap the fitted models in Phraser objects; these are what make_bigrams /
# make_trigrams index into (a faster way to get a sentence clubbed as a trigram/bigram).
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

## Remove Stopwords, Make Bigrams and Lemmatize

In [14]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(tokenized_content):
    """Drop stop words from every document.

    Each document is stringified and passed through ``simple_preprocess``
    again, exactly as before; tokens present in the module-level
    ``stop_words`` are discarded.

    Returns a list with one filtered token list per input document.
    """
    # Build the set once: membership in a set is O(1), whereas the stop-word
    # list was being scanned for every token of every document.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in tokenized_content
    ]

def make_bigrams(tokenized_content):
    """Run every document through the fitted ``bigram_mod`` phraser, in order."""
    phrased_docs = []
    for tokens in tokenized_content:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def make_trigrams(tokenized_content):
    """Run every document through ``bigram_mod`` and then ``trigram_mod``, in order."""
    phrased_docs = []
    for tokens in tokenized_content:
        with_bigrams = bigram_mod[tokens]
        phrased_docs.append(trigram_mod[with_bigrams])
    return phrased_docs

def lemmatization(tokenized_content, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize documents, keeping only tokens whose POS tag is allowed.

    POS tag reference: https://spacy.io/api/annotation

    Parameters
    ----------
    tokenized_content : iterable of list of str
        Tokenized documents. Each one is joined with single spaces before it
        is passed to the module-level spaCy pipeline ``nlp``.
    allowed_postags : container of str, optional
        Values of ``token.pos_`` to keep. The default is an immutable tuple
        so it cannot be modified between calls.

    Returns
    -------
    list of list of str
        For every input document, the ``lemma_`` of each kept token.
    """
    joined_docs = (" ".join(sent) for sent in tokenized_content)
    # nlp.pipe streams the texts through the pipeline and yields the Doc
    # objects in input order, avoiding one full nlp() call per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]
# spaCy pipeline used by lemmatization(); the 'parser' and 'ner' components
# are disabled at load time.
nlp = en_core_web_sm.load(disable=['parser', 'ner'])

In [15]:
# Step 1: remove stop words from the tokenized documents.
tokenized_content_nostops = remove_stopwords(tokenized_content)

# Step 2: merge frequent word pairs into bigram tokens (stop words are already gone).
tokenized_content_bigrams = make_bigrams(tokenized_content_nostops)

# Step 3: lemmatize, keeping only nouns, adjectives, verbs and adverbs.
# This is the text that feeds the dictionary, the corpus and the coherence scores.
tokenized_content_lemmatized = lemmatization(tokenized_content_bigrams, 
                                             allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

## Create the Dictionary and Corpus needed for Topic Modeling

In [16]:
# Dictionary: the mapping between word ids and words, built from the
# lemmatized documents. It fixes the vocabulary and is also passed to the
# LDA models and the coherence computation.
id2word = corpora.Dictionary(tokenized_content_lemmatized)
# print(len(id2word)) # corpus has 14118 unique tokens

# Term-document frequency: each document (a list of words) becomes a
# bag-of-words, i.e. a list of (token_id, token_count) 2-tuples, using the
# ids assigned by id2word.
corpus = [id2word.doc2bow(text) for text in tokenized_content_lemmatized]

# View
# print(corpus[:1])

## Build list of topic models

In [None]:
def build_topics(num_topics):
    """Train one LDA model per requested number of topics.

    Every model is fitted on the module-level ``corpus`` and ``id2word`` with
    the same settings (``random_state=100``, ``passes=10``, ``alpha='auto'``,
    ...); only ``num_topics`` differs between them.

    Parameters
    ----------
    num_topics : iterable of int
        Topic counts to train, one model each.

    Returns
    -------
    list
        The trained models, in the same order as ``num_topics``.
    """
    list_models = []
    for n in num_topics:
        # The original first bound a 'lda_model_<n>' string to this name and
        # then overwrote it; that dead assignment is gone.
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word,
                                                    num_topics=n, random_state=100,
                                                    update_every=1, chunksize=100,
                                                    passes=10, alpha='auto',
                                                    per_word_topics=True)
        list_models.append(lda_model)
    return list_models

# Candidate topic counts: every integer from 2 to 20 inclusive.
num_topics = list(range(2, 21))
models = build_topics(num_topics)

## Compute Model Perplexity and Coherence Score for all models

In [None]:
def calc_perplexity_coherence(models):
    """Score each model against the module-level corpus and lemmatized texts.

    Returns a ``(list_perplexity, list_coherence)`` tuple. Both lists are
    parallel to ``models``: the first holds ``model.log_perplexity(corpus)``,
    the second the ``c_v`` coherence from ``CoherenceModel``.

    NOTE(review): gensim documents ``log_perplexity`` as a per-word
    likelihood bound rather than perplexity itself — confirm how these
    values should be compared across models.
    """
    perplexities, coherences = [], []
    for lda in models:
        perplexities.append(lda.log_perplexity(corpus))
        scorer = CoherenceModel(
            model=lda,
            texts=tokenized_content_lemmatized,
            dictionary=id2word,
            coherence='c_v',
        )
        coherences.append(scorer.get_coherence())
    return perplexities, coherences

x = calc_perplexity_coherence(models)      

In [None]:
# One row per trained model: x[0] holds the perplexity values, x[1] the coherence values.
df_metrics = pd.DataFrame({'Perplexity': x[0], 'Coherence': x[1]})
df_metrics['Number of topics'] = num_topics
df_metrics

## Graphic of number of Topics and Perplexity

In [None]:
# Perplexity needs to be as low as possible.
# NOTE(review): the 'Perplexity' column holds the raw output of
# model.log_perplexity(corpus), which gensim documents as a per-word
# likelihood bound — confirm which direction is "better" before reading this curve.
plt.plot( 'Number of topics', 'Perplexity', data=df_metrics, color='skyblue')
plt.xlabel("Number of Topics")
plt.ylabel("Perplexity")
plt.show()

## Graphic of number of Topics and Coherence

In [None]:
# Coherence (c_v) should be as high as possible.
plt.plot( 'Number of topics', 'Coherence', data=df_metrics, color='orange')
plt.xlabel("Number of Topics")
plt.ylabel("Coherence")
plt.show()

## Most salient topic per file using results of Model of choice

In [None]:
# def format_topics_sentences(ldamodel=None, corpus=corpus, texts=lst_texts):
#     # Init output
#     sent_topics_df = pd.DataFrame()

#     # Get main topic in each document
#     for i, row_list in enumerate(ldamodel[corpus]):
#         row = row_list[0] if ldamodel.per_word_topics else row_list            
#         print(row)
#         row = sorted(row, key=lambda x: (x[1]), reverse=True)
#         # Get the Dominant topic, Perc Contribution and Keywords for each document
#         for j, (topic_num, prop_topic) in enumerate(row):
#             if j == 0:  # => dominant topic
#                 wp = ldamodel.show_topic(topic_num)
#                 topic_keywords = ", ".join([word for word, prop in wp])
#                 sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num) + 1, 
#                                                                   round(prop_topic,4), 
#                                                                   topic_keywords]), ignore_index=True)
#             else:
#                 break
#     sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

#     # Add original text to the end of the output
# #     contents = pd.Series(texts)
#     sent_topics_df = pd.concat([listings_df,sent_topics_df], axis=1, sort=False)
#     return(sent_topics_df)

## DataFrame with scores of all topics

In [None]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=lst_texts):
    """Tabulate, per document, the topics assigned by ``ldamodel``, best first.

    Parameters
    ----------
    ldamodel : gensim LDA model
        Indexed as ``ldamodel[corpus]``; its ``per_word_topics`` flag decides
        whether the topic list is the first element of each result row.
    corpus : list
        Bag-of-words corpus, in the same row order as ``listings_df``.
    texts : list
        Unused; kept so existing calls keep working.

    Returns
    -------
    (pandas.DataFrame, int)
        A frame with the listing ``id`` followed by columns
        ``Dominant_Topic_1``, ``Dominant_Topic_2``, ... each holding a
        ``(topic_id, score)`` tuple sorted by descending score, and the
        number of columns in that frame.
    """
    ranked_rows = []
    for row_list in ldamodel[corpus]:
        row = row_list[0] if ldamodel.per_word_topics else row_list
        ranked_rows.append(sorted(row, key=lambda pair: pair[1], reverse=True))

    topics = pd.DataFrame(ranked_rows)
    # Name the columns from the actual width so models with a topic count
    # other than three also work.
    topics.columns = ['Dominant_Topic_{}'.format(rank + 1)
                      for rank in range(topics.shape[1])]

    # Align ids to the 0..n-1 positions of the topic rows without touching the
    # global listings_df. The previous in-place reset_index() added an extra
    # column to listings_df on every call.
    ids = listings_df['id'].reset_index(drop=True)
    sent_topics_df = pd.concat([ids, topics], axis=1, sort=False)

    df_n_cols = sent_topics_df.shape[1]

    return sent_topics_df, df_n_cols

In [None]:
# models[1] is the model trained with num_topics[1] == 3 topics.
sent_topics_df, df_n_cols = format_topics_sentences(ldamodel=models[1], corpus=corpus, texts=lst_texts)

In [None]:
sent_topics_df.head()

In [None]:
# Column 0 is the listing id; columns 1..df_n_cols-1 each hold (topic, score)
# tuples. Split each of them into a topic column and a score column.
for i in range(1, df_n_cols):
    rank = str(i)
    top_col = 'topic' + rank
    score_col = 'score_dom_topic_' + rank

    expanded = pd.DataFrame(sent_topics_df.iloc[:, i].tolist(), index=sent_topics_df.index)
    sent_topics_df[top_col] = expanded[0]
    sent_topics_df[score_col] = expanded[1]

In [None]:
sent_topics_df.head()

In [None]:
cols_2_drop = ['Dominant_Topic_1', 'Dominant_Topic_2','Dominant_Topic_3']
sent_topics_df.drop(columns=cols_2_drop, inplace=True)

In [None]:
sent_topics_df.to_csv(os.path.join(path,'topics_with_scores.csv'))

In [None]:
sent_topics_df = pd.read_csv(os.path.join(path,'topics_with_scores.csv'),index_col=0)

In [None]:
sent_topics_df.head()

In [None]:
# sent_main_topic_df = sent_topics_df[['id','topic1','score_dom_topic_1']]
# Descriptive names for the ranked topic / score columns.
topic_column_names = {
    "topic1": "winner_topic",
    "score_dom_topic_1": "winner_topic_score",
    "topic2": "second_place_topic",
    "score_dom_topic_2": "second_topic_score",
    "topic3": "third_place_topic",
    "score_dom_topic_3": "third_topic_score",
}
sent_topics_df.rename(columns=topic_column_names, inplace=True)

# sent_main_topic_df.to_csv(os.path.join(path, 'winner_topic.csv'))

In [None]:
# Drop only a stray unnamed index column, if one was read back from the CSV.
# The file is read with index_col=0, so the first remaining column is `id`;
# slicing off column 0 by position removed the key that the merge on 'id'
# below needs.
sent_topics_df = sent_topics_df.drop(columns=['Unnamed: 0'], errors='ignore')

### Visualize the topics

In [None]:
# Interactive pyLDAvis view of models[1], the model trained with
# num_topics[1] == 3 topics.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(models[1], corpus, id2word)
vis

## Concatenating the Dominant Topics with the rest of the Features for Regression

In [None]:
# Read from the same data directory (`path`) as every other file in this
# notebook instead of repeating the literal location.
model_cols_df = pd.read_csv(os.path.join(path, 'model_columns_listings.csv'), index_col=0)

In [None]:
df_model = pd.merge(left=model_cols_df, right=sent_topics_df, how='left', on='id')

In [None]:
df_model = df_model.dropna()

In [None]:
# Start from every column of the modelling frame; the target and the id are
# removed from this list in the following cells.
features = list(df_model.columns)

In [None]:
features.remove('price')

In [None]:
features.remove('id')

In [None]:
df_model.shape

In [None]:

df = df_model.groupby('winner_topic')['winner_topic'].size()
print(df)
df.plot(kind='bar')

In [None]:
df_model.shape

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

df_3 = df_model.loc[:,['review_scores_rating','winner_topic', 'winner_topic_score', 'second_place_topic',
                       'second_topic_score', 'third_place_topic', 'third_topic_score','price']]

sns.pairplot(df_3,height=2)

## Split train - test sets

In [None]:
# Select predictors by name, not by position: everything except the listing
# `id` and the `price` target. This keeps X's columns identical to the
# `features` list (all columns minus 'price' and 'id') wherever `id` sits in
# the frame.
X = df_model.drop(columns=['id', 'price'])
y = df_model['price']

In [None]:
from sklearn.model_selection import train_test_split
# Fixed seed so the 80/20 split, and every score computed from it, is the
# same on each run (100 matches the random_state used for the LDA models).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [None]:
X_train.shape, len(features)

In [None]:
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import BayesianRidge, LinearRegression
from yellowbrick.features import FeatureImportances

# Candidate regressors keyed by display name; only the random forest is
# currently enabled, the others are kept commented out for reference.
regressors = {
#     "support vector machine": SVR(),
#     "multilayer perceptron": MLPRegressor(),
#     "nearest neighbors": KNeighborsRegressor(),
#     "bayesian ridge": BayesianRidge(),
#     "linear regression": LinearRegression(),
    "random forest": RandomForestRegressor(),
}

# For every enabled regressor: draw a residuals plot (fit on the training
# split, score on the test split), then a feature-importance plot.
for _, regressor in regressors.items():
    visualizer = ResidualsPlot(regressor)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.show()
    
    # The same estimator instance is passed again here; `features` supplies
    # the labels, so it must list X's columns in the same order.
    viz = FeatureImportances(regressor, labels=features, size=(1080, 1080))

    viz.fit(X_train, y_train)
    # Note: the FeatureImportances visualizer is a model visualizer,
    # not a feature visualizer, so it doesn't have a transform method!
    viz.show()

In [None]:
# top_features =['room_type_Entire home/apt',
# 'bathrooms',
# 'neighbourhood_group_cleansed_Manhattan',
# 'bedrooms',
# 'host_since',
# 'reviews_per_month',
# 'accommodates',
# 'third_topic_score',
# 'last_review_days_ago',
# 'amenities_count',
# 'second_topic+score',
# 'number_of_reviews',
# 'review_scores_rating',
# 'guest_included',
# 'number_of_reviews_ltm',
# 'review_scores_location',
# 'beds',
# 'apt_yes_no',
# 'review_scores_value',
# 'review_scores_cleanliness']

top_features =['bathrooms','bedrooms','reviews_per_month','price','third_topic_score']

df_4 = df_model.loc[:,top_features]

In [None]:
sns.pairplot(df_4,height=3)