# Topic Modeling Full Overviews

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set()
import glob as glob

In [3]:
def load_sample_data(country, city):
    directory = 'data/' + country + '/' + city + '/'
    csv_files = [file_name for file_name in glob.glob(directory + '*') if file_name[-4:] == '.csv']
    
    for file_loc in csv_files:
        file_name = file_loc.split('/')[3]
        
        if file_name == 'listings_sample.csv':
            #Read Listing Sample
            listing_sample = pd.read_csv(file_loc)
            
        elif file_name == 'reviews_sample.csv':
            #Read Review Sample
            reviews_sample = pd.read_csv(file_loc)
            
        elif file_name == 'neighbourhoods_sample.csv':
            #Read Neighborhoods
            neighbourhoods_sample = pd.read_csv(file_loc)
            
    return [listing_sample, reviews_sample, neighbourhoods_sample]


def load_full_data(country, city):
    directory = 'data/' + country + '/' + city + '/'
    csv_files = [file_name for file_name in glob.glob(directory + '*') if file_name[-4:] == '.csv']
    
    for file_loc in csv_files:
        file_name = file_loc.split('/')[3]
        
        if file_name == 'listings_full.csv':
            #Read Listings
            listings_full = pd.read_csv(file_loc).drop(columns=['Unnamed: 0'])
            
        elif file_name == 'reviews_full.csv':
            #Read Reviews
            reviews_full = pd.read_csv(file_loc).drop(columns=['Unnamed: 0'])
            
        elif file_name == 'calendar_full.csv':
            #Read Calendar
            calendar_full = pd.read_csv(file_loc).drop(columns=['Unnamed: 0'])
            
    return [listings_full, reviews_full, calendar_full]

In [5]:
listings_sf = pd.read_csv('../sf/listings_detailed.csv')

In [6]:
def clean_listing_full(listing_full_data):
    """Cleans listing_full.csv data"""
    # Input Data
    df = listing_full_data
    
    # String to Datetime
    df['last_scraped'] = pd.to_datetime(df['last_scraped'])
    df['host_since'] = pd.to_datetime(df['host_since'])
    df['calendar_last_scraped'] = pd.to_datetime(df['calendar_last_scraped'])
    df['first_review'] = pd.to_datetime(df['first_review'])
    df['last_review'] = pd.to_datetime(df['last_review'])
    
    # String to Numeric
    df['host_response_rate'] = pd.to_numeric(df['host_response_rate'].str[:-1]) / 100
    df['price'] = pd.to_numeric(df['price'].str[1:].str.replace(',',''))
    df['weekly_price'] = pd.to_numeric(df['weekly_price'].str[1:].str.replace(',',''))
    df['monthly_price'] = pd.to_numeric(df['monthly_price'].str[1:].str.replace(',',''))
    df['security_deposit'] = pd.to_numeric(df['security_deposit'].str[1:].str.replace(',',''))
    df['cleaning_fee'] = pd.to_numeric(df['cleaning_fee'].str[1:].str.replace(',',''))
    df['extra_people'] = pd.to_numeric(df['extra_people'].str[1:].str.replace(',',''))

    # t/f to Numeric
    df['host_is_superhost'] = (df['host_is_superhost'] == "t").astype(int)
    df['host_has_profile_pic'] = (df['host_has_profile_pic'] == "t").astype(int)
    df['host_identity_verified'] = (df['host_identity_verified'] == "t").astype(int)
    df['is_location_exact'] = (df['is_location_exact'] == "t").astype(int)
    df['has_availability'] = (df['has_availability'] == "t").astype(int)
    df['requires_license'] = (df['requires_license'] == "t").astype(int)
    df['instant_bookable'] = (df['instant_bookable'] == "t").astype(int)
    df['is_business_travel_ready'] = (df['is_business_travel_ready'] == "t").astype(int)
    df['require_guest_profile_picture'] = (df['require_guest_profile_picture'] == "t").astype(int)
    df['require_guest_phone_verification'] = (df['require_guest_phone_verification'] == "t").astype(int)
    
    return df

In [7]:
clean_listings_sf = clean_listing_full(listings_sf)

In [8]:
listings_detailed = clean_listings_sf.copy()

ID = list(listings_detailed.iloc[:,:2].columns)

ABOUT_COLS = list(listings_detailed.iloc[:,3:15].columns)

PICS_COLS = list(listings_detailed.iloc[:,15:19].columns)

HOST_COLS = list(listings_detailed.columns[listings_detailed.columns.str.contains('host')])

NEIGHBORHOOD_COLS = list(listings_detailed.columns[listings_detailed.columns.str.contains('neighbourhood')])

LOCATION_COLS = list(listings_detailed.iloc[:,37:51].columns)

PROPERTY_COLS = list(listings_detailed.iloc[:,51:60].columns)

PRICE_COLS = list(listings_detailed.iloc[:,60:67].columns)

NIGHTS_COLS = list(listings_detailed.columns[listings_detailed.columns.str.contains('mum')])

AVAILABILITY_COLS = list(listings_detailed.columns[listings_detailed.columns.str.contains('availability')])

REVIEW_COLS = list(listings_detailed.columns[listings_detailed.columns.str.contains('review')])

SCRAPING_COLS = ['scrape_id','calendar_updated','calendar_last_scraped']

ELSE_COLS = ['requires_license', 'license', 'jurisdiction_names', 'instant_bookable',\
             'is_business_travel_ready', 'cancellation_policy', 'require_guest_profile_picture',\
             'require_guest_phone_verification']

## Neighborhood Info

In [9]:
neighborhood_info = clean_listings_sf[ID + NEIGHBORHOOD_COLS + ABOUT_COLS]

In [10]:
neighborhood_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7198 entries, 0 to 7197
Data columns (total 18 columns):
id                              7198 non-null int64
listing_url                     7198 non-null object
host_neighbourhood              6559 non-null object
neighbourhood                   6660 non-null object
neighbourhood_cleansed          7198 non-null object
neighbourhood_group_cleansed    0 non-null float64
last_scraped                    7198 non-null datetime64[ns]
name                            7198 non-null object
summary                         7000 non-null object
space                           6109 non-null object
description                     7183 non-null object
experiences_offered             7198 non-null object
neighborhood_overview           5310 non-null object
notes                           4486 non-null object
transit                         5238 non-null object
access                          4794 non-null object
interaction                     4931 non

In [11]:
neighborhood_overviews = neighborhood_info[['id','neighbourhood_cleansed','neighborhood_overview']].dropna()

## Clean & Tokenize Overview Text

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re

# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Get Stopwords
stop_words = set(stopwords.words('english'))

# Get Puncuations
punctuations = set(string.punctuation)

def clean_text(doc):
    
    # remove all ascii
    doc = re.sub(r'[^\x00-\x7F]+',' ', doc)

    # Tokenize, Lemmatize, and Remove Stopwords
    tokens = [lemmatizer.lemmatize(word).lower() for word in nltk.word_tokenize(doc) if word.lower() not in set(stop_words | punctuations)]
    
    return tokens

In [13]:
# Clean & Tokenize Overviews
neighborhood_overviews['tokens'] = neighborhood_overviews['neighborhood_overview'].apply(clean_text)

In [14]:
neighborhood_overviews['clean_overviews'] = neighborhood_overviews['tokens'].map(lambda x: ' '.join(x))

# Topic Modeling

In [15]:
token_texts = list(neighborhood_overviews['tokens'].values)

In [16]:
from gensim.corpora.dictionary import Dictionary

# Create a corpus from a list of texts
common_dictionary = Dictionary(token_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in token_texts]

  utils.PersistentlyDeprecated2018,


In [17]:
from gensim.models.ldamulticore import LdaMulticore
import time
ldam = LdaMulticore

num_topics = 50
num_words = 10
passes = 50

# Get Start Time
start_time = time.time()

# LDA Model
ldam_model = ldam(common_corpus, num_topics=num_topics, id2word=common_dictionary, passes=passes)
model_end_time = time.time() # Model End Time

# LDA Results
results = ldam_model.print_topics(num_topics=num_topics, num_words=num_words)
result_time = time.time() # Results Time

In [18]:
def display_results(results):
    for index, results in results:
        print(str(index) + ': ' + str(', '.join(results.split('"')[1::2])))

In [19]:
display_results(results)

0: duboce, triangle, walk, best, 're, neighborhood, mission, san, francisco, haight
1: block, shop, coffee, street, restaurant, polk, store, cafe, bar, town
2: walking, distance, within, restaurant, great, bar, shop, right, everything, cafe
3: store, restaurant, grocery, 's, block, within, market, distance, many, walking
4: walk, city, minute, restaurant, block, shop, want, bus, get, line
5: minute, walk, min, 10, away, 5, neighborhood, downtown, park, 15
6: block, street, market, 's, store, mission, restaurant, park, shop, neighborhood
7: cathedral, grace, hotel, nob, walk, great, city, huntington, hill, neighborhood
8: gate, golden, park, beach, museum, block, ocean, restaurant, neighborhood, garden
9: mission, neighborhood, minute, coffee, park, restaurant, 's, transit, best, public
10: san, francisco, district, walk, park, restaurant, take, shop, view, spot
11: park, haight, restaurant, block, alamo, square, golden, gate, nopa, divisadero
12: mission, san, francisco, neighborhood, 

In [20]:
ldam_model.save('../models/full_overviews_50topics_10words_50passes.model')

# By Neighborhood

In [23]:
tokens_by_neighborhood = neighborhood_overviews.groupby('neighbourhood_cleansed')[['tokens']].apply(sum)

# Topic Modeling

In [26]:
token_texts_by_neighborhood = list(tokens_by_neighborhood['tokens'].values)

In [27]:
from gensim.corpora.dictionary import Dictionary

# Create a corpus from a list of texts
common_dictionary_by_neighborhood = Dictionary(token_texts_by_neighborhood)
common_corpus_by_neighborhood = [common_dictionary_by_neighborhood.doc2bow(text) for text in token_texts_by_neighborhood]

In [28]:
from gensim.models.ldamulticore import LdaMulticore
import time
ldam = LdaMulticore

num_topics = 50
num_words = 10
passes = 50

# Get Start Time
start_time = time.time()

# LDA Model
ldam_model_by_neighborhood = ldam(common_corpus_by_neighborhood, num_topics=num_topics, id2word=common_dictionary_by_neighborhood, passes=passes)
model_end_time = time.time() # Model End Time

# LDA Results
results_by_neighborhood = ldam_model.print_topics(num_topics=num_topics, num_words=num_words)
result_time = time.time() # Results Time

In [29]:
display_results(results)

0: duboce, triangle, walk, best, 're, neighborhood, mission, san, francisco, haight
1: block, shop, coffee, street, restaurant, polk, store, cafe, bar, town
2: walking, distance, within, restaurant, great, bar, shop, right, everything, cafe
3: store, restaurant, grocery, 's, block, within, market, distance, many, walking
4: walk, city, minute, restaurant, block, shop, want, bus, get, line
5: minute, walk, min, 10, away, 5, neighborhood, downtown, park, 15
6: block, street, market, 's, store, mission, restaurant, park, shop, neighborhood
7: cathedral, grace, hotel, nob, walk, great, city, huntington, hill, neighborhood
8: gate, golden, park, beach, museum, block, ocean, restaurant, neighborhood, garden
9: mission, neighborhood, minute, coffee, park, restaurant, 's, transit, best, public
10: san, francisco, district, walk, park, restaurant, take, shop, view, spot
11: park, haight, restaurant, block, alamo, square, golden, gate, nopa, divisadero
12: mission, san, francisco, neighborhood, 

In [30]:
ldam_model_by_neighborhood.save('../models/full_overviews_by_neighborhood_50topics_10words_50passes.model')