In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set()
import glob as glob

In [6]:
def load_sample_data(country, city):
    directory = 'data/' + country + '/' + city + '/'
    csv_files = [file_name for file_name in glob.glob(directory + '*') if file_name[-4:] == '.csv']
    
    for file_loc in csv_files:
        file_name = file_loc.split('/')[3]
        
        if file_name == 'listings_sample.csv':
            #Read Listing Sample
            listing_sample = pd.read_csv(file_loc)
            
        elif file_name == 'reviews_sample.csv':
            #Read Review Sample
            reviews_sample = pd.read_csv(file_loc)
            
        elif file_name == 'neighbourhoods_sample.csv':
            #Read Neighborhoods
            neighbourhoods_sample = pd.read_csv(file_loc)
            
    return [listing_sample, reviews_sample, neighbourhoods_sample]


def load_full_data(country, city):
    directory = 'data/' + country + '/' + city + '/'
    csv_files = [file_name for file_name in glob.glob(directory + '*') if file_name[-4:] == '.csv']
    
    for file_loc in csv_files:
        file_name = file_loc.split('/')[3]
        
        if file_name == 'listings_full.csv':
            #Read Listings
            listings_full = pd.read_csv(file_loc).drop(columns=['Unnamed: 0'])
            
        elif file_name == 'reviews_full.csv':
            #Read Reviews
            reviews_full = pd.read_csv(file_loc).drop(columns=['Unnamed: 0'])
            
        elif file_name == 'calendar_full.csv':
            #Read Calendar
            calendar_full = pd.read_csv(file_loc).drop(columns=['Unnamed: 0'])
            
    return [listings_full, reviews_full, calendar_full]

In [8]:
listings_sf = pd.read_csv('../sf/listings_detailed.csv')

In [9]:
def clean_listing_full(listing_full_data):
    """Cleans listing_full.csv data"""
    # Input Data
    df = listing_full_data
    
    # String to Datetime
    df['last_scraped'] = pd.to_datetime(df['last_scraped'])
    df['host_since'] = pd.to_datetime(df['host_since'])
    df['calendar_last_scraped'] = pd.to_datetime(df['calendar_last_scraped'])
    df['first_review'] = pd.to_datetime(df['first_review'])
    df['last_review'] = pd.to_datetime(df['last_review'])
    
    # String to Numeric
    df['host_response_rate'] = pd.to_numeric(df['host_response_rate'].str[:-1]) / 100
    df['price'] = pd.to_numeric(df['price'].str[1:].str.replace(',',''))
    df['weekly_price'] = pd.to_numeric(df['weekly_price'].str[1:].str.replace(',',''))
    df['monthly_price'] = pd.to_numeric(df['monthly_price'].str[1:].str.replace(',',''))
    df['security_deposit'] = pd.to_numeric(df['security_deposit'].str[1:].str.replace(',',''))
    df['cleaning_fee'] = pd.to_numeric(df['cleaning_fee'].str[1:].str.replace(',',''))
    df['extra_people'] = pd.to_numeric(df['extra_people'].str[1:].str.replace(',',''))

    # t/f to Numeric
    df['host_is_superhost'] = (df['host_is_superhost'] == "t").astype(int)
    df['host_has_profile_pic'] = (df['host_has_profile_pic'] == "t").astype(int)
    df['host_identity_verified'] = (df['host_identity_verified'] == "t").astype(int)
    df['is_location_exact'] = (df['is_location_exact'] == "t").astype(int)
    df['has_availability'] = (df['has_availability'] == "t").astype(int)
    df['requires_license'] = (df['requires_license'] == "t").astype(int)
    df['instant_bookable'] = (df['instant_bookable'] == "t").astype(int)
    df['is_business_travel_ready'] = (df['is_business_travel_ready'] == "t").astype(int)
    df['require_guest_profile_picture'] = (df['require_guest_profile_picture'] == "t").astype(int)
    df['require_guest_phone_verification'] = (df['require_guest_phone_verification'] == "t").astype(int)
    
    return df

In [10]:
clean_listings_sf = clean_listing_full(listings_sf)

In [11]:
listings_detailed = clean_listings_sf.copy()

ID = list(listings_detailed.iloc[:,:2].columns)

ABOUT_COLS = list(listings_detailed.iloc[:,3:15].columns)

PICS_COLS = list(listings_detailed.iloc[:,15:19].columns)

HOST_COLS = list(listings_detailed.columns[listings_detailed.columns.str.contains('host')])

NEIGHBORHOOD_COLS = list(listings_detailed.columns[listings_detailed.columns.str.contains('neighbourhood')])

LOCATION_COLS = list(listings_detailed.iloc[:,37:51].columns)

PROPERTY_COLS = list(listings_detailed.iloc[:,51:60].columns)

PRICE_COLS = list(listings_detailed.iloc[:,60:67].columns)

NIGHTS_COLS = list(listings_detailed.columns[listings_detailed.columns.str.contains('mum')])

AVAILABILITY_COLS = list(listings_detailed.columns[listings_detailed.columns.str.contains('availability')])

REVIEW_COLS = list(listings_detailed.columns[listings_detailed.columns.str.contains('review')])

SCRAPING_COLS = ['scrape_id','calendar_updated','calendar_last_scraped']

ELSE_COLS = ['requires_license', 'license', 'jurisdiction_names', 'instant_bookable',\
             'is_business_travel_ready', 'cancellation_policy', 'require_guest_profile_picture',\
             'require_guest_phone_verification']

## Neighborhood Info

In [12]:
neighborhood_info = clean_listings_sf[ID + NEIGHBORHOOD_COLS + ABOUT_COLS]

In [13]:
neighborhood_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7198 entries, 0 to 7197
Data columns (total 18 columns):
id                              7198 non-null int64
listing_url                     7198 non-null object
host_neighbourhood              6559 non-null object
neighbourhood                   6660 non-null object
neighbourhood_cleansed          7198 non-null object
neighbourhood_group_cleansed    0 non-null float64
last_scraped                    7198 non-null datetime64[ns]
name                            7198 non-null object
summary                         7000 non-null object
space                           6109 non-null object
description                     7183 non-null object
experiences_offered             7198 non-null object
neighborhood_overview           5310 non-null object
notes                           4486 non-null object
transit                         5238 non-null object
access                          4794 non-null object
interaction                     4931 non

In [14]:
neighborhood_overviews = neighborhood_info[['id','neighbourhood_cleansed','neighborhood_overview']].dropna()

## Clean & Tokenize Overview Text

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re

# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Get Stopwords
stop_words = set(stopwords.words('english'))

# Get Puncuations
punctuations = set(string.punctuation)

def clean_text(doc):
    
    # remove all ascii
    doc = re.sub(r'[^\x00-\x7F]+',' ', doc)

    # Tokenize, Lemmatize, and Remove Stopwords
    tokens = [lemmatizer.lemmatize(word).lower() for word in nltk.word_tokenize(doc) if word.lower() not in set(stop_words | punctuations)]
    
    return tokens

In [16]:
# Clean & Tokenize Overviews
neighborhood_overviews['tokens'] = neighborhood_overviews['neighborhood_overview'].apply(clean_text)

In [17]:
neighborhood_overviews['clean_overviews'] = neighborhood_overviews['tokens'].map(lambda x: ' '.join(x))

## Get POS

In [18]:
def get_pos_sentences(data):
    # Tokenize each sentence into words: token_sentences
    token_sentences = [nltk.word_tokenize(re.sub(r'[^\x00-\x7F]+',' ', sent)) for sent in data]

    # Tag each tokenized sentence into parts of speech: pos_sentences
    pos_sentences = [nltk.pos_tag(sent) for sent in token_sentences]
    return pos_sentences

In [19]:
# Initialize TextExtraction with DS Skills Listing
pos_overviews_neighborhood = get_pos_sentences(neighborhood_overviews['neighborhood_overview'])

## Adjectives

In [20]:
def get_adjectives(pos_sentences):
    """Adjective List"""
    # Codes
    adj_code = ['JJ','JJR','JJS']

    # Get List of Adjectives
    adj_list = [[word[0].lower() for word in sent if word[1] in adj_code] for sent in pos_sentences]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    adj_lem_list = [[lemmatizer.lemmatize(adj, 'a') for adj in adj_sent] for adj_sent in adj_list]
        
    return adj_lem_list

In [21]:
adj_overviews_neighborhood = get_adjectives(pos_overviews_neighborhood)

In [27]:
neighborhood_overviews = neighborhood_overviews.reset_index(drop=True)

In [28]:
neighborhood_overviews['adjectives'] = pd.Series(adj_overviews_neighborhood)

# Topic Modeling

In [29]:
token_texts = list(neighborhood_overviews['adjectives'].values)

In [30]:
from gensim.corpora.dictionary import Dictionary

# Create a corpus from a list of texts
common_dictionary = Dictionary(token_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in token_texts]

In [47]:
from gensim.models.ldamulticore import LdaMulticore
import time
ldam = LdaMulticore

num_topics = 50
num_words = 10
passes = 50

# Get Start Time
start_time = time.time()

# LDA Model
ldam_model = ldam(common_corpus, num_topics=num_topics, id2word=common_dictionary, passes=passes)
model_end_time = time.time() # Model End Time

# LDA Results
results = ldam_model.print_topics(num_topics=num_topics, num_words=num_words)
result_time = time.time() # Results Time

In [50]:
ldam_model.save('../models/ldam_neighborhood_overviews_50topics_10words_50passes_adj.model')

In [48]:
def display_results(results):
    for index, results in results:
        print(str(index) + ': ' + str(', '.join(results.split('"')[1::2])))

In [49]:
display_results(results)

0: many, family-friendly, beautiful, busy, 10-minute, short, diverse, safe, resort-like, urban
1: friendly, hot, best, active, large, new, sunny, old, second, sought-after
2: nearby, such, many, private, other, more, short, lively, geographic, difficult
3: famous, great, many, favorite, french, such, nitty-gritty, original, beautiful, deep
4: quiet, safe, residential, beautiful, many, clean, peaceful, other, best, short
5: less, many, beautiful, first, easy, retail, central, trendy, mixed, various
6: available, few, short, quiet, cheese, new, enough, good, nail, public
7: ethnic, other, few, trendy, great, large, artisan, gourmet, near, authentic
8: easy, popular, beautiful, vibrant, best, central, incredible, other, sunny, former
9: classic, quick, stunning, great, 24th, iconic, unique, few, national, other
10: tech, late, great, short, commercial, industrial, sweet, furnished, unique, several
11: central, desirable, vibrant, residential, local, best, such, memorable, painted, 5-10
12

In [51]:
def display_results_no_duplicates(results):
    all_lists = []
    for index, result in results:
        all_lists = all_lists + result.split('"')[1::2]
    
    # Get Counts of each word
    counts = pd.Series(all_lists).value_counts()
    no_duplicates = counts[counts == 1].index
    
    for index, result in results:
        print(str(index) + ': ' + str(', '.join([word for word in result.split('"')[1::2] if word in no_duplicates])))

In [52]:
display_results_no_duplicates(results)

0: family-friendly, busy, 10-minute, resort-like
1: friendly, hot, active, second, sought-after
2: private, geographic, difficult
3: nitty-gritty, deep
4: clean
5: less, first, retail, mixed, various
6: cheese, enough, nail
7: ethnic, artisan, gourmet, near
8: 
9: classic, stunning, 24th, national
10: tech, late, industrial, sweet
11: desirable, memorable, painted, 5-10
12: fantastic, russian, particular, crooked
13: modern, neighborhood
14: wonderful, half, outdoor, typical, east, peruvian
15: top, super, southern, cool, awesome, quaint, casual
16: sure, foodie, plenty
17: upscale, open, spectacular, main, prestigious
18: downtown, tree-lined, north
19: perfect, ocean, pleasant, immediate, last
20: walkable, ever-increasing
21: true, min, natural, cafe, only, ten
22: next, eclectic, huge, least, exceptional
23: whole, chinese, vietnamese
24: different, bohemian, homeless, beat, aware, comfortable
25: free, scenic, hip
26: happy
27: much
28: accessible, own, panoramic, cheap, muni
29: 

# By Neighborhood

In [38]:
tokens_by_neighborhood = neighborhood_overviews.groupby('neighbourhood_cleansed')[['adjectives']].apply(sum)

# Topic Modeling

In [40]:
token_texts_by_neighborhood = list(tokens_by_neighborhood['adjectives'].values)

In [41]:
from gensim.corpora.dictionary import Dictionary

# Create a corpus from a list of texts
common_dictionary_by_neighborhood = Dictionary(token_texts_by_neighborhood)
common_corpus_by_neighborhood = [common_dictionary_by_neighborhood.doc2bow(text) for text in token_texts_by_neighborhood]

In [43]:
from gensim.models.ldamulticore import LdaMulticore
import time
ldam = LdaMulticore

num_topics = 50
num_words = 10
passes = 50

# Get Start Time
start_time = time.time()

# LDA Model
ldam_model_by_neighborhood = ldam(common_corpus_by_neighborhood, num_topics=num_topics, id2word=common_dictionary_by_neighborhood, passes=passes)
model_end_time = time.time() # Model End Time

# LDA Results
results_by_neighborhood = ldam_model_by_neighborhood.print_topics(num_topics=num_topics, num_words=num_words)
result_time = time.time() # Results Time

In [46]:
ldam_model_by_neighborhood.save('../models/ldam_overview_by_neighborhood_50topics_10words_50passes_adj.model')

In [45]:
display_results(results_by_neighborhood)

0: great, many, best, easy, famous, public, central, short, few, safe
1: many, best, great, easy, public, quiet, few, vibrant, new, beautiful
2: best, great, many, other, quiet, few, safe, public, local, easy
3: best, many, great, few, local, public, vibrant, other, easy, s
4: great, many, urban, colorful, little, delicious, best, incredible, famous, quiet
5: central, public, victorian, many, trendy, famous, low, green, iconic, residential
6: many, best, great, easy, s, quiet, local, central, short, new
7: high, public, urban, many, adjacent, quiet, interested, short, easy, breath-taking
8: great, many, nitty-gritty, famous, original, such, favorite, best, other, iconic
9: great, many, small, quiet, affluent, tree-lined, short, easy, local, other
10: best, great, quiet, many, public, few, local, safe, famous, easy
11: great, many, quiet, best, safe, easy, short, beautiful, few, public
12: great, best, many, quiet, few, easy, famous, short, local, public
13: best, many, great, local, be

In [53]:
display_results_no_duplicates(results_by_neighborhood)

0: 
1: 
2: 
3: 
4: delicious
5: low
6: 
7: adjacent, interested, breath-taking
8: nitty-gritty, such, favorite
9: affluent, tree-lined
10: 
11: 
12: 
13: 
14: 
15: 
16: nice
17: elevated
18: main, friendly
19: popular, good
20: 
21: financial, high-rise
22: 
23: 
24: 
25: 
26: 
27: 
28: fine, theatrical
29: 
30: 
31: tourist-popular, above-ground, hopping
32: 
33: 
34: 
35: 
36: international
37: 
38: rich, regular, fantastic, wild, uphill
39: 
40: 
41: 
42: 
43: 
44: 
45: 
46: 
47: russian
48: close, diverse
49: 
