# Topic Modelling Neighborhood Overview: Nouns

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set()
import glob as glob

In [6]:
def load_sample_data(country, city):
    """Load the three sample CSV extracts stored for one city.

    Reads ``listings_sample.csv``, ``reviews_sample.csv`` and
    ``neighbourhoods_sample.csv`` from ``data/<country>/<city>/``.

    Args:
        country: Directory name (or relative sub-path) of the country
            under ``data/``.
        city: Directory name of the city under the country directory.

    Returns:
        ``[listing_sample, reviews_sample, neighbourhoods_sample]`` as
        pandas DataFrames, in that order.

    Raises:
        FileNotFoundError: If one of the three files does not exist
            (raised by ``pd.read_csv``).
    """
    import os

    sample_files = (
        'listings_sample.csv',
        'reviews_sample.csv',
        'neighbourhoods_sample.csv',
    )

    # Address each file by its full path rather than by a fixed position in
    # a '/'-split glob result: that only worked for exactly
    # data/<country>/<city>/<file> with '/' as the path separator.
    directory = os.path.join('data', country, city)
    return [pd.read_csv(os.path.join(directory, name)) for name in sample_files]


def load_full_data(country, city):
    """Load the three full CSV extracts stored for one city.

    Reads ``listings_full.csv``, ``reviews_full.csv`` and
    ``calendar_full.csv`` from ``data/<country>/<city>/`` and removes the
    ``Unnamed: 0`` column from each frame when it is present.

    Args:
        country: Directory name (or relative sub-path) of the country
            under ``data/``.
        city: Directory name of the city under the country directory.

    Returns:
        ``[listings_full, reviews_full, calendar_full]`` as pandas
        DataFrames, in that order.

    Raises:
        FileNotFoundError: If one of the three files does not exist
            (raised by ``pd.read_csv``).
    """
    import os

    full_files = ('listings_full.csv', 'reviews_full.csv', 'calendar_full.csv')

    # Address each file by its full path rather than by a fixed position in
    # a '/'-split glob result: that only worked for exactly
    # data/<country>/<city>/<file> with '/' as the path separator.
    directory = os.path.join('data', country, city)

    frames = []
    for name in full_files:
        frame = pd.read_csv(os.path.join(directory, name))
        # errors='ignore' keeps this working for files that do not carry an
        # 'Unnamed: 0' column in the first place.
        frames.append(frame.drop(columns=['Unnamed: 0'], errors='ignore'))
    return frames

In [8]:
listings_sf = pd.read_csv('../sf/listings_detailed.csv')

In [9]:
def clean_listing_full(listing_full_data):
    """Return a type-converted copy of a detailed listings frame.

    The input frame is left untouched; all conversions happen on a copy, so
    the function can be called again on the same raw frame.

    Conversions:
        * date columns -> ``datetime64`` via ``pd.to_datetime``
        * ``host_response_rate`` -> last character stripped, parsed as a
          number and divided by 100
        * money columns -> first character and thousands separators (",")
          stripped, parsed as numbers
        * "t"/"f" flag columns -> 1 where the value equals "t", else 0
          (so missing values also become 0)

    Args:
        listing_full_data: DataFrame holding the raw string columns named
            below.

    Returns:
        A new DataFrame with the same columns and the conversions applied.
    """
    date_cols = [
        'last_scraped', 'host_since', 'calendar_last_scraped',
        'first_review', 'last_review',
    ]
    money_cols = [
        'price', 'weekly_price', 'monthly_price',
        'security_deposit', 'cleaning_fee', 'extra_people',
    ]
    flag_cols = [
        'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
        'is_location_exact', 'has_availability', 'requires_license',
        'instant_bookable', 'is_business_travel_ready',
        'require_guest_profile_picture', 'require_guest_phone_verification',
    ]

    # Work on a copy: converting in place would alter the caller's frame and
    # make a second call fail on the already-converted (non-string) columns.
    df = listing_full_data.copy()

    # String to Datetime
    for col in date_cols:
        df[col] = pd.to_datetime(df[col])

    # String to Numeric
    df['host_response_rate'] = pd.to_numeric(df['host_response_rate'].str[:-1]) / 100
    for col in money_cols:
        # regex=False: ',' is a literal separator, not a pattern
        df[col] = pd.to_numeric(df[col].str[1:].str.replace(',', '', regex=False))

    # t/f to Numeric
    for col in flag_cols:
        df[col] = (df[col] == "t").astype(int)

    return df

In [10]:
clean_listings_sf = clean_listing_full(listings_sf)

In [11]:
# Work on a copy of the cleaned listings and partition its columns into
# named groups. Groups are built either by column POSITION (iloc slices) or by
# a SUBSTRING match on the column name.
# NOTE(review): the positional slices assume the exact column order of the
# source CSV; if the export schema changes they will silently pick different
# columns — confirm against the file before reuse.
listings_detailed = clean_listings_sf.copy()

# First two columns (positions 0-1)
ID = list(listings_detailed.iloc[:,:2].columns)

# Positions 3-14; position 2 is not part of any positional group
ABOUT_COLS = list(listings_detailed.iloc[:,3:15].columns)

# Positions 15-18
PICS_COLS = list(listings_detailed.iloc[:,15:19].columns)

# Every column whose name contains 'host' (this also matches
# 'host_neighbourhood', which is therefore in NEIGHBORHOOD_COLS as well)
HOST_COLS = list(listings_detailed.columns[listings_detailed.columns.str.contains('host')])

# Every column whose name contains 'neighbourhood'
NEIGHBORHOOD_COLS = list(listings_detailed.columns[listings_detailed.columns.str.contains('neighbourhood')])

# Positions 37-50
LOCATION_COLS = list(listings_detailed.iloc[:,37:51].columns)

# Positions 51-59
PROPERTY_COLS = list(listings_detailed.iloc[:,51:60].columns)

# Positions 60-66
PRICE_COLS = list(listings_detailed.iloc[:,60:67].columns)

# Every column whose name contains 'mum'
# (presumably the minimum_*/maximum_* night columns — verify)
NIGHTS_COLS = list(listings_detailed.columns[listings_detailed.columns.str.contains('mum')])

# Every column whose name contains 'availability'
AVAILABILITY_COLS = list(listings_detailed.columns[listings_detailed.columns.str.contains('availability')])

# Every column whose name contains 'review'
REVIEW_COLS = list(listings_detailed.columns[listings_detailed.columns.str.contains('review')])

# Explicitly listed scrape-related columns
SCRAPING_COLS = ['scrape_id','calendar_updated','calendar_last_scraped']

# Explicitly listed columns that fit none of the groups above
ELSE_COLS = ['requires_license', 'license', 'jurisdiction_names', 'instant_bookable',\
             'is_business_travel_ready', 'cancellation_policy', 'require_guest_profile_picture',\
             'require_guest_phone_verification']

## Neighborhood Info

In [12]:
neighborhood_info = clean_listings_sf[ID + NEIGHBORHOOD_COLS + ABOUT_COLS]

In [13]:
neighborhood_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7198 entries, 0 to 7197
Data columns (total 18 columns):
id                              7198 non-null int64
listing_url                     7198 non-null object
host_neighbourhood              6559 non-null object
neighbourhood                   6660 non-null object
neighbourhood_cleansed          7198 non-null object
neighbourhood_group_cleansed    0 non-null float64
last_scraped                    7198 non-null datetime64[ns]
name                            7198 non-null object
summary                         7000 non-null object
space                           6109 non-null object
description                     7183 non-null object
experiences_offered             7198 non-null object
neighborhood_overview           5310 non-null object
notes                           4486 non-null object
transit                         5238 non-null object
access                          4794 non-null object
interaction                     4931 non

In [14]:
# Keep only listings that have an overview (rows with a missing value in any of
# the three columns are dropped). .copy() makes the result an independent frame,
# so adding columns to it later does not go through pandas' SettingWithCopy
# (assignment-on-a-slice) path.
neighborhood_overviews = neighborhood_info[['id','neighbourhood_cleansed','neighborhood_overview']].dropna().copy()

## Clean & Tokenize Overview Text

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re

# Shared WordNet lemmatizer instance used by clean_text()
lemmatizer = WordNetLemmatizer()

# English stopwords from NLTK, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# The ASCII punctuation characters from string.punctuation, as a set
punctuations = set(string.punctuation)

def clean_text(doc):
    """Tokenize a text into lowercase lemmas without stopwords or punctuation.

    Uses the module-level ``lemmatizer``, ``stop_words`` and ``punctuations``.

    Args:
        doc: The raw text (str).

    Returns:
        A list of lowercase, lemmatized tokens. Tokens whose lowercase form
        is a stopword or a single punctuation character are left out.
    """
    # Replace every run of non-ASCII characters with a single space
    doc = re.sub(r'[^\x00-\x7F]+', ' ', doc)

    # Build the exclusion set once per call rather than once per token
    excluded = stop_words | punctuations

    tokens = []
    for word in nltk.word_tokenize(doc):
        lowered = word.lower()
        if lowered in excluded:
            continue
        # Lowercase BEFORE lemmatizing, as get_nouns() does: the WordNet
        # lookup is made on the given spelling, so a capitalised plural such
        # as "Restaurants" would otherwise come back unlemmatized.
        tokens.append(lemmatizer.lemmatize(lowered))

    return tokens

In [16]:
# Clean & Tokenize Overviews
neighborhood_overviews['tokens'] = neighborhood_overviews['neighborhood_overview'].apply(clean_text)

In [17]:
neighborhood_overviews['clean_overviews'] = neighborhood_overviews['tokens'].map(lambda x: ' '.join(x))

## Get POS

In [18]:
def get_pos_sentences(data):
    """Part-of-speech tag every text in ``data``.

    Each text has its runs of non-ASCII characters replaced by a space, is
    split into word tokens with ``nltk.word_tokenize`` and is then tagged with
    ``nltk.pos_tag``.

    Args:
        data: Iterable of strings.

    Returns:
        A list with one entry per input text; each entry is the
        ``nltk.pos_tag`` output for that text's tokens.
    """
    tagged_texts = []
    for text in data:
        ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', text)
        word_tokens = nltk.word_tokenize(ascii_only)
        tagged_texts.append(nltk.pos_tag(word_tokens))
    return tagged_texts

In [19]:
# Part-of-speech tag every neighborhood overview
pos_overviews_neighborhood = get_pos_sentences(neighborhood_overviews['neighborhood_overview'])

## Nouns

In [48]:
def get_nouns(pos_sentences):
    """Extract the lowercased, lemmatized nouns of each tagged sentence.

    Args:
        pos_sentences: Iterable of tagged sentences, each an iterable of
            ``(word, tag)`` pairs.

    Returns:
        A list with one list per input sentence holding the lemmas of the
        words tagged NN, NNS, NNP or NNPS, in their original order.
    """
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    wordnet = WordNetLemmatizer()

    nouns_per_sentence = []
    for tagged_sentence in pos_sentences:
        # Lowercase first, then lemmatize
        lemmas = [
            wordnet.lemmatize(pair[0].lower())
            for pair in tagged_sentence
            if pair[1] in noun_tags
        ]
        nouns_per_sentence.append(lemmas)

    return nouns_per_sentence

In [49]:
nouns_overviews_neighborhood = get_nouns(pos_overviews_neighborhood)

In [50]:
neighborhood_overviews = neighborhood_overviews.reset_index(drop=True)

In [51]:
# Attach the noun lists row by row. The Series is built on the frame's own index
# because column assignment aligns by index label: a bare pd.Series(list) is
# labelled 0..n-1 and would silently produce NaN / shifted rows if the frame's
# index were anything else (e.g. after dropna without a reset).
neighborhood_overviews['nouns'] = pd.Series(nouns_overviews_neighborhood, index=neighborhood_overviews.index)

# Topic Modeling

In [52]:
token_texts_nouns = list(neighborhood_overviews['nouns'].values)

In [53]:
from gensim.corpora.dictionary import Dictionary

# Create a corpus from a list of texts
common_dictionary = Dictionary(token_texts_nouns)
common_corpus = [common_dictionary.doc2bow(text) for text in token_texts_nouns]

In [107]:
from gensim.models.ldamulticore import LdaMulticore
import time
# Train an LDA topic model on the per-listing noun corpus and capture its topics.
# Short alias for the model class
ldam = LdaMulticore

# Hyperparameters: num_topics and passes go to the model constructor,
# num_topics and num_words to print_topics() below.
num_topics = 50
num_words = 5
passes = 50

# Get Start Time
start_time = time.time()

# LDA Model
# NOTE(review): no random seed is passed here, so repeated runs may yield
# different topics — confirm whether reproducibility is needed.
ldam_model = ldam(common_corpus, num_topics=num_topics, id2word=common_dictionary, passes=passes)
model_end_time = time.time() # Model End Time

# LDA Results: (topic id, formatted topic string) pairs as consumed by
# display_results() / display_results_no_duplicates()
results = ldam_model.print_topics(num_topics=num_topics, num_words=num_words)
result_time = time.time() # Results Time

In [108]:
ldam_model.save('../models/ldam_neighborhood_overviews_50topics_5words_50passes_nouns.model')

In [109]:
def display_results(results):
    """Print one ``<topic id>: <word>, <word>, ...`` line per topic.

    Args:
        results: Iterable of ``(topic_id, topic_string)`` pairs, where the
            topic string carries each word in double quotes
            (e.g. ``0.05*"store" + 0.04*"shop"``).
    """
    for topic_id, topic_repr in results:
        # Splitting on '"' leaves the quoted words at the odd positions
        quoted_words = topic_repr.split('"')[1::2]
        print('{}: {}'.format(topic_id, ', '.join(quoted_words)))

In [110]:
display_results(results)

0: store, restaurant, grocery, shop, block
1: street, hill, north, beach, wharf
2: super, entrance, grocer, safety, neighborhood
3: block, street, park, mission, dolores
4: pacific, height, fillmore, francisco, san
5: san, francisco, mission, street, neighborhood
6: bernal, height, park, neighborhood, view
7: neighborhood, lot, people, family, bayview
8: park, glen, canyon, neighborhood, village
9: peak, twin, view, city, hill
10: castro, neighborhood, francisco, san, restaurant
11: center, city, san, francisco, head
12: neighborhood, san, francisco, park, airport
13: court, walk, tennis, park, playground
14: airbnb, neighborhood, district, mission, restaurant
15: san, francisco, neighborhood, life, city
16: district, minute, financial, embarcadero, beach
17: min, street, walk, polk, sf
18: francisco, san, city, square, park
19: access, bus, neighborhood, transportation, line
20: block, restaurant, bar, door, corner
21: park, haight, neighborhood, haight-ashbury, francisco
22: san, fra

In [111]:
def display_results_no_duplicates(results):
    """Print each topic with only the words that occur exactly once overall.

    A word is kept when it appears a single time across ALL topic strings;
    words shared between topics (or repeated within one) are left out. Topics
    with no such word are still printed, with an empty word list.

    Args:
        results: Iterable of ``(topic_id, topic_string)`` pairs, where the
            topic string carries each word in double quotes
            (e.g. ``0.05*"store" + 0.04*"shop"``). One-shot iterables are
            supported.
    """
    from collections import Counter

    # Parse every topic string once; splitting on '"' leaves the quoted
    # words at the odd positions.
    topics = [(topic_id, topic_repr.split('"')[1::2]) for topic_id, topic_repr in results]

    # Count each word over all topics
    counts = Counter(word for _, words in topics for word in words)

    for topic_id, words in topics:
        unique_words = [word for word in words if counts[word] == 1]
        print(str(topic_id) + ': ' + ', '.join(unique_words))

In [112]:
display_results_no_duplicates(results)

0: store, grocery
1: north
2: super, entrance, grocer, safety
3: dolores
4: pacific, fillmore
5: 
6: bernal
7: lot, people, family, bayview
8: glen, canyon, village
9: peak, twin
10: 
11: head
12: airport
13: court, tennis, playground
14: airbnb
15: 
16: financial, embarcadero
17: min, polk
18: 
19: access, bus, transportation, line
20: door, corner
21: haight-ashbury
22: west, portal
23: 
24: mix
25: hidden, book, place
26: car
27: bay, t
28: mi, parking, couple
29: 
30: potrero
31: everything
32: 
33: night, note
34: golden
35: house
36: soma, museum
37: apartment
38: ocean, sunset, zoo
39: building, attraction, tenderloin, ferry
40: 
41: 
42: alamo
43: marina, bridge
44: cafe
45: food, thai, sushi, mexican
46: hayes, hall
47: union, great, theater
48: noe
49: balboa, geary, presidio, clement


# By Neighborhood

In [58]:
# Concatenate all listings' noun lists into one list per neighbourhood.
# The flattening is spelled out instead of passing the builtin `sum` to
# groupby.apply: that relied on pandas substituting its own reduction for the
# builtin (deprecated behaviour) and joined the lists by repeated `+`.
# The result keeps the original shape: a DataFrame indexed by
# 'neighbourhood_cleansed' with a single 'nouns' column.
tokens_by_neighborhood = (
    neighborhood_overviews
    .groupby('neighbourhood_cleansed')['nouns']
    .apply(lambda noun_lists: [noun for nouns in noun_lists for noun in nouns])
    .to_frame('nouns')
)

# Topic Modeling

In [59]:
token_texts_by_neighborhood = list(tokens_by_neighborhood['nouns'].values)

In [60]:
from gensim.corpora.dictionary import Dictionary

# Create a corpus from a list of texts
common_dictionary_by_neighborhood = Dictionary(token_texts_by_neighborhood)
common_corpus_by_neighborhood = [common_dictionary_by_neighborhood.doc2bow(text) for text in token_texts_by_neighborhood]

In [61]:
from gensim.models.ldamulticore import LdaMulticore
import time
# Train an LDA topic model on the per-neighbourhood noun corpus (one document
# per neighbourhood) and capture its topics.
# Short alias for the model class
ldam = LdaMulticore

# Hyperparameters: num_topics and passes go to the model constructor,
# num_topics and num_words to print_topics() below.
num_topics = 50
num_words = 10
passes = 50

# Get Start Time
start_time = time.time()

# LDA Model
# NOTE(review): no random seed is passed here, so repeated runs may yield
# different topics — confirm whether reproducibility is needed.
ldam_model_by_neighborhood = ldam(common_corpus_by_neighborhood, num_topics=num_topics, id2word=common_dictionary_by_neighborhood, passes=passes)
model_end_time = time.time() # Model End Time

# LDA Results: (topic id, formatted topic string) pairs as consumed by
# display_results() / display_results_no_duplicates()
results_by_neighborhood = ldam_model_by_neighborhood.print_topics(num_topics=num_topics, num_words=num_words)
result_time = time.time() # Results Time

In [62]:
# This model is trained on NOUNS, so it is saved under a "_nouns" name; the
# previous "_verbs" suffix mislabelled it and could overwrite a verbs model
# saved under the same path.
ldam_model_by_neighborhood.save('../models/ldam_overview_by_neighborhood_50topics_10words_50passes_nouns.model')

In [63]:
display_results(results_by_neighborhood)

0: neighborhood, restaurant, san, block, park, francisco, street, city, gate, shop
1: noe, valley, neighborhood, restaurant, street, shop, walk, block, mission, park
2: neighborhood, restaurant, street, san, block, city, francisco, bar, park, minute
3: francisco, neighborhood, restaurant, block, park, city, san, street, shop, mission
4: park, gate, neighborhood, golden, restaurant, sunset, inner, block, shop, walk
5: restaurant, francisco, block, street, neighborhood, san, city, park, mission, district
6: neighborhood, san, street, block, francisco, park, restaurant, city, shop, district
7: mission, neighborhood, restaurant, street, block, city, san, park, francisco, bernal
8: restaurant, neighborhood, street, park, san, francisco, city, block, hayes, valley
9: neighborhood, restaurant, street, block, city, park, san, francisco, distance, square
10: park, glen, neighborhood, san, restaurant, francisco, bart, store, mission, city
11: park, neighborhood, restaurant, san, city, block, fra

In [106]:
display_results_no_duplicates(results_by_neighborhood)

0: 
1: noe
2: 
3: 
4: sunset, inner
5: 
6: 
7: bernal
8: hayes
9: 
10: glen, bart, store
11: 
12: 
13: wharf, chinatown, fisherman
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: soma, head, market, bay, museum
25: 
26: marina, bridge
27: wilemina
28: 
29: mosaic, trail, joint, class, clement, biking, nightlife, world, music
30: union, car, building
31: dolores
32: pacific, fillmore
33: 
34: 
35: 
36: 
37: potrero
38: 
39: 
40: 
41: 
42: 
43: haight, cole
44: sf, zoo, stonestown, mall, university, merced, state
45: 
46: 
47: 
48: 
49: 
