In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# nltk.download('stopwords')
from collections import Counter

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

import pickle

In [2]:
# Load the raw Airbnb listings export (gzip-compressed CSV).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is what triggers mixed-type DtypeWarnings.
# NOTE(review): the truncated stderr captured for this cell looks like the tail
# of such a warning — confirm.
df = pd.read_csv('data/airbnb.csv.gz', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Dimensions of the raw listings frame as a (rows, columns) tuple.
df.shape

(8533, 106)

In [4]:
# Missing-value count for the listing id, the free-text neighbourhood
# overview and the neighbourhood label.
key_columns = ['id', 'neighborhood_overview', 'neighbourhood_cleansed']
df[key_columns].isna().sum()

id                           0
neighborhood_overview     2051
neighbourhood_cleansed       0
dtype: int64

In [5]:
# Show every column label of the raw frame on one line.
print(df.columns.tolist())

['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'monthly_price', '

# Remap Neighborhoods to Unique IDs

In [8]:
# Number of listings (non-null ids) in each neighbourhood.
df.groupby('neighbourhood_cleansed')['id'].count()

neighbourhood_cleansed
Albany Park                 94
Archer Heights               6
Armour Square               97
Ashburn                     11
Auburn Gresham              15
Austin                      73
Avalon Park                  3
Avondale                   190
Belmont Cragin              41
Beverly                      8
Bridgeport                 166
Brighton Park               27
Calumet Heights             20
Chatham                     17
Chicago Lawn                 8
Clearing                     2
Douglas                     83
Dunning                     30
East Garfield Park          90
Edgewater                  215
Edison Park                  3
Englewood                   20
Forest Glen                 11
Fuller Park                  8
Gage Park                    5
Garfield Ridge              18
Grand Boulevard            124
Greater Grand Crossing      28
Hegewisch                    4
Hermosa                     19
                          ... 
Near North Side 

# Pull Quantitative Features

In [14]:
# Per-listing price per guest: strip the currency symbol and thousands
# separators from `price`, parse it as a number and divide by `accommodates`.
# Built with .assign()/.drop() so a new frame is produced instead of writing
# into a slice of `df` (which raised SettingWithCopyWarning), and with an
# explicit regex=True because the pattern is a character class.
# NOTE(review): a listing with accommodates == 0 would yield inf here —
# confirm none exist.
df2 = (
    df[['id', 'neighbourhood_cleansed', 'accommodates', 'price']]
    .assign(
        price_norm=lambda listings: (
            listings['price'].str.replace(r'[$,]', '', regex=True).astype(float)
            / listings['accommodates']
        )
    )
    .drop(columns=['accommodates', 'price'])
)

# Aggregate to one row per neighbourhood: listing count and mean price per guest.
# (The previous sort_values() call discarded its result, so rows stay in
# group-key order exactly as before.)
neighborhood_quant = (
    df2.groupby('neighbourhood_cleansed')
    .agg({'id': 'count', 'price_norm': 'mean'})
    .reset_index()
)
neighborhood_quant.columns = ['neighborhood_id', 'num_listings', 'price_norm']
neighborhood_quant.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,neighborhood_id,num_listings,price_norm
0,Albany Park,94,28.490166
1,Archer Heights,6,50.786111
2,Armour Square,97,28.169723
3,Ashburn,11,20.187879
4,Auburn Gresham,15,25.796667


# Generate Stopwords

In [16]:
# Lower-cased, whitespace-split tokens of every distinct neighbourhood name
# (duplicates kept), in order of first appearance in the data.
neighborhoods = df.neighbourhood_cleansed.unique()
neighborhood_words = []
for nb_name in neighborhoods:
  neighborhood_words.extend(str(nb_name).lower().split())
print(neighborhood_words)

['hyde', 'park', 'south', 'lawndale', 'west', 'town', 'lincoln', 'park', 'logan', 'square', 'uptown', 'near', 'north', 'side', 'avondale', 'lake', 'view', 'north', 'center', 'irving', 'park', 'portage', 'park', 'pullman', 'near', 'south', 'side', 'west', 'ridge', 'near', 'west', 'side', 'chatham', 'bridgeport', 'woodlawn', 'albany', 'park', 'kenwood', 'rogers', 'park', 'douglas', 'lincoln', 'square', 'grand', 'boulevard', 'edgewater', 'forest', 'glen', 'humboldt', 'park', 'loop', 'lower', 'west', 'side', 'north', 'lawndale', 'oakland', 'austin', 'east', 'garfield', 'park', 'beverly', 'mckinley', 'park', 'norwood', 'park', 'washington', 'park', 'morgan', 'park', 'west', 'lawn', 'armour', 'square', 'south', 'shore', 'dunning', 'englewood', 'south', 'deering', 'auburn', 'gresham', 'west', 'garfield', 'park', 'hermosa', 'hegewisch', 'south', 'chicago', 'belmont', 'cragin', 'north', 'park', 'greater', 'grand', 'crossing', 'jefferson', 'park', 'west', 'elsdon', 'ashburn', 'garfield', 'ridge'

In [54]:
# Build the stop-word list `sw`: NLTK's English stop words, extended with a
# hard-coded list of additional words, and finally with every token of the
# neighbourhood names (`neighborhood_words`).
# Requires the NLTK 'stopwords' corpus to be installed (the download call in
# the import cell is commented out).
# NOTE(review): `neighborhood_words` is rebound to a DataFrame by the later
# pivot-table cell; re-running this cell after that point would extend `sw`
# with that frame's column labels instead — run the notebook top to bottom.
sw = stopwords.words("english")
sw.extend(['neighborhoods','neighborhood','one','mi', 'etc','take', 'ave','attractions',
           'walk','away','blocks','block','area','apartment','around','minutes','public',
           'many','within','minute','departure','get','like','even','access',"you're",'two',
           'street','streets','right','best','distance','located','also','building','close',
           'avenue','near','please','hidden','number','could','go','lot','set','upon',
           'plenty','th', 'location', 'st', 'airbnb', 'far', 'min', 'mins', 'everything',
           'along','dozen','would','return','several','atmosphere','https','www','com',
           'lots', 'nearby', 'find', 'it’s', 'place', 'lined','predominantly','easy',
           'highlights','blvd','website','still','offer','always','end','great','downtown',
           'whole','foods','trader','joes','across','including','need','day','full','stop',
           'much','feel','tons','next','see','part','want','hour','less','major','top',
           'offers','areas','really','years','spot','make','way','coming','check','things',
           'nd','starbucks','back','surrounded','main','yet','excellent','quite','come',
          'michigan','mile','wrigley','chicago’s','bucktown','wrigleyville','andersonville',
          'magnificent','pilsen','cubs','mccormick','milwaukee','boystown','grant','clark',
          'you’ll','via','—','half','illinois','walgreens','shedd','damen','second',
          'largest','roscoe','numerous','situated','addison','hare','filled','you’re',
          'surrounding','features','include','oak','better','fulton','willis','called',
          'something','l','enough','experience','bean','destination','three','row',
          'anywhere','may','bronzeville','rental','considered','late','el','available',
          'amenities','directly','loyola','living','property','play','five','edge',
          'hear','name','includes','randolph','halsted','we’re','head','reviews','depaul',
          'unit','year','recently','convention','offering','hall','read','wells','staying',
          'addition','miss','road','montrose','eastern','avenues','alike','makes','spend',
          'know','lived','takes','sought','try','line','well','miles','ride','village',
           'corner','know','options','united','spots','visit','uber','buildings','front',
          'stay','door','middle','time','us','owned','countless','run','country','southport',
          'stops','proximity','class','national','stadium','target','lines','fine','become',
          'course','plus','live','every','nice','navy','pier','millenium','looking','high',
          'northwest','spaces','long','condo','worlds','city’s','morning','state','four',
          'named','sure','hub','la','uic','tour'])
# Neighbourhood-name tokens are excluded too.
sw.extend(neighborhood_words)

In [55]:
# Persist the final stop-word list as a pickle (binary content, despite the
# .txt suffix of the file name).
with open("stopwords.txt", "wb") as stopword_file:
    pickle.dump(sw, stopword_file)

# Generate Neighborhood Datasets
Features are words (stop words excluded) whose total count across all neighborhood overviews exceeds 1% of the number of listings.

In [56]:
# Every punctuation mark, symbol and digit that clean() blanks out, each mapped
# to a single space.  Built once so that every call is a single pass over the
# text.  The backslash is written as an explicit "\\" escape.
_CLEAN_TABLE = str.maketrans(
    {c: " " for c in ",.!-/:·;()*&[]_—~`\"+=\\|'?<>^%$#@•–0123456789"}
)

def clean(text):
  """Replace punctuation, symbols and digits with spaces and lower-case the text.

  Parameters
  ----------
  text : str
      Raw text to normalise.

  Returns
  -------
  str
      `text` with each character listed in `_CLEAN_TABLE` replaced by one
      space (so token boundaries are preserved), then lower-cased.
  """
  return text.translate(_CLEAN_TABLE).lower()

In [57]:
# Corpus-wide word counts: join every non-null overview into one cleaned
# string, count its tokens, and keep only the tokens that are not stop words.
all_words = clean(df['neighborhood_overview'].str.cat(sep=' '))
corpus_counts = Counter(all_words.split())
all_top = pd.DataFrame(corpus_counts.most_common(), columns=['word', 'freq'])
all_top = all_top.loc[~all_top['word'].isin(sw)]

In [58]:
# Keep words whose total count exceeds 1% of the number of rows in the
# overview column (rows with a missing overview are included in that length).
threshold = 0.01 * len(df.neighborhood_overview)

is_frequent = all_top.freq > threshold
top_words = all_top.loc[is_frequent, 'word'].tolist()
print(top_words)
print(len(top_words))

['restaurants', 'bars', 'walking', 'shops', 'coffee', 'shopping', 'field', 'home', 'quiet', 'train', 'local', 'art', 'wicker', 'food', 'museum', 'historic', 'grocery', 'river', 'bar', 'stores', 'beautiful', 'transportation', 'music', 'safe', 'short', 'bus', 'parks', 'district', 'house', 'community', 'university', 'nightlife', 'old', 'dining', 'beach', 'drive', 'night', 'lakeview', 'diverse', 'heart', 'enjoy', 'popular', 'restaurant', 'market', 'known', 'store', 'friendly', 'entertainment', 'tree', 'theater', 'blue', 'residential', 'little', 'boutiques', 'famous', 'family', 'bike', 'venues', 'galleries', 'vibrant', 'places', 'mexican', 'green', 'lakefront', 'steps', 'station', 'love', 'people', 'cafes', 'amazing', 'trendy', 'free', 'trail', 'scene', 'museums', 'history', 'red', 'shop', 'parking', 'cultural', 'campus', 'zoo', 'pizza', 'cafe', 'fun', 'culture', 'space', 'unique', 'homes', 'views', 'good', 'small', 'young', 'lively', 'quick', 'cta', 'world', 'mix', 'favorite', 'ukrainian',

In [68]:
# nb_maps: neighbourhood name -> DataFrame of that neighbourhood's non-stop
# words with columns `word`, `freq` (raw count) and `freq_norm` (count divided
# by the number of listings in the neighbourhood that have an overview).
nb_maps = {}
for nb in df['neighbourhood_cleansed'].unique():
  # Listings of this neighbourhood that actually have an overview text.
  words = df[df['neighbourhood_cleansed'] == nb].dropna(subset=['neighborhood_overview'])
  words_str = clean(words.neighborhood_overview.str.cat(sep=' '))
  words_top = pd.DataFrame(Counter(words_str.split()).most_common(), columns=['word', 'freq'])
  # Take an explicit copy of the filtered rows before adding a column, so the
  # assignment targets an independent frame (no SettingWithCopyWarning).
  words_cleaned = words_top[~words_top.word.isin(sw)].copy()
  words_cleaned['freq_norm'] = words_cleaned.freq / len(words)
  # Preview the ten most frequent remaining words.
  print(nb, words_cleaned.word[:10].tolist())
  nb_maps[nb] = words_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Hyde Park ['university', 'museum', 'home', 'walking', 'restaurants', 'shopping', 'historic', 'science', 'industry', 'museums']
South Lawndale ['little', 'mexican', 'restaurants', 'brewery', 'shops', 'amazing', 'people', 'local', 'train', 'mexico']
West Town ['wicker', 'restaurants', 'bars', 'shops', 'coffee', 'ukrainian', 'walking', 'art', 'shopping', 'division']
Lincoln Park ['restaurants', 'zoo', 'shops', 'tree', 'old', 'bars', 'walking', 'historic', 'beach', 'shopping']
Logan Square ['restaurants', 'bars', 'coffee', 'local', 'shops', 'wicker', 'walking', 'trail', 'food', 'market']
Uptown ['restaurants', 'shops', 'coffee', 'green', 'walking', 'bars', 'mill', 'riviera', 'aragon', 'home']
Near North Side ['river', 'restaurants', 'shopping', 'coast', 'gold', 'bars', 'old', 'shops', 'walking', 'art']
Avondale ['restaurants', 'bars', 'blue', 'walking', 'food', 'bar', 'home', 'train', 'short', 'local']
Lake View ['lakeview', 'restaurants', 'field', 'bars', 'shops', 'walking', 'home', 'game

Chicago Lawn ['de', 'un', 'friendly', 'children', 'millas', 'del', 'aeropuerto', 'internacional', 'midway', 'cerca']
Mount Greenwood ['night', 'food', 'restaurants', 'nightlife', 'bars', 'transportation', 'parks', 'jewel', 'isco', 'cvs']
Riverdale ['suburbs', 'train', 'bus', 'combo', 'recommended', 'red', 'connect', 'redline', 'hr', 'quiet']


In [69]:
def get_freq(word, nb_word_maps=None):
  """Collect the normalised frequency of `word` in every neighbourhood that uses it.

  Parameters
  ----------
  word : str
      Token to look up in each per-neighbourhood word table.
  nb_word_maps : dict, optional
      Mapping of neighbourhood name -> DataFrame with `word` and `freq_norm`
      columns.  Defaults to the notebook-level `nb_maps`.

  Returns
  -------
  pandas.DataFrame
      Columns `neighborhood`, `freq`, `word`; one row per neighbourhood whose
      table contains `word`, sorted by `freq` in descending order.  Empty
      (with the same columns) when no neighbourhood contains the word.

  Raises
  ------
  ValueError
      If a neighbourhood table lists `word` more than once (from `.item()`).
  """
  if nb_word_maps is None:
    nb_word_maps = nb_maps
  # Gather plain records and build the frame once; DataFrame.append was
  # removed in pandas 2.0 and grew the frame quadratically.
  records = []
  for neighborhood, words in nb_word_maps.items():
    matches = words.loc[words['word'] == word, 'freq_norm']
    if len(matches) > 0:
      records.append({'neighborhood': neighborhood,
                      'freq': matches.item(),
                      'word': word})
  result = pd.DataFrame(records, columns=['neighborhood', 'freq', 'word'])
  return result.sort_values(by='freq', ascending=False)

In [70]:
# Assign one colour from an 'hls' palette to each neighbourhood, keyed by
# neighbourhood name.
all_nbs = list(nb_maps)
clrs = sns.color_palette('hls', n_colors=len(all_nbs))
colors = dict(zip(all_nbs, clrs))

In [71]:
# Long-format table of (neighborhood, freq, word) for every top word.
# Per-word frames are collected in a list and concatenated once, because
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on
# every iteration.
freq_frames = []
for word in top_words:
  freqs = get_freq(word)
  # Optional per-word bar chart, kept for reference:
  #   freqs.plot(x='neighborhood',y='freq',title=word,kind='bar',legend=False,
  #              color=freqs['neighborhood'].apply(lambda x: colors[x]))
  #   plt.savefig('images/barchart_'+word+'.png',bbox_inches = "tight")
  freq_frames.append(freqs)

# pd.concat rejects an empty list, so fall back to an empty frame with the
# expected columns when there are no top words.
if freq_frames:
  all_words = pd.concat(freq_frames)
else:
  all_words = pd.DataFrame(columns=['neighborhood','freq','word'])

In [72]:
# Wide feature table: one row per neighbourhood, one column per top word
# holding its normalised frequency (0 where the word does not occur), with
# the neighbourhood name exposed as the `neighborhood_id` column.
neighborhood_words = (
    pd.pivot_table(all_words, values='freq', index='neighborhood',
                   columns='word', fill_value=0)
    .reset_index()
    .rename(columns={'neighborhood': 'neighborhood_id'})
)
neighborhood_words.head()

word,neighborhood_id,accessible,activities,african,airport,amazing,american,aquarium,architecture,art,...,white,wicker,wine,winning,wonderful,work,working,world,young,zoo
0,Albany Park,0.042857,0.0,0.014286,0.071429,0.057143,0.057143,0.0,0.014286,0.014286,...,0.014286,0.014286,0.071429,0.0,0.0,0.0,0.0,0.071429,0.014286,0.0
1,Archer Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Armour Square,0.017241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.103448,...,0.034483,0.0,0.0,0.0,0.0,0.0,0.0,0.051724,0.0,0.0
3,Ashburn,0.0,0.0,0.0,0.555556,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0
4,Auburn Gresham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0


# Merge Features

In [73]:
# Join the word features with the quantitative features on the neighbourhood
# key.  The key is named explicitly: without `on`, pd.merge joins on every
# column name the two frames share, so a word feature that happened to be
# called 'num_listings' or 'price_norm' would silently become a join key.
merged = pd.merge(neighborhood_words, neighborhood_quant, on='neighborhood_id')
merged.head()

Unnamed: 0,neighborhood_id,accessible,activities,african,airport,amazing,american,aquarium,architecture,art,...,wine,winning,wonderful,work,working,world,young,zoo,num_listings,price_norm
0,Albany Park,0.042857,0.0,0.014286,0.071429,0.057143,0.057143,0.0,0.014286,0.014286,...,0.071429,0.0,0.0,0.0,0.0,0.071429,0.014286,0.0,94,28.490166
1,Archer Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,50.786111
2,Armour Square,0.017241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.103448,...,0.0,0.0,0.0,0.0,0.0,0.051724,0.0,0.0,97,28.169723
3,Ashburn,0.0,0.0,0.0,0.555556,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,11,20.187879
4,Auburn Gresham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,15,25.796667


In [74]:
# Column labels of the merged feature table.
merged.columns

Index(['neighborhood_id', 'accessible', 'activities', 'african', 'airport',
       'amazing', 'american', 'aquarium', 'architecture', 'art',
       ...
       'wine', 'winning', 'wonderful', 'work', 'working', 'world', 'young',
       'zoo', 'num_listings', 'price_norm'],
      dtype='object', length=290)

In [75]:
# Write the merged per-neighbourhood feature table to CSV without the row index.
# NOTE(review): assumes the cleaned_data/ directory already exists — confirm.
merged.to_csv('cleaned_data/airbnb.csv',index=False)