# Preprocessing Notebooke

Preprocess each of the following files and places them in the processed folder within the data directory.
- Calendar
- Reviews
- Listings
- Neighborhoods

In [1]:
# Load libraries
import numpy as np
import pandas as pd
import time
import os
import sys
import ast
import json
import datetime as dt
from datetime import datetime
from itertools import chain
from sklearn.preprocessing import MultiLabelBinarizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer

# Unlimited columns
pd.options.display.max_columns = None

## Load Data

In [2]:
# Load calendar data
calendar = pd.read_csv("../data/raw/calendar.csv")

# Load reviews data
review_map = pd.read_csv("../data/raw/reviews_ids.csv")
reviews = pd.read_csv("../data/raw/reviews.csv")

# Load listings data
listings_map = pd.read_csv("../data/raw/listings_ids.csv")
listings = pd.read_csv("../data/raw/listings.csv")

# Load neighborhoods
neighborhoods = pd.read_csv("../data/raw/neighbourhoods.csv")

In [3]:
# Set output path
out_path = "../data/processed/"

In [4]:
# Make processed folder
#if ~os.direxists("../data/processed"):
#    os.mkdir("../data/processed")
try:
    os.mkdir("../data/processed")
except:
    print("Processed directory exists")

Processed directory exists


In [5]:
# Functions
# Description:  Turn into BOW, and keep top n words OHE
def preprocess_text(text):
    """
    
    """
    
    stop_words = list(set(stopwords.words('english')))
    punctuation = string.punctuation
    stop_words += list(punctuation)
    stop_words.extend(['``','’', '`','br','"',"”", "''", "'s", "/b"]) 
    text = text.replace(r'<br />',' ')
    preprocessed = []    
        
    # Tokenization using nltk word tokenization
    tokenized = word_tokenize(text)
    for token in tokenized:
        token = token.lower()
        if token not in stop_words and len(token) > 1:
            preprocessed.append(token)
    return " ".join(preprocessed)

### Calendar

In [6]:
# Set output paths
cal_path = out_path + "calendar.csv"

In [7]:
# Remove dollar signs
calendar[calendar.columns[3]] = calendar[calendar.columns[3]].replace('[\$,]', '', regex=True).astype(float)
calendar[calendar.columns[4]] = calendar[calendar.columns[4]].replace('[\$,]', '', regex=True).astype(float)

In [8]:
# Convert date string to date object
calendar['date'] = pd.to_datetime(calendar.date)

In [9]:
# Convert 't', and 'f' to 1 and 0
calendar['available'] = calendar['available'].replace('t', '1')
calendar['available'] = calendar['available'].replace('f', '0')

# Conver to numeric
calendar['available'] = calendar['available'].apply(pd.to_numeric)

In [10]:
calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2438590 entries, 0 to 2438589
Data columns (total 7 columns):
 #   Column          Dtype         
---  ------          -----         
 0   listing_id      int64         
 1   date            datetime64[ns]
 2   available       int64         
 3   price           float64       
 4   adjusted_price  float64       
 5   minimum_nights  float64       
 6   maximum_nights  float64       
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 130.2 MB


In [11]:
calendar.head()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,130805,2021-04-08,0,97.0,97.0,30.0,90.0
1,12042,2021-04-08,1,65.0,65.0,32.0,90.0
2,958,2021-04-10,0,150.0,150.0,2.0,1125.0
3,958,2021-04-11,0,150.0,150.0,2.0,1125.0
4,958,2021-04-12,1,151.0,151.0,2.0,1125.0


In [12]:
calendar.to_csv(cal_path)

### Reviews

In [13]:
# Set output paths
rev_path = out_path + "reviews.csv"

In [14]:
# Convert date string to date object
reviews['date'] = pd.to_datetime(reviews.date)

In [15]:
# Convert int ids to strings
reviews['listing_id'] = reviews.listing_id.astype(str)
reviews['id'] = reviews.listing_id.astype(str)
reviews['reviewer_id'] = reviews.listing_id.astype(str)

In [16]:
# Turn nan values into empty strings
reviews['comments'] = reviews.comments.replace(np.nan, "")

# preprocess text
reviews['comments'] = reviews.comments.apply(prep , rocess_text)

In [17]:
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,958,958,2009-07-23,958,Edmund C,experience without doubt five star experience ...
1,958,958,2009-08-03,958,Simon,returning san francisco rejuvenating thrill ti...
2,958,958,2009-09-27,958,Denis,pleased accommodations friendly neighborhood a...
3,958,958,2009-11-05,958,Anna,highly recommend accomodation agree previous p...
4,958,958,2010-02-13,958,Venetia,holly place great exactly needed perfect locat...


In [18]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279937 entries, 0 to 279936
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   listing_id     279937 non-null  object        
 1   id             279937 non-null  object        
 2   date           279937 non-null  datetime64[ns]
 3   reviewer_id    279937 non-null  object        
 4   reviewer_name  279937 non-null  object        
 5   comments       279937 non-null  object        
dtypes: datetime64[ns](1), object(5)
memory usage: 12.8+ MB


In [19]:
reviews.to_csv(rev_path)

### Listings

In [20]:
# Set output paths
listings_path = out_path + "listings.csv"

In [21]:
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6682 entries, 0 to 6681
Data columns (total 74 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            6682 non-null   int64  
 1   listing_url                                   6682 non-null   object 
 2   scrape_id                                     6682 non-null   int64  
 3   last_scraped                                  6682 non-null   object 
 4   name                                          6682 non-null   object 
 5   description                                   6609 non-null   object 
 6   neighborhood_overview                         4777 non-null   object 
 7   picture_url                                   6682 non-null   object 
 8   host_id                                       6682 non-null   int64  
 9   host_url                                      6682 non-null   o

#### What to do with columns
- id - keep for merging
- listing_url - drop
- scrape_id - drop
- last_scraped - drop
- name of listing - drop
- description - OHE, keep top n words
- neighborhood_overview - drop
- picture_url - drop
- host_id - keep for merging
- host_url - drop
- host_name - drop
- host_since - convert to date
- host_location - Check if SF, turn into Boolean
- host_about - drop
- host_response_time - Leave as string
- host_response_rate - convert string percentage to float
- host_acceptance_rate - convert string percentage to float
- host_is_superhost - Keep as bool
- host_thumbnail_url - drop
- host_neighborhood - Leave as string
- host_listings_count - Keep as float
- host_total_listings_count - drop
- host_verifications - Unest list, and then OHE
- host_has_profile_pic - Keep as boolean
- host_identity_verified - Keep as boolean
- neighborhood - drop
- neighborhood_cleansed - keep as string
- neighborhood_group_cleaned - drop
- latitude - Keep as float
- longitude - keep as float
- property_type - keep as string
- room_type - keep as string
- accommodates - keep as int
- bathrooms - drop
- bathrooms_text - split into OHE "bathroom_type" for private or shared.  Add "bathroom_num" as - number of bathrooms.  (interaction?  Maybe just OHE this?)
- bedrooms - Keep as float
- beds - Keep as float
- amenities - Unnest and OHE
- price - convert string currency to float
- minimum_nights - Keep as integer
- maximum_nights - Keep as integer
- minimum_minimum_nights - Keep as integer
- maximum_minimum_nights - Keep as integer
- minimum_maximum_nights - Keep as integer
- maximum_maximum_nights - Keep as integer
- minimum_nights_avg_ntm - Keep as integer
- maximum_nights_avg_ntm - Keep as integer
- calendar_updated - drop, empty
- has_availability - keep as boolean
- availability_30 - Keep as integer
- availability_60 - Keep as integer
- availability_90 - Keep as integer
- availability_365 - Keep as integer
- calendar_last_scraped - drop
- number_of_reviews - Keep as integer
- number_of_reviews_ltm - Keep as integer
- number_of_reviews_l30d - Keep as integer
- first_review - Drop, add new feature of days between last and first review
- last_review - Drop, add new feature of days between present and last review
- review_scores_rating - Keep as float
- review_scores_accuracy - Keep as float
- review_scores_cleanliness - Keep as float
- review_scores_checkin - Keep as float
- review_scores_communication - Keep as float
- review_scores_location - Keep as float
- review_scores_value - Keep as float
- license - convert to exists, non-exists bool
- instant_bookable - Keep as bool
- calculated_host_listings_count - Keep as int
- calculated_host_listings_count_entire_homes - Keep as int
- calculated_host_listings_count_private_rooms - Keep as int
- calculated_host_listings_count_shared_rooms - Keep as int
- reviews_per_month - Keep as float

In [65]:
# Load listings data
listings_map = pd.read_csv("../data/raw/listings_ids.csv")
listings = pd.read_csv("../data/raw/listings.csv")

In [66]:
# Drop unneeded columns
drop_cols = ['host_picture_url', 'listing_url', 'scrape_id', 'last_scraped', 'name',
             'neighborhood_overview', 'picture_url', 'host_url', 
             'host_name', 'host_about', 'host_thumbnail_url',
             'neighbourhood', 'bathrooms', 'calendar_updated', 
             'calendar_last_scraped']

listings = listings.drop(columns=drop_cols, axis=0)

In [67]:
# Define date columns
date_cols = ['host_since', 'first_review', 'last_review']

# Cast date columns to date
listings[date_cols] = listings[date_cols].apply(pd.to_datetime)

In [68]:
# Define Integer variables
int_cols = ['host_listings_count', 'accommodates', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
            'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights',
            'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_30',
            'availability_60', 'availability_90', 'availability_365', 'number_of_reviews',
            'number_of_reviews_ltm', 'number_of_reviews_l30d', 'calculated_host_listings_count',
            'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms',
            'calculated_host_listings_count_shared_rooms', 'neighbourhood_group_cleansed']

# cast int columns to integer
#listings[int_cols] = listings[int_cols].apply(pd.to_numeric, args=('integer'))

for col in int_cols:
    listings[col] = pd.to_numeric(listings[col], downcast='integer')

In [69]:
# Define float columns
float_cols = ['latitude', 'longitude', 'bedrooms', 'beds', 'review_scores_rating',
              'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
              'review_scores_communication', 'review_scores_location', 'review_scores_value',
              'reviews_per_month']

# cast float columns to integer
listings[float_cols] = listings[float_cols].apply(pd.to_numeric)

In [70]:
# Define boolean columns
bool_cols = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 
             'has_availability', 'instant_bookable']

# Convert 't', and 'f' to 1 and 0
listings[bool_cols] = listings[bool_cols].replace('t', '1')
listings[bool_cols] = listings[bool_cols].replace('f', '0')

# Conver to numeric
listings[bool_cols] = listings[bool_cols].apply(pd.to_numeric)

In [71]:
# Turn nan values into empty strings
listings['description'] = listings.description.replace(np.nan, "")

# preprocess text
listings['description'] = listings.description.apply(preprocess_text)

# Create count vectorizer object.  Max features can be changed
desc_n = 15
vec = CountVectorizer(binary=False, max_features=desc_n)

# Fit count vectorizer
X_counts = vec.fit_transform(listings.description)

# Create new features
bow_df = pd.DataFrame(X_counts.toarray(), columns=["desc_" + str(col) for col in list(vec.vocabulary_.keys())], index=listings.index)

# Add them onto the original dataframe
listings = listings.join(bow_df)

# Drop description variable
listings.drop(labels=['description'], axis=1, inplace=True)

# host_location: Convert to 1 if in SF, 0 else
city = "San Francisco, California, United States"
listings = listings.assign(host_in_sf = (listings.host_location == city).astype(int))

# Drop host location variable
listings.drop(labels=['host_location'], axis=1, inplace=True)

In [72]:
# host_response_rate and host_acceptance_rate - convert string percentage to float
listings['host_response_rate'] = listings['host_response_rate'].str.rstrip('%').astype('float') / 100.0
listings['host_acceptance_rate'] = listings['host_acceptance_rate'].str.rstrip('%').astype('float') / 100.0

In [73]:
listings.head()

Unnamed: 0,id,host_id,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,desc_apartment,desc_located,desc_space,desc_home,desc_bed,desc_room,desc_kitchen,desc_access,desc_one,desc_private,desc_san,desc_francisco,desc_bathroom,desc_bedroom,desc_living,host_in_sf
0,958,1169,2008-07-31,within an hour,1.0,0.91,1.0,Duboce Triangle,1.0,1.0,"['email', 'phone', 'facebook', 'reviews', 'kba']",1.0,1.0,Western Addition,,37.77028,-122.43317,Entire apartment,Entire home/apt,3,1 bath,1.0,2.0,"[""Keypad"", ""Backyard"", ""Hot water"", ""Long term...",$150.00,2,30,2.0,2.0,1125.0,1125.0,2.0,1125.0,1,4,6,10,112,277,36,1,2009-07-23,2021-04-05,97.0,10.0,10.0,10.0,10.0,10.0,10.0,STR-0001256,0,1,1,0,0,1.94,2,2,0,2,0,0,1,1,0,1,0,0,2,0,1,1
1,5858,8904,NaT,,,,,,,,,,,Bernal Heights,,37.74474,-122.42089,Entire apartment,Entire home/apt,5,1 bath,2.0,3.0,"[""Hangers"", ""Long term stays allowed"", ""Washer...",$195.00,30,60,30.0,30.0,60.0,60.0,30.0,60.0,1,30,60,90,365,111,0,0,2009-05-03,2017-08-06,98.0,10.0,10.0,10.0,10.0,10.0,9.0,,0,1,1,0,0,0.76,1,0,0,0,0,0,0,2,0,0,1,0,1,0,1,0
2,7918,21994,2009-06-17,,,,0.0,Cole Valley,10.0,10.0,"['email', 'phone', 'reviews', 'jumio', 'govern...",1.0,1.0,Haight Ashbury,,37.76555,-122.45213,Private room in apartment,Private room,2,4 shared baths,1.0,1.0,"[""Hangers"", ""Long term stays allowed"", ""Washer...",$56.00,32,60,32.0,32.0,60.0,60.0,32.0,60.0,1,30,60,90,365,19,0,0,2009-08-31,2020-03-06,84.0,7.0,8.0,9.0,9.0,9.0,8.0,,0,8,0,8,0,0.13,0,1,0,1,0,0,0,1,0,1,0,3,6,0,1,1
3,8142,21994,2009-06-17,,,,0.0,Cole Valley,10.0,10.0,"['email', 'phone', 'reviews', 'jumio', 'govern...",1.0,1.0,Haight Ashbury,,37.76555,-122.45213,Private room in apartment,Private room,2,4 shared baths,1.0,1.0,"[""Long term stays allowed"", ""Washer"", ""Host gr...",$56.00,32,90,32.0,32.0,90.0,90.0,32.0,90.0,1,30,60,90,365,8,0,0,2014-09-08,2018-09-12,93.0,9.0,9.0,10.0,10.0,9.0,9.0,,0,8,0,8,0,0.1,0,1,0,1,0,0,0,1,0,1,0,3,6,0,1,1
4,8339,24215,2009-07-02,within a few hours,1.0,0.0,0.0,Alamo Square,2.0,2.0,"['email', 'phone', 'reviews', 'kba']",1.0,1.0,Western Addition,,37.77564,-122.43642,Entire condominium,Entire home/apt,4,1.5 baths,2.0,2.0,"[""Hangers"", ""Dishwasher"", ""Kitchen"", ""Carbon m...",$795.00,7,111,7.0,7.0,111.0,111.0,7.0,111.0,1,29,59,89,364,28,0,0,2009-09-25,2019-06-28,97.0,10.0,10.0,10.0,10.0,10.0,10.0,STR-0000264,0,2,2,0,0,0.2,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1,1


In [74]:
# Host verifications
# Get list of all host verifications
verifications_list = list(listings.host_verifications)
verifications_list = [ast.literal_eval(vers) if vers != 'None' else ['None'] for vers in verifications_list]

# Reassign non-string column
listings['host_verifications'] = verifications_list

# Utilize sklearn.preprocessing.MultiLabelBinarizer for sparse OHE of lists
mlb = MultiLabelBinarizer(sparse_output=True)

listings = listings.join(
           pd.DataFrame.sparse.from_spmatrix(
               mlb.fit_transform(listings.pop('host_verifications')),
               index=listings.index,
               columns="host_verifications_" + mlb.classes_))

In [75]:
# Bathrooms text
# Normalize text
listings.bathrooms_text = listings.bathrooms_text.str.lower()

# Give bathrooms with no number a 1, as is implied
listings.bathrooms_text = listings.bathrooms_text.str.replace("^shared half-bath", "1 shared half-bath", regex=True)
listings.bathrooms_text = listings.bathrooms_text.str.replace("^private half-bath", "1 private half-bath", regex=True)
listings.bathrooms_text = listings.bathrooms_text.str.replace("^half-bath", "1 half-bath", regex=True)

# Create column for private
listings['bathroom_private'] = pd.to_numeric(listings.bathrooms_text.str.contains(r'private'), errors='coerce').astype('Int8')

# Create column for shared
listings['bathroom_shared'] = pd.to_numeric(listings.bathrooms_text.str.contains(r'shared'), errors='coerce').astype('Int8')

# Create column for half baths
listings['bathroom_half'] = pd.to_numeric(listings.bathrooms_text.str.contains(r'half'), errors='coerce').astype('Int8')

# Create column for numeric bathroom count
listings['bathroom_count'] = listings.bathrooms_text.str.split(" ").str[0].astype(float)

# Drop the bathrooms_text feature
listings.drop(labels=['bathrooms_text'], axis=1, inplace=True)

In [76]:
# Amenities
# Get list of all host amenities
amenities_list = list(listings.amenities)
amenities_list = [ast.literal_eval(vers) if vers != 'None' else ['None'] for vers in amenities_list]

# Reassign non-string column
listings['amenities'] = amenities_list

# Utilize sklearn.preprocessing.MultiLabelBinarizer for sparse OHE of lists
mlb = MultiLabelBinarizer(sparse_output=True)

all_amenities = pd.DataFrame.sparse.from_spmatrix(
               mlb.fit_transform(listings.pop('amenities')),
               index=listings.index,
               columns="amenities_" + mlb.classes_)

n_amenities = 30
amenity_cols = list(pd.DataFrame(all_amenities.filter(regex=("^amenities")).sum(), columns=['sum']).sort_values('sum', ascending=False).head(n_amenities).index)

listings = listings.join(
           all_amenities[amenity_cols])

In [77]:
# Format price variables
listings['price'] = listings['price'].replace('[\$,]', '', regex=True).astype(float)

In [78]:
# Deal with review date features
# Convert to date objects
listings['first_review'] = pd.to_datetime(listings.first_review)
listings['last_review'] = pd.to_datetime(listings.last_review)

# Add feature for time span between first and last reviews
listings['review_span'] = (listings.last_review - listings.first_review).dt.days

# Add feature for time since today and last review
today = dt.datetime.today().strftime("%Y-%m-%d")
listings['t_since_last_review'] = (datetime.strptime(today, '%Y-%m-%d') - listings.last_review).dt.days

# Add feature for days as host
listings['t_as_host'] = (datetime.strptime(today, '%Y-%m-%d') - listings.host_since).dt.days

# Drop first and last review columns (First review likely unhelpful since we know when the host started)
listings.drop(['first_review', 'last_review', 'host_since'], axis=1, inplace=True)

In [79]:
# Add new license feature (over 1700 different licenses for 6000 licenses.  Just keeping as an 'if_exists' feature)
listings['has_license'] = (~listings.license.isnull()).astype(int)
listings.drop(['license'], axis=1, inplace=True)

In [80]:
# host_response_time
listings.host_response_time.value_counts()

# Keep as is

within an hour        2991
within a few hours    1054
within a day           677
a few days or more     155
Name: host_response_time, dtype: int64

In [81]:
# Host neighborhood
# Get the host neighbourhood value counts
host_neighbourhoods = pd.DataFrame(listings.host_neighbourhood.value_counts())

# Get host neighbourhoods proportions
host_neighbourhoods_prop = pd.DataFrame(host_neighbourhoods.host_neighbourhood / host_neighbourhoods.host_neighbourhood.sum())

# Get the top neighbourhoods
other_cutoff = 0.02
top_neighbourhoods = list(host_neighbourhoods_prop.query("host_neighbourhood > @other_cutoff").index) + [np.nan]

# Add other bucket
listings['host_neighbourhood'] = [n if n in top_neighbourhoods else "Other" for n in listings.host_neighbourhood]

In [82]:
# neighbourhood cleansed
listings.neighbourhood_cleansed.value_counts()

Downtown/Civic Center    725
Mission                  592
South of Market          518
Western Addition         462
Haight Ashbury           333
Castro/Upper Market      311
Bernal Heights           298
Nob Hill                 286
Noe Valley               258
Outer Sunset             253
North Beach              194
Russian Hill             192
Financial District       189
Inner Richmond           177
Potrero Hill             177
Bayview                  164
Marina                   155
Pacific Heights          145
Chinatown                145
Outer Richmond           145
Excelsior                132
Inner Sunset             127
Outer Mission            122
West of Twin Peaks       101
Parkside                  97
Ocean View                96
Glen Park                 55
Twin Peaks                52
Visitacion Valley         48
Lakeshore                 41
Crocker Amazon            30
Presidio Heights          23
Seacliff                  14
Diamond Heights           12
Presidio      

In [83]:
# Property type
top_properties = list(pd.DataFrame(listings.property_type.value_counts()).query("property_type > 50").T.columns)
listings['property_type'] = [n if n in top_properties else "Other" for n in listings.property_type]

In [84]:
# Room type
listings.room_type.value_counts()

# leave as is

Entire home/apt    4145
Private room       2264
Shared room         178
Hotel room           95
Name: room_type, dtype: int64

In [85]:
# Drop straggler columns
stragglers = ['description', 'bathrooms_text']

In [86]:
listings.head()

Unnamed: 0,id,host_id,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bedrooms,beds,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,desc_apartment,desc_located,desc_space,desc_home,desc_bed,desc_room,desc_kitchen,desc_access,desc_one,desc_private,desc_san,desc_francisco,desc_bathroom,desc_bedroom,desc_living,host_in_sf,host_verifications_None,host_verifications_email,host_verifications_facebook,host_verifications_google,host_verifications_government_id,host_verifications_identity_manual,host_verifications_jumio,host_verifications_kba,host_verifications_manual_offline,host_verifications_manual_online,host_verifications_offline_government_id,host_verifications_phone,host_verifications_reviews,host_verifications_selfie,host_verifications_sent_id,host_verifications_work_email,host_verifications_zhima_selfie,bathroom_private,bathroom_shared,bathroom_half,bathroom_count,amenities_Wifi,amenities_Smoke alarm,amenities_Essentials,amenities_Heating,amenities_Hangers,amenities_Carbon monoxide alarm,amenities_Hair dryer,amenities_Long term stays allowed,amenities_Iron,amenities_TV,amenities_Kitchen,amenities_Shampoo,amenities_Dedicated workspace,amenities_Hot water,amenities_Washer,amenities_Dryer,amenities_Fire extinguisher,amenities_Refrigerator,amenities_Coffee maker,amenities_Microwave,amenities_Dishes and silverware,amenities_Bed linens,amenities_First aid kit,amenities_Cooking basics,amenities_Private entrance,amenities_Oven,amenities_Free street parking,amenities_Stove,amenities_Dishwasher,amenities_Extra pillows and blankets,review_span,t_since_last_review,t_as_host,has_license
0,958,1169,within an hour,1.0,0.91,1.0,Other,1.0,1.0,1.0,1.0,Western Addition,,37.77028,-122.43317,Entire apartment,Entire home/apt,3,1.0,2.0,150.0,2,30,2.0,2.0,1125.0,1125.0,2.0,1125.0,1,4,6,10,112,277,36,1,97.0,10.0,10.0,10.0,10.0,10.0,10.0,0,1,1,0,0,1.94,2,2,0,2,0,0,1,1,0,1,0,0,2,0,1,1,0,1,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,0,0,4274.0,122.0,4753.0,1
1,5858,8904,,,,,,,,,,Bernal Heights,,37.74474,-122.42089,Entire apartment,Entire home/apt,5,2.0,3.0,195.0,30,60,30.0,30.0,60.0,60.0,30.0,60.0,1,30,60,90,365,111,0,0,98.0,10.0,10.0,10.0,10.0,10.0,9.0,0,1,1,0,0,0.76,1,0,0,0,0,0,0,2,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,1,1,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,3017.0,1460.0,,0
2,7918,21994,,,,0.0,Other,10.0,10.0,1.0,1.0,Haight Ashbury,,37.76555,-122.45213,Private room in apartment,Private room,2,1.0,1.0,56.0,32,60,32.0,32.0,60.0,60.0,32.0,60.0,1,30,60,90,365,19,0,0,84.0,7.0,8.0,9.0,9.0,9.0,8.0,0,8,0,8,0,0.13,0,1,0,1,0,0,0,1,0,1,0,3,6,0,1,1,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,4.0,1,1,0,1,1,1,0,1,0,1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,3840.0,517.0,4432.0,0
3,8142,21994,,,,0.0,Other,10.0,10.0,1.0,1.0,Haight Ashbury,,37.76555,-122.45213,Private room in apartment,Private room,2,1.0,1.0,56.0,32,90,32.0,32.0,90.0,90.0,32.0,90.0,1,30,60,90,365,8,0,0,93.0,9.0,9.0,10.0,10.0,9.0,9.0,0,8,0,8,0,0.1,0,1,0,1,0,0,0,1,0,1,0,3,6,0,1,1,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,4.0,1,1,0,1,0,1,0,1,0,1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1465.0,1058.0,4432.0,0
4,8339,24215,within a few hours,1.0,0.0,0.0,Other,2.0,2.0,1.0,1.0,Western Addition,,37.77564,-122.43642,Entire condominium,Entire home/apt,4,2.0,2.0,795.0,7,111,7.0,7.0,111.0,111.0,7.0,111.0,1,29,59,89,364,28,0,0,97.0,10.0,10.0,10.0,10.0,10.0,10.0,0,2,2,0,0,0.2,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1.5,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,3563.0,769.0,4417.0,1


In [87]:
listings.to_csv(listings_path)

### Neighborhoods

In [88]:
# Set output paths
neighbourhoods_path = out_path + "neighbourhoods.csv"
neighborhoods.to_csv(neighbourhoods_path)