In [2]:
import numpy as np
import pandas as pd
import datetime as dt

### Read in Restaurant data pickle

In [33]:
# Load the pickled restaurant DataFrame from the working directory.
restaurants_pickle = './restaurants_df.pkl'
restaurants = pd.read_pickle(restaurants_pickle)

In [34]:
# Inspect column names, non-null counts and dtypes of the freshly loaded table.
restaurants.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3231 entries, 15 to 249954
Data columns (total 10 columns):
DBA Name               3231 non-null object
Street Address         3231 non-null object
Source Zipcode         3231 non-null float64
Business Start Date    3231 non-null datetime64[ns]
Business End Date      782 non-null datetime64[ns]
Years Open             3231 non-null float64
Year Opened            3231 non-null int64
Month Opened           3231 non-null int64
Year Closed            782 non-null float64
Success                3231 non-null int64
dtypes: datetime64[ns](2), float64(3), int64(3), object(2)
memory usage: 277.7+ KB


### Neighborhoods (labels matched to zip codes)

In [35]:
# Load the pickled zip code -> neighborhood lookup table.
neighborhoods_pickle = './zipcode_neighborhood_table.pkl'
neighborhoods = pd.read_pickle(neighborhoods_pickle)

In [36]:
# Show the lookup table: one 'Zip Code' per row with its 'Neighborhood' label.
neighborhoods

Unnamed: 0,Zip Code,Neighborhood
0,94102,Hayes Valley/Tenderloin/North of Market
1,94103,South of Market
2,94107,Potrero Hill
3,94108,Chinatown
4,94109,Polk/Russian Hill (Nob Hill)
5,94110,Inner Mission/Bernal Heights
6,94112,Ingelside-Excelsior/Crocker-Amazon
7,94114,Castro/Noe Valley
8,94115,Western Addition/Japantown
9,94116,Parkside/Forest Hill


In [37]:
# Attach a 'Neighborhood' label to every restaurant by matching its
# 'Source Zipcode' against the lookup table's 'Zip Code'.
# This is an inner join, so restaurants whose zip code is missing from the
# lookup table are dropped -- some data points are lost here.
# TODO: recover the unmatched restaurants in a later iteration.
restaurants = restaurants.merge(
    neighborhoods,
    how='inner',
    left_on=['Source Zipcode'],
    right_on=['Zip Code'],
)

In [38]:
# Re-check the table after the merge: the entry count shows how many restaurants
# survived the zip-code match, and 'Zip Code' / 'Neighborhood' should now be present.
restaurants.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2856 entries, 0 to 2855
Data columns (total 12 columns):
DBA Name               2856 non-null object
Street Address         2856 non-null object
Source Zipcode         2856 non-null float64
Business Start Date    2856 non-null datetime64[ns]
Business End Date      704 non-null datetime64[ns]
Years Open             2856 non-null float64
Year Opened            2856 non-null int64
Month Opened           2856 non-null int64
Year Closed            704 non-null float64
Success                2856 non-null int64
Zip Code               2856 non-null int64
Neighborhood           2856 non-null object
dtypes: datetime64[ns](2), float64(3), int64(4), object(3)
memory usage: 290.1+ KB


# Food Type --> Google scrape

In [39]:
# Preview the first rows of the restaurant table before joining the Google scrape.
restaurants.head()

Unnamed: 0,DBA Name,Street Address,Source Zipcode,Business Start Date,Business End Date,Years Open,Year Opened,Month Opened,Year Closed,Success,Zip Code,Neighborhood
0,Neem,2948 Folsom St,94110.0,2016-03-29,2017-05-15,1.128767,2016,3,2017.0,0,94110,Inner Mission/Bernal Heights
1,Our Poke Place,3515 20th St Unit B,94110.0,2017-04-20,2019-09-01,2.367123,2017,4,2019.0,0,94110,Inner Mission/Bernal Heights
2,Ox Cafe,798 Van Ness Ave,94110.0,2014-04-10,NaT,5.671233,2014,4,,1,94110,Inner Mission/Bernal Heights
3,Volcano Kimchi,2948 Folsom St,94110.0,2014-05-20,NaT,5.561644,2014,5,,1,94110,Inner Mission/Bernal Heights
4,Rasoi,2948 Folsom St,94110.0,2014-05-15,2018-03-31,3.879452,2014,5,2018.0,0,94110,Inner Mission/Bernal Heights


In [40]:
# Load the pickled Google scrape results (rating, price and food type columns).
google_scrape_pickle = './restaurant_scrape.pkl'
google_info = pd.read_pickle(google_scrape_pickle)

In [41]:
# Preview only the first rows: rendering the whole scrape table as cell output
# bloats the notebook without adding information.
google_info.head(10)

Unnamed: 0,Business Account Number,DBA Name,Street Address,Source Zipcode,Currently Open,Years Open,Year Opened,Month Opened,Year Closed,Success,Zip Code,Neighborhood,ZHVI_AllHomes,Year,Month,google_results,Rating,Price,Food
3340,1002775,Novy,4000 24 Th St,94114,1.0,4.265753,2015,2,,1,94114.0,Castro/Noe Valley,1463000.0,2015.0,2.0,"(4.2, $$, Greek restaurant)",4.2,2,Greek restaurant
3535,1007372,Liholiho Yacht Club,871 Sutter St,94109,1.0,4.347945,2015,1,,1,94109.0,Polk/Russian Hill (Nob Hill),985900.0,2015.0,1.0,"(4.6, $$$, Californian restaurant)",4.6,3,Californian restaurant
3537,1007392,Hillside Supper Club,300 Precita Ave,94110,1.0,4.347945,2015,1,,1,94110.0,Inner Mission/Bernal Heights,1140000.0,2015.0,1.0,"(4.4, $$, New American restaurant)",4.4,2,New American restaurant
3540,1007473,The Pin Up,772 Folsom St,94108,1.0,4.350685,2015,1,,1,94108.0,Chinatown,852600.0,2015.0,1.0,"(4.0, Diner, 0)",4.0,5,0
3541,1007479,Belle Cora,565 Green St,94133,1.0,4.336986,2015,1,,1,94133.0,North Beach/Chinatown,1189000.0,2015.0,1.0,"(4.6, $$, Bar)",4.6,2,Bar
3542,1007507,Red Tavern Inc,2229 Clement St,94121,1.0,4.336986,2015,1,,1,94121.0,Outer Richmond,1144700.0,2015.0,1.0,"(4.5, $$, Russian restaurant)",4.5,2,Russian restaurant
3544,1007532,Spaghetti Bros.,3213 Scott St,94123,1.0,4.334247,2015,1,,1,94123.0,Marina,1999000.0,2015.0,1.0,"(4.5, $$, New American restaurant)",4.5,2,New American restaurant
3545,1007532,Spaghetti Brothers,3213 Scott St,94123,1.0,4.334247,2015,1,,1,94123.0,Marina,1999000.0,2015.0,1.0,"(4.5, $$, New American restaurant)",4.5,2,New American restaurant
3546,1007538,Myriad,2491 Mission St,94110,1.0,4.334247,2015,1,,1,94110.0,Inner Mission/Bernal Heights,1140000.0,2015.0,1.0,"(4.4, $$, Gastropub)",4.4,2,Gastropub
3549,1007558,Vietnam House,642 Eddy St,94109,1.0,4.331507,2015,1,,1,94109.0,Polk/Russian Hill (Nob Hill),985900.0,2015.0,1.0,"(4.2, $, Vietnamese restaurant)",4.2,1,Vietnamese restaurant


In [42]:
# Join the Google scrape onto the restaurant rows by business name only.
# Columns present in both frames come back with _x (restaurants) and
# _y (google_info) suffixes.
# NOTE(review): 'DBA Name' is the sole join key, so a name that appears more than
# once on either side yields one output row per matching pair -- check for
# duplicated rows after this step.
restaurants = restaurants.merge(
    google_info,
    how='inner',
    on=['DBA Name'],
)

In [43]:
# Inspect the merged table: check the row count and the _x/_y column pairs
# created by the overlapping column names.
restaurants.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1774 entries, 0 to 1773
Data columns (total 30 columns):
DBA Name                   1774 non-null object
Street Address_x           1774 non-null object
Source Zipcode_x           1774 non-null float64
Business Start Date        1774 non-null datetime64[ns]
Business End Date          398 non-null datetime64[ns]
Years Open_x               1774 non-null float64
Year Opened_x              1774 non-null int64
Month Opened_x             1774 non-null int64
Year Closed_x              398 non-null float64
Success_x                  1774 non-null int64
Zip Code_x                 1774 non-null int64
Neighborhood_x             1774 non-null object
Business Account Number    1774 non-null int64
Street Address_y           1774 non-null object
Source Zipcode_y           1774 non-null int64
Currently Open             1774 non-null float64
Years Open_y               1774 non-null float64
Year Opened_y              1774 non-null int64
Month Opened_y   

In [21]:
# Definitely duplicates now

In [44]:
# Drop redundant columns contributed by the google_info side of the merge:
# the *_y copies of fields the restaurant rows already carry, plus the extra
# google_info-only fields listed below.
redundant_columns = [
    'Business Account Number', 'Street Address_y',
    'Source Zipcode_y', 'Currently Open',
    'Years Open_y', 'Year Opened_y',
    'Month Opened_y', 'Year Closed_y',
    'Success_y', 'Zip Code_y',
    'Neighborhood_y', 'ZHVI_AllHomes',
    'Year', 'Month',
]

# Rebind instead of mutating in place, and skip columns that are already gone,
# so re-running this cell does not raise KeyError.
restaurants = restaurants.drop(columns=redundant_columns, errors='ignore')

In [45]:
# Confirm the drop: only the restaurant-side columns plus
# 'google_results', 'Rating', 'Price' and 'Food' should remain.
restaurants.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1774 entries, 0 to 1773
Data columns (total 16 columns):
DBA Name               1774 non-null object
Street Address_x       1774 non-null object
Source Zipcode_x       1774 non-null float64
Business Start Date    1774 non-null datetime64[ns]
Business End Date      398 non-null datetime64[ns]
Years Open_x           1774 non-null float64
Year Opened_x          1774 non-null int64
Month Opened_x         1774 non-null int64
Year Closed_x          398 non-null float64
Success_x              1774 non-null int64
Zip Code_x             1774 non-null int64
Neighborhood_x         1774 non-null object
google_results         1774 non-null object
Rating                 1774 non-null object
Price                  1774 non-null int64
Food                   1774 non-null object
dtypes: datetime64[ns](2), float64(3), int64(5), object(6)
memory usage: 235.6+ KB


## Walk score of location