In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases no longer ship the old names, so use whichever name this
# installation provides instead of failing on a fresh kernel.
_notebook_style = (
    'seaborn-v0_8-notebook'
    if 'seaborn-v0_8-notebook' in plt.style.available
    else 'seaborn-notebook'
)
plt.style.use(_notebook_style)
%matplotlib inline

In [40]:
# Load the model-ready inspection data set.
df = pd.read_csv(filepath_or_buffer='data/ready4model_v2.csv')

In [41]:
# Number of rows in df that have no postal code.
pd.isnull(df['business_postal_code']).sum()

159

In [3]:
# Peek at the first row to see the available columns.
df.iloc[:1]

Unnamed: 0.1,Unnamed: 0,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,business_longitude,business_location,...,short_inspect_date,short_violation_id,inspect_date,p1_3,p4_6,p7_9,p10_12,p13_18,p19_24,p25_36
0,0,1757,Dar Bar Pakistani/Indian Cusine,1412 Polk St,San Francisco,CA,94109,37.789784,-122.420455,"(37.789784, -122.420455)",...,20170928,103131,2017-09-28,0,0,5,0,0,6,5


In [4]:
# Load the SF registered business locations file.
df_b = pd.read_csv(
    filepath_or_buffer='data/Registered_Business_Locations_-_San_Francisco.csv'
)

In [5]:
# List the columns available in the business locations file.
df_b.columns

Index(['Location Id', 'Business Account Number', 'Ownership Name', 'DBA Name',
       'Street Address', 'City', 'State', 'Source Zipcode',
       'Business Start Date', 'Business End Date', 'Location Start Date',
       'Location End Date', 'Mail Address', 'Mail City', 'Mail Zipcode',
       'Mail State', 'NAICS Code', 'NAICS Code Description', 'Parking Tax',
       'Transient Occupancy Tax', 'LIC Code', 'LIC Code Description',
       'Supervisor District', 'Neighborhoods - Analysis Boundaries',
       'Business Corridor', 'Business Location'],
      dtype='object')

In [6]:
# 'Source Zipcode' is the zip code of the business location itself;
# 'Mail Zipcode' belongs to the mailing address and must not be used here.
df_b.loc[:, ['Street Address', 'Source Zipcode', 'Business Start Date']].iloc[:2]

Unnamed: 0,Street Address,Source Zipcode,Business Start Date
0,550 Gene Friend Way,94158.0,12/01/2012
1,865 Market St #9001,94103.0,12/01/2012


# Identify and import zip codes from the SF business locations file
Match on street address rather than business name, because some business names are unreliable (for example, a business named "1702").

In [7]:
# Keep only the three registry columns needed for the zip code lookup.
df_b_3cols = df_b.loc[:, ['Street Address', 'Source Zipcode', 'Business Start Date']]
df_b_3cols.iloc[:1]

Unnamed: 0,Street Address,Source Zipcode,Business Start Date
0,550 Gene Friend Way,94158.0,12/01/2012


## Let's identify the rows in df that are missing a zip code

In [8]:
# Narrow df down to the postal code, address and name columns.
df2 = df.loc[:, ['business_postal_code', 'business_address', 'business_name']]

In [9]:
# Show the first row of the narrowed frame.
df2.iloc[:1]

Unnamed: 0,business_postal_code,business_address,business_name
0,94109,1412 Polk St,Dar Bar Pakistani/Indian Cusine


In [10]:
# Rows whose postal code is missing.
df_nozip = df2.loc[pd.isnull(df2['business_postal_code'])]
df_nozip.iloc[:2]

Unnamed: 0,business_postal_code,business_address,business_name
11,,Off The Grid,Don Pablo
21,,Golden Gate Park,Sam's Chowder Mobile


In [11]:
# Addresses of the rows without a postal code, as a plain Python list.
list_nozip_address = df_nozip.loc[:, 'business_address'].tolist()

In [12]:
# First ten addresses that have no postal code.
list_nozip_address[0:10]

['Off The Grid',
 'Golden Gate Park',
 '1051 Market St',
 '510 Stevenson St',
 '3331 24th St',
 'Off The Grid',
 '6134 Geary Blvd',
 '3055 23rd St',
 '79 Sansome St',
 'Off the Grid']

In [13]:
# Let's look up the zip code for '1051 Market St' in the SF business locations file

In [14]:
# All registry rows for this address; the business with the latest start date
# is presumably the one currently in operation there.
df_b_3cols.loc[df_b_3cols['Street Address'].eq('1051 Market St')]

Unnamed: 0,Street Address,Source Zipcode,Business Start Date
65775,1051 Market St,94103.0,02/01/2008
135617,1051 Market St,94103.0,12/17/2014
214983,1051 Market St,94103.0,09/01/2013


In [15]:
# Let's get the street addresses and zip codes for every address that is missing a zip code

In [16]:
# Registry rows whose street address is one of the addresses missing a zip code.
df_b_matching_ones = df_b_3cols.loc[
    df_b_3cols['Street Address'].isin(list_nozip_address)
]
df_b_matching_ones.iloc[:3]

Unnamed: 0,Street Address,Source Zipcode,Business Start Date
322,1355 Market St,94103.0,12/31/2015
2978,236 Townsend St,94107.0,10/28/2013
4553,428 11th St,94103.0,01/26/2017


In [17]:
# Seven most frequent addresses among the rows without a postal code.
df_nozip['business_address'].value_counts().head(7)

Off The Grid                                 37
Off the Grid                                 10
Approved Locations                            4
Approved Private Locations                    3
Justin Herman Plaza                           2
201 2nd St                                    1
Front, between California & Sacramento St     1
Name: business_address, dtype: int64

In [18]:
# Summarise how many missing zip codes could be recovered at all.
# Rows whose address is the placeholder "Off The Grid" (in any capitalisation)
# carry no street address, so they can never be matched against the registry.
# The counts are computed from df_nozip rather than typed in by hand, so the
# message stays correct if the input data changes.
aaa = len(df_nozip['business_address'])  # total rows without a zip code
off_the_grid_count = int(
    df_nozip['business_address'].str.lower().eq('off the grid').sum()
)
print('There are {} businesses with no zip codes in SF inspection.'.format(aaa))
print('There are {} addresses with "Off The Grid" in SF inspection'.format(off_the_grid_count))
print('So we can only save {} zip codes.'.format(aaa - off_the_grid_count))

There are 159 businesses with no zip codes in SF inspection.
There are 47 addresses with "Off The Grid" in SF inspection
So we can only save 112 zip codes.


In [19]:
# Read each address from list_nozip_address and look through df_b_matching_ones for matching addresses.
# When an address matches, grab its zip code and write it into df_update_zipcode.

In [20]:
# Work on an independent copy so that filling in zip codes never touches df.
# (df[:] is only a slice of df: writes through it can leak back into df or
# raise SettingWithCopyWarning.)
df_update_zipcode = df.copy()

In [21]:
# First three rows of the working copy.
df_update_zipcode.iloc[:3]

Unnamed: 0.1,Unnamed: 0,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,business_longitude,business_location,...,short_inspect_date,short_violation_id,inspect_date,p1_3,p4_6,p7_9,p10_12,p13_18,p19_24,p25_36
0,0,1757,Dar Bar Pakistani/Indian Cusine,1412 Polk St,San Francisco,CA,94109,37.789784,-122.420455,"(37.789784, -122.420455)",...,20170928,103131,2017-09-28,0,0,5,0,0,6,5
1,1,4864,DRAGON CITY BAKERY & CAFE,2367 MISSION St,San Francisco,CA,94110,37.759174,-122.419066,"(37.759174, -122.419066)",...,20161206,103157,2016-12-06,0,6,0,0,5,5,3
2,2,79782,Deli 23,2449 23rd St,San Francisco,CA,94110,,,,...,20160503,103120,2016-05-03,4,0,2,0,3,3,2


In [22]:
# Index labels of the rows for one known address, then their current postal code.
idx = df_update_zipcode.index[
    (df_update_zipcode['business_address'] == '2367 MISSION St').values
]
df_update_zipcode.loc[idx, 'business_postal_code']

1    94110
Name: business_postal_code, dtype: object

In [34]:
# Fill in missing postal codes from the business registry, matching on address.
#
# Build one address -> zip lookup instead of scanning the registry once per
# address. Registry rows without a zip are dropped first so an empty value can
# never win; when an address appears several times, the last row in file order
# is used (the same tie-break as the row-by-row loop this replaces).
zip_by_address = (
    df_b_matching_ones
    .dropna(subset=['Source Zipcode'])
    .drop_duplicates(subset='Street Address', keep='last')
    .set_index('Street Address')['Source Zipcode']
)

# Only touch rows that are actually missing a postal code, so an existing
# postal code is never overwritten just because another row shares its address.
missing_zip = df_update_zipcode['business_postal_code'].isnull()
df_update_zipcode.loc[missing_zip, 'business_postal_code'] = (
    df_update_zipcode.loc[missing_zip, 'business_address'].map(zip_by_address)
)
          

In [42]:
# Rows still missing a postal code after the registry lookup.
pd.isnull(df_update_zipcode['business_postal_code']).sum()

59

In [43]:
# Rows missing a postal code in the original frame, for comparison.
pd.isnull(df['business_postal_code']).sum()

159

## Let's clean up df_update_zipcode; change zip code to integer

In [50]:
# First two rows of the updated frame.
df_update_zipcode.iloc[:2]

Unnamed: 0.1,Unnamed: 0,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,business_longitude,business_location,...,short_inspect_date,short_violation_id,inspect_date,p1_3,p4_6,p7_9,p10_12,p13_18,p19_24,p25_36
0,0,1757,Dar Bar Pakistani/Indian Cusine,1412 Polk St,San Francisco,CA,94109,37.789784,-122.420455,"(37.789784, -122.420455)",...,20170928,103131,2017-09-28,0,0,5,0,0,6,5
1,1,4864,DRAGON CITY BAKERY & CAFE,2367 MISSION St,San Francisco,CA,94110,37.759174,-122.419066,"(37.759174, -122.419066)",...,20161206,103157,2016-12-06,0,6,0,0,5,5,3


In [55]:
# Boolean mask: True where the postal code is still missing.
m520 = pd.isnull(df_update_zipcode['business_postal_code'])

In [70]:
# Postal codes of the rows that do have a value.
s522 = df_update_zipcode.loc[~m520, 'business_postal_code']

In [95]:
# Walk the non-null postal codes. Floats and strings longer than four
# characters are passed through int(); anything else is printed so the odd
# values can be inspected.
for postal_code in s522:
    convertible = isinstance(postal_code, float) or (
        isinstance(postal_code, str) and len(postal_code) > 4
    )
    if not convertible:
        print(postal_code)
        continue
    num = int(postal_code)

Ca
941
0
CA


In [88]:
# Scratch check: int() accepts a digits-only string such as '109'.
if isinstance('109', str):
    num = int('109')