In [244]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
plt.style.use('seaborn-notebook')
%matplotlib inline

In [245]:
df = pd.read_csv('data/data_zipcode_imported.csv')

In [246]:
# SF business locations file
df_b = pd.read_csv('data/Registered_Business_Locations_-_San_Francisco.csv')

In [247]:
df_b_3cols = df_b[['Street Address', 'Source Zipcode', 'Business Start Date']]

In [248]:
# let's identify '1051 Market St' business transactions in SF business loc
df_b_3cols[df_b_3cols['Street Address'] == '1051 Market St']

Unnamed: 0,Street Address,Source Zipcode,Business Start Date
65775,1051 Market St,94103.0,02/01/2008
135617,1051 Market St,94103.0,12/17/2014
214983,1051 Market St,94103.0,09/01/2013


In [249]:
# Let's get all the addresses
address_lst = df['business_address'].tolist()

In [250]:
len(address_lst)

5229

In [251]:
address_unique_lst = list(set(address_lst))

In [252]:
len(address_unique_lst)

4829

In [253]:
df_otg = df[df['business_address'] == 'Off The Grid']

In [254]:
len(df_otg)

41

In [255]:
# No address or unreal addresses will be assigned with the average startdate.

In [256]:
for address in address_unique_lst:
    df_b_3unique = df_b_3cols[df_b_3cols['Street Address'] == address]
    if len(df_b_3unique) > 0:
        num_turnovers = len(df_b_3unique)
        latest_startdate = pd.to_datetime(max(df_b_3unique['Business Start Date'].values))
        # let's append these info onto df
        idx = df[df['business_address'] == address].index
        df.loc[idx,'number_turnovers'] = num_turnovers
        df.loc[idx,'start_date'] = latest_startdate

In [257]:
df_test = df_b_3cols[df_b_3cols['Street Address'] == '1051 Market St']

In [258]:
df_test

Unnamed: 0,Street Address,Source Zipcode,Business Start Date
65775,1051 Market St,94103.0,02/01/2008
135617,1051 Market St,94103.0,12/17/2014
214983,1051 Market St,94103.0,09/01/2013


In [259]:
df.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,...,inspect_date,p1_3,p4_6,p7_9,p10_12,p13_18,p19_24,p25_36,number_turnovers,start_date
0,0,0,0,1757,Dar Bar Pakistani/Indian Cusine,1412 Polk St,San Francisco,CA,94109,37.789784,...,2017-09-28,0,0,5,0,0,6,5,1.0,2003-02-28
1,1,1,1,4864,DRAGON CITY BAKERY & CAFE,2367 MISSION St,San Francisco,CA,94110,37.759174,...,2016-12-06,0,6,0,0,5,5,3,,NaT


In [260]:
df[df['business_address'] == '1051 Market St']

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,...,inspect_date,p1_3,p4_6,p7_9,p10_12,p13_18,p19_24,p25_36,number_turnovers,start_date
85,85,85,115,81509,The Flying Falafel,1051 Market St,San Francisco,CA,94103,,...,2016-12-05,5,0,0,0,4,0,4,3.0,2014-12-17


### Let's input the average value to missing number_trunovers and start_dates

In [261]:
df[df['number_turnovers'].isnull()].head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,...,inspect_date,p1_3,p4_6,p7_9,p10_12,p13_18,p19_24,p25_36,number_turnovers,start_date
1,1,1,1,4864,DRAGON CITY BAKERY & CAFE,2367 MISSION St,San Francisco,CA,94110,37.759174,...,2016-12-06,0,6,0,0,5,5,3,,NaT
3,3,3,3,73840,L'acajou Bakery and Cafe,498 09th St Ste. C,San Francisco,CA,94103,,...,2017-12-07,0,5,0,0,6,0,0,,NaT


In [262]:
mask_turnovers = df['number_turnovers'].isnull()

In [263]:
# sum(~mask_turnovers) is the number of True in number_turnovers
avg_turnover = sum(df['number_turnovers'][~mask_turnovers])/sum(~mask_turnovers)

In [264]:
df[df['start_date'].isnull()].head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,...,inspect_date,p1_3,p4_6,p7_9,p10_12,p13_18,p19_24,p25_36,number_turnovers,start_date
1,1,1,1,4864,DRAGON CITY BAKERY & CAFE,2367 MISSION St,San Francisco,CA,94110,37.759174,...,2016-12-06,0,6,0,0,5,5,3,,NaT
3,3,3,3,73840,L'acajou Bakery and Cafe,498 09th St Ste. C,San Francisco,CA,94103,,...,2017-12-07,0,5,0,0,6,0,0,,NaT


In [265]:
df.loc[mask_turnovers, 'number_turnovers'] = avg_turnover

In [266]:
df.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,...,inspect_date,p1_3,p4_6,p7_9,p10_12,p13_18,p19_24,p25_36,number_turnovers,start_date
0,0,0,0,1757,Dar Bar Pakistani/Indian Cusine,1412 Polk St,San Francisco,CA,94109,37.789784,...,2017-09-28,0,0,5,0,0,6,5,1.0,2003-02-28
1,1,1,1,4864,DRAGON CITY BAKERY & CAFE,2367 MISSION St,San Francisco,CA,94110,37.759174,...,2016-12-06,0,6,0,0,5,5,3,4.447019,NaT


In [267]:
# convert datetime in start_date to integer to use it for modeling

In [268]:
df.loc[:,'start_date'] = df.loc[:,'start_date'].dt.strftime('%Y%m%d')

In [269]:
df.loc[~mask_turnovers,'start_date'] = df.loc[~mask_turnovers,'start_date'].astype(int)

In [270]:
avg_startdate = sum(df['start_date'][~mask_turnovers].values)/sum(~mask_turnovers)

In [271]:
df.loc[mask_turnovers, 'start_date'] = int(avg_startdate)

In [272]:
df.head(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,...,inspect_date,p1_3,p4_6,p7_9,p10_12,p13_18,p19_24,p25_36,number_turnovers,start_date
0,0,0,0,1757,Dar Bar Pakistani/Indian Cusine,1412 Polk St,San Francisco,CA,94109,37.789784,...,2017-09-28,0,0,5,0,0,6,5,1.0,20030228
1,1,1,1,4864,DRAGON CITY BAKERY & CAFE,2367 MISSION St,San Francisco,CA,94110,37.759174,...,2016-12-06,0,6,0,0,5,5,3,4.447019,20049608
2,2,2,2,79782,Deli 23,2449 23rd St,San Francisco,CA,94110,,...,2016-05-03,4,0,2,0,3,3,2,3.0,20110615
3,3,3,3,73840,L'acajou Bakery and Cafe,498 09th St Ste. C,San Francisco,CA,94103,,...,2017-12-07,0,5,0,0,6,0,0,4.447019,20049608
4,4,4,4,76437,Sweetheart Cafe,909 Grant Ave,San Francisco,CA,94108,,...,2016-03-29,0,11,0,0,0,6,6,2.0,20100401
