### Import

In [2]:
import os
import sys
import pandas as pd
import numpy as np
import re
import shapefile
from tqdm import tqdm

### Load zipcode files

In [3]:
# One zipcode shapefile per borough; the order here fixes the order of `zips`.
shapefile_paths = (
    '../data/shapefiles/nyc_zipcodes/nyc_zipcodes_manhattan.shp',
    '../data/shapefiles/nyc_zipcodes/nyc_zipcodes_thebronx.shp',
    '../data/shapefiles/nyc_zipcodes/nyc_zipcodes_brooklyn.shp',
    '../data/shapefiles/nyc_zipcodes/nyc_zipcodes_queens.shp',
    '../data/shapefiles/nyc_zipcodes/nyc_zipcodes_statenisland.shp',
)

# Open all readers first, then pull the shape records from each of them.
readers = [shapefile.Reader(path) for path in shapefile_paths]
sf1, sf2, sf3, sf4, sf5 = readers  # individual readers stay available by name
zips = [reader.shapeRecords() for reader in readers]

### Borough Codes

In [4]:
# Borough name -> numeric borough code (1..5).
B = dict(
    manhattan=1,
    thebronx=2,
    brooklyn=3,
    queens=4,
    statenisland=5,
)

### Extract Zipcodes

In [5]:
# Z maps the 1-based position of each entry of `zips` to the first record
# field of every shape in that entry (presumably the zipcode -- verify
# against the shapefile schema).
Z = {}
for code, shape_records in enumerate(zips, start=1):
    Z[code] = [entry.record[0] for entry in shape_records]

### Conversion API

In [6]:
class Converter:
    
    #----------------------------------------
    def __init__(self):
        
        #Load
        self.streets = pd.read_csv('../data/tables/nyc_streets_cleaned.csv')
        
        #Restrict
        self.streets = self.streets[['l_high_hn','l_low_hn','l_zip',\
                                     'r_high_hn','r_low_hn',\
                                     'r_zip','st_label']]

    #----------------------------------------
    def normalizeAddress(self,address):
        """Split a raw address into house number and street label.

        The address is upper-cased, trailing unit/suite appendices are cut
        off, a fixed list of spelling substitutions is applied, and then an
        ordered sequence of rules derives the street name. Rules are applied
        in order and a later matching rule overrides an earlier one.

        Parameters
        ----------
        address : str
            Raw address whose first space-separated token is the house number.

        Returns
        -------
        tuple
            ``(number, streetname, lat, lng, address)``. ``number`` and
            ``streetname`` are ``'0'`` when no rule matched; ``number`` is an
            int for the hard-coded landmark rules and a string otherwise.
            ``lat``/``lng`` are ``0`` unless a landmark rule supplied
            coordinates. ``address`` is the cleaned, upper-cased address.
        """

        #Init
        number = '0'
        streetname = '0'
        lat = 0
        lng = 0

        #Helpers
        def strip_ordinals(text,suffixes):
            #Remove ordinal suffixes in the given order
            for suffix in suffixes:
                text = text.replace(suffix,'')
            return text

        def through(tokens,index,directional):
            #Tokens after the house number up to and including tokens[index];
            #one more token is taken when the address ends in a compass letter
            end = index + 1
            if directional and tokens[-1] in ('E','W','S','N'):
                end = index + 2
            return ' '.join(tokens[1:end])

        #Uppercase
        address = address.upper()

        #Remove Appendices
        #Cut only where the marker is really present: a missing marker used to
        #yield position -1 and chop the last character (e.g. 'COMMUNITY DR'
        #contains 'UNIT'). Word markers must not continue into a longer word,
        #so 'UNITED NATIONS PLZ' is left intact.
        appendix_patterns = (r' #',r' FRNT(?![A-Z])',r' LBBY(?![A-Z])',
                             r' UNIT(?![A-Z])',r' STE ')
        for pattern in appendix_patterns:
            match = re.search(pattern,address)
            if match:
                address = address[:match.start()]

        #Clean some renamed avenues (applied in order)
        #The two later 'SAINT ...' entries cannot match once 'SAINT ' has been
        #rewritten by the first entry; they are kept as harmless no-ops.
        substitutions = (
            ('SAINT ','ST '),
            ('FDR DR','FRANKLIN D ROOSEVELT DR'),
            ('SAINT JOHNS LN','ST JOHNS LN'),
            ('LAGUARDIA PL','LA GUARDIA PL'),
            ('W MOUNT EDEN AVE','W MT EDEN AVE'),
            ('E MOUNT EDEN AVE','E MT EDEN AVE'),
            ('JOHN F KENNEDY','JFK'),
            ('63RD DR','63 RD'),
            ('FASHION','7'),
            ('PARK AVENUE','PARK AVE'),
            ('JAMAICA AVENUE','JAMAICA AVE'),
            ('CROSSBAY','CROSS BAY'),
            ('SAINT MARKS AVE','ST MARKS AVE'),
            ('WESTCHESTER SQ','WESTCHESTER AVE'),
            ('BEACH CHANNEL DR','BCH CHANNEL DR'),
            ('FORT HAMILTON PKWY','FT HAMILTON PKWY'),
            ('GRAND CONCOURSE','GRAND CONC'),
        )
        for old,new in substitutions:
            address = address.replace(old,new)

        #Split and correct
        elements = address.split(' ')

        #Avenues: keep a trailing one-letter token, drop ordinal suffixes of
        #numbered avenues
        if 'AVE' in elements:
            index = elements.index('AVE')
            end = index + 2 if len(elements[-1]) == 1 else index + 1
            streetname = ' '.join(elements[1:end])
            if re.search(r'[0-9]',elements[1]):
                streetname = strip_ordinals(streetname,('TH','ST','RD','ND'))
            number = elements[0]

        #Grand Concourse
        if 'CONC' in elements and 'GRAND' in elements:
            index = elements.index('GRAND')
            streetname = ' '.join(elements[1:index+2])
            number = elements[0]

        #Beach streets other than Brighton / Rockaway
        if ' BEACH ' in address and 'BRIGHTON' not in address and 'ROCKAWAY' not in address:
            streetname = through(elements,elements.index('BEACH'),False)
            number = elements[0]

        #Street-type rules: (required phrase or None, anchor token, directional)
        #Order matters because a later match overrides an earlier one.
        suffix_rules = (
            (None,'RDWY',False),
            (None,'TPKE',False),
            (None,'BLVD',False),
            (None,'EXPY',False),
            (None,'CIR',False),
            (None,'PKWY',False),
            (None,'PLZ',True),
            (None,'PL',False),
            (None,'LOOP',True),
            (None,'SQ',True),
            (None,'CTR',False),
            (None,'HWY',False),
            (None,'DR',False),
            (None,'TER',False),
            (None,'BOARDWALK',True),
            ('CENTRAL PARK','PARK',True),
            ('PARK AVE','AVE',True),
            (None,'RD',True),
            (None,'LN',False),
        )
        for phrase,anchor,directional in suffix_rules:
            #Requiring the anchor as a token avoids a ValueError from index()
            #when the phrase only matches inside a longer word
            if anchor in elements and (phrase is None or phrase in address):
                streetname = through(elements,elements.index(anchor),directional)
                number = elements[0]

        #Streets whose label is simply everything after the house number
        plain_avenue = 'AVENUE ' in address and 'AVENUE OF' not in address
        if plain_avenue or any(phrase in address for phrase in
                               ('ST JAMES PL','BRIGHTON BEACH AVENUE','PARK ROW')):
            number = elements[0]
            streetname = ' '.join(elements[1:])

        #'AVENUE OF ...' is abbreviated to 'AVE OF ...' (this changes the returned address)
        if 'AVENUE OF' in address:
            address = address.replace('AVENUE','AVE')
            elements = address.split(' ')
            number = elements[0]
            streetname = ' '.join(elements[1:])

        #Streets: drop the ordinal suffix of a numbered street
        if ' ST' in address:
            parts = address.split(' ')
            number = parts[0]
            slot = {4:2,3:1}.get(len(parts))
            #Only tokens containing a digit are ordinals; without this check
            #names such as 'STANTON' lost their 'ST'
            if slot is not None and re.search(r'[0-9]',parts[slot]):
                parts[slot] = strip_ordinals(parts[slot],('TH','ND','RD','ST'))
            streetname = ' '.join(parts[1:])

        #Named streets taken verbatim
        if any(phrase in address for phrase in
               ('W BROADWAY','E HOUSTON','W HOUSTON','E BROADWAY',
                'CASTLE HILL AVENUE','BOWERY')):
            number = elements[0]
            streetname = ' '.join(elements[1:])

        #Plain Broadway: only the token following the house number
        if 'BROADWAY' in address and 'W BROADWAY' not in address and 'E BROADWAY' not in address:
            number = elements[0]
            streetname = elements[1]

        #More verbatim streets ('EXPWY' used to yield a list instead of a string)
        if any(phrase in address for phrase in ('EXPWY','CO-OP CITY BLVD','ST JOHNS LN')):
            number = elements[0]
            streetname = ' '.join(elements[1:])

        #Landmarks with hard-coded coordinates
        if 'GRAND CENTRAL TERMINAL' in address:
            number = elements[0]
            streetname = elements[1]
            lat = 40.7527262
            lng = -73.9794181
        if '1 PENN PLZ' in address:
            #Every field set here is overridden by the 'PENN PL' rule below,
            #because '1 PENN PLZ' always contains 'PENN PL'
            number = elements[0]
            streetname = ' '.join(elements[1:])
            lat = 40.7513773
            lng = -73.9946771
        if 'ROCKEFELLER PL' in address:
            number = 30
            streetname = elements[1]
            lat = 40.759307
            lng = -73.9821456

        #(phrase, number, streetname, lat, lng)
        landmarks = (
            ('JFK',1,'JFK INTERNATIONAL AIRPORT',40.6290576,-73.7865234),
            ('PENN PL',1,'PENN PLAZA',40.7501172,-73.994768),
            ('1 WORLD TRADE CTR',1,'WORLD TRADE CTR',40.7129947,-74.0153496),
            ('1 BATTERY PARK PL',1,'BATTERY PARK PL',40.7015621,-74.0216759),
            ('METROTECH CTR',6,'METROTECH CTR',40.6942793,-73.9886868),
            ('KINGS PL',5100,'KINGS PL',40.6098308,-73.9217725),
        )
        for phrase,fixed_number,fixed_name,fixed_lat,fixed_lng in landmarks:
            if phrase in address:
                number = fixed_number
                streetname = fixed_name
                lat = fixed_lat
                lng = fixed_lng

        #Return
        return number,streetname,lat,lng,address
        
    #----------------------------------------
    def convert(self,datapoint):
        
        #Find all segments 
        streetindices = []
        zipcode = datapoint.zipcode
        city = datapoint.city
        number,streetname,lat,lng,address = self.normalizeAddress(datapoint.address)
        st_labels = self.streets['st_label'].values
        for s,st_label in enumerate(st_labels):
            if st_label == streetname:
                streetindices.append(s)
             
        #Return
        mytuple = (number,streetname,address,lat,lng,city,str(int(zipcode)),streetindices)
        return mytuple
    
    #----------------------------------------
    def findSegment(self,mytuple,Z,verbose):

        #Unpack
        dp = []
        number,streetname,address,lat,lng,city,zipcode,streetindices = mytuple
        subset = self.streets.iloc[streetindices]
        lzip = subset['l_zip'].values.astype(int).astype(str)
        rzip = subset['r_zip'].values.astype(int).astype(str)
        refindices = []
        for i in range(lzip.shape[0]):
            if (zipcode == lzip[i] or zipcode == rzip[i]):
                refindices.append(i)
        subset_res = subset.iloc[refindices]
        subsetsize = subset_res.shape[0]
        index = -1

        #If Zipcode not found
        if (subset_res.shape[0] == 0 and lat == 0):
            index = -2
            print(streetname,address,city)
            print(zipcode)
            print(np.unique(lzip))
            print(np.unique(rzip))
            
        #Else
        else:

            #Get High and Lows
            number = str(number).replace('-0','')
            try:
                number = int(number)
            except:
                number = 0

            #Get Highs and lows
            l_high_hn = subset_res['l_high_hn'].values
            l_low_hn = subset_res['l_low_hn'].values
            r_high_hn = subset_res['r_high_hn'].values
            r_low_hn = subset_res['r_low_hn'].values

            if (verbose == True):print('------------')
            
            #Loop through pairs to find correct line segment
            for i in range(len(l_high_hn)):
                lvalues = []
                rvalues = []
                if (verbose == True):print(' ')                    
                if (verbose == True):print(l_high_hn[i])
                if (verbose == True):print(l_low_hn[i])
                if (verbose == True):print(r_high_hn[i])
                if (verbose == True):print(r_low_hn[i])
                if (str(l_high_hn[i]) != 'nan'):
                    try:
                        val = l_high_hn[i]
                        if '-' in val:
                            val = int(str(number).replace('-0',''))
#                             elements = val.split('-')
#                             val = int(str(int(elements[0])) + str(int(elements[1])))
                        else:
                            val = int(val)
                        lvalues.append(val)
                    except:
                        pass
                if (str(l_low_hn[i]) != 'nan'):
                    try:
                        val = l_low_hn[i]
                        if '-' in val:
                            val = int(str(number).replace('-0',''))
#                             elements = val.split('-')
#                             val = int(str(int(elements[0])) + str(int(elements[1])))
                        else:
                            val = int(val)
                        lvalues.append(val)
                    except:
                        pass
                if (str(r_high_hn[i]) != 'nan'):
                    try:
                        val = r_high_hn[i]
                        if '-' in val:
                            val = int(str(number).replace('-0',''))
#                             elements = val.split('-')
#                             val = int(str(int(elements[0])) + str(int(elements[1])))
                        else:
                            val = int(val)
                        rvalues.append(val)
                    except:
                        pass
                if (str(r_low_hn[i]) != 'nan'):
                    try:
                        val = r_low_hn[i]
                        if '-' in val:
                            val = int(str(number).replace('-0',''))
#                             elements = val.split('-')
#                             val = int(str(int(elements[0])) + str(int(elements[1])))
                        else:
                            val = int(val)
                        rvalues.append(val)
                    except:
                        pass

                #If we have a hit
                if (verbose == True):print(lvalues)
                if (verbose == True):print(rvalues)
                values = []
                if (len(lvalues) == len(rvalues)-1):
                    lvalues.append(rvalues[-1]-1)
                if (len(rvalues) == len(lvalues)-1):
                    rvalues.append(lvalues[-1]-1)

                values = lvalues + rvalues
                values = np.array(values)
    
                if (verbose == True):print('--')
                ind1 = np.where(np.array(lvalues) == 0)[0]
                ind2 = np.where(np.array(rvalues) == 0)[0]
                if (len(ind1) == 1):
                    if (verbose == True):print('left contains one zero')
                if (len(ind2) == 1):
                    if (verbose == True):print('right contains one zero')
                if (len(ind1) == 2):
                    if (verbose == True):print('left contains two zeros')
                    indices = np.where(values > 0)[0]
                    values = values[indices].tolist()
                if (len(ind2) == 2):
                    if (verbose == True):print('right contains two zeros')
                    indices = np.where(values > 0)[0]
                    values = values[indices].tolist()
                if (verbose == True):print(values)
                if (len(values) > 0):
                    minval = np.min(values)
                    maxval = np.max(values)
                    if (verbose == True):print(minval,maxval)
                    if (verbose == True):print(number)                        
                    if (number >= minval and number <= maxval):
                        if (verbose == True):('found')
                        index = refindices[i]
                        break

            #Isolate index in full street table
            if (index > -1):
                index = streetindices[index]
                dp = self.streets.iloc[index]
        
        #Return
        return index,subsetsize,subset,subset_res,dp

### Load chains and create converter object

In [7]:
# Chain records to be matched against the street table; the later cells read
# the address, zipcode, city and chain fields of each row.
data = pd.read_csv('../data/tables/chains_cleaned_no_airports_corrected.csv')
# The Converter constructor loads the cleaned NYC street table.
converter = Converter()

### Find Address in Street Database and return indices of all line segments

In [8]:
# Normalize every chain address and collect the candidate street-table rows.
streets = pd.read_csv('../data/tables/nyc_streets_cleaned.csv')
chainbag = []
for i in range(len(data)):
    datapoint = data.iloc[i]
    mytuple = converter.convert(datapoint)
    number,streetname,address,lat,lng,city,zipcode,streetindices = mytuple
    # Report addresses that matched no street label and have no known coordinates
    unmatched = (not streetindices) and lat == 0
    if unmatched:
        print(i,datapoint.address,' - ',address)
    chainbag.append(mytuple)

### Determine the line segment whose house-number range contains the chain address

In [9]:
# Match each normalized chain address to one street segment.
newdata = []
baddata = []
verbose = False
streets = pd.read_csv('../data/tables/nyc_streets_cleaned.csv')
count = 0  # number of chains that could not be placed
for m, mytuple in enumerate(chainbag):
    dp = data.iloc[m]
    number,streetname,address,lat,lng,city,zipcode,streetindices = mytuple
    index,subsetsize,subset,subset_res,streetdp = converter.findSegment(mytuple,Z,verbose)

    # -2: zipcode not found on the street, -1: no house-number range matched
    if lat == 0 and index in (-2,-1):
        print(dp)
        print(m,'failure 1' if index == -2 else 'failure 2')
        count += 1

    # Keep chains that have a segment or hard-coded coordinates
    if index > -1 or lat > 0:
        newdata.append((dp.chain,number,streetname,address,lat,lng,city,zipcode,index))

### Store

In [10]:
# Name the columns in the constructor: assigning `.columns` afterwards raises a
# length-mismatch ValueError when no chain was matched and the frame is empty.
newdata = pd.DataFrame(
    newdata,
    columns=['chain','number','streetname','address','lat','lng','city','zipcode','stindex'],
)
newdata.to_csv('../data/tables/chains_with_line_indices.csv',index=False)