In [46]:
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql.types import *

import random

import socket

import pandas as pd
import numpy as np
import json

import utilities

In [47]:
# Resolve this machine's own hostname to an IP address; the Spark
# configuration below uses it as the driver bind address.
local_hostname = socket.gethostname()
driver_ip = socket.gethostbyname(local_hostname)

In [48]:
# Spark-on-Kubernetes settings as (key, value) pairs, in the order they are applied.
spark_settings = [
    # Kubernetes API authentication
    ('spark.kubernetes.authenticate.caCertFile', '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'),
    ('spark.kubernetes.authenticate.oauthTokenFile', '/var/run/secrets/kubernetes.io/serviceaccount/token'),
    ('spark.kubernetes.authenticate.driver.serviceAccountName', 'spark-driver-sa'),
    # Namespace, driver pod, executor count and executor image
    ('spark.kubernetes.namespace', 'spark'),
    ('spark.driver.pod.name', 'spark-driver'),
    ('spark.executor.instances', '16'),
    ('spark.kubernetes.container.image', 'gcr.io/sarcasm-2wx3ce6drvftuy/spark-v2.4.4-worker:latest'),
    # Driver networking: advertised host/port, and the local address to bind to
    ('spark.driver.host', 'spark-driver.spark.svc.cluster.local'),
    ('spark.driver.port', '29413'),
    ('spark.driver.bindAddress', driver_ip),
    # Executor sizing
    ('spark.executor.memory', '6500m'),
    ('spark.executor.cores', '1'),
    # Driver environment variables for GCS access
    ('spark.kubernetes.driverEnv.GCS_PROJECT_ID', 'sarcasm-2wx3ce6drvftuy'),
    ('spark.kubernetes.driverEnv.GOOGLE_APPLICATION_CREDENTIALS', '/mnt/secrets/sarc-bucket-sa.json'),
    # Mount the service-account secret on both driver and executors
    ('spark.kubernetes.driver.secrets.sarc-bucket-sa', '/mnt/secrets'),
    ('spark.kubernetes.executor.secrets.sarc-bucket-sa', '/mnt/secrets'),
    # Executor environment variables for GCS access
    ('spark.executorEnv.GCS_PROJECT_ID', 'sarcasm-2wx3ce6drvftuy'),
    ('spark.executorEnv.GOOGLE_APPLICATION_CREDENTIALS', '/mnt/secrets/sarc-bucket-sa.json'),
    # Hadoop GCS connector settings
    ('spark.hadoop.google.cloud.auth.service.account.enable', 'true'),
    ('spark.hadoop.google.cloud.auth.service.account.json.keyfile', '/mnt/secrets/sarc-bucket-sa.json'),
    ('spark.hadoop.fs.gs.project.id', 'sarcasm-2wx3ce6drvftuy'),
    ('spark.hadoop.fs.gs.system.bucket', 'sarc-bucket-2wx3ce6drvftuy'),
]

conf = pyspark.SparkConf().setAll(spark_settings)

In [None]:
# Create (or reuse) a session named "sarc" against the in-cluster
# Kubernetes API server, applying the configuration built above.
spark = (
    SparkSession.builder
    .master("k8s://https://kubernetes.default.svc.cluster.local:443")
    .appName("sarc")
    .config(conf=conf)
    .getOrCreate()
)
sc = spark.sparkContext

In [21]:
%time addresses = spark.read.csv("gs://sarc-bucket-2wx3ce6drvftuy/address_book.csv", inferSchema=True, sep = ',')


In [22]:
# Load the zoning data from the local JSON file into `zoning`.
with open('dicts/zoning_polys.json', 'r') as zoning_file:
    zoning = json.load(zoning_file)

In [23]:
"""
Allowable zones:

B1 - Neighborhood Shopping District, **Low-Traffic Street, More Storefront-Like**
B2 - Neighborhood Mixed-Use District, "" ""
B3 - Community Shopping District 
C1 - Neighborhood Commercial District
C2 - Motor Vehicle-Related Commercial District
DC - Downtown Core District
DR - Downtown Residential District
DS - Downtown Service District
DX - Downtown Mixed-Use District
PD - Planned Development

"""

'\nAllowable zones:\n\nB1 - Neighborhood Shopping District, **Low-Traffic Street, More Storefront-Like**\nB2 - Neighborhood Mixed-Use District, "" ""\nB3 - Community Shopping District \nC1 - Neighborhood Commercial District\nC2 - Motor Vehicle-Related Commercial District\nDC - Downtown Core District\nDR - Downtown Residential District\nDS - Downtown Service District\nDX - Downtown Mixed-Use District\nPD - Planned Development\n\n'

In [24]:
# Two-character zone-class prefixes accepted by the zoning filter below.
allowable_zones = [
    "B1", "B2", "B3",        # B-prefixed classes
    "C1", "C2",              # C-prefixed classes
    "DC", "DR", "DS", "DX",  # D-prefixed classes
    "PD",
]

In [25]:
# Keep only rows whose PLACENAME is "Chicago", projected down to the
# delivery address and its coordinates.
chi_addresses = (
    addresses
    .filter(addresses["PLACENAME"] == "Chicago")
    .select("ADDRDELIV", "LATITUDE", "LONGITUDE")
)
chi_addresses

Unnamed: 0,ADDRDELIV,LATITUDE,LONGITUDE
233417,7042 NORTH OZARK AVENUE,42.009040,-87.820313
233418,6908 NORTH OWEN AVENUE,42.005777,-87.819278
233419,6947 NORTH OLCOTT AVENUE,42.007494,-87.813372
233430,7420 NORTH ORIOLE AVENUE,42.015760,-87.816638
233431,7401 NORTH OTTAWA AVENUE,42.015236,-87.817216
...,...,...,...
1153713,2901 EAST 104TH STREET,41.706431,-87.553693
1153714,3457 EAST 100TH STREET,41.714011,-87.538661
1153715,4000 EAST 106TH STREET,41.702820,-87.526800
1153716,9054 SOUTH BRANDON AVENUE,41.730346,-87.547176


In [74]:
# Collect the key of every zoning entry whose element at index 1 starts with
# one of the allowable two-character prefixes.
# Iterating items() once avoids rebuilding list(zoning.keys()) on every loop
# step, which made the original pass quadratic in the number of entries.
# NOTE(review): presumably entry[1] is the zone-class string (e.g. "B1-2");
# confirm against the layout of zoning_polys.json.
potential_zones = [
    zone_key
    for zone_key, zone_entry in zoning.items()
    if zone_entry[1][:2] in allowable_zones
]
        
    
    

In [75]:
# Look up the zoning entry for each selected key, in the same order.
# Iterating the keys directly (rather than by integer position) keeps this
# cell re-runnable: a later cell rebinds potential_zones to a dict keyed by
# the same names, and positional indexing into that dict would raise KeyError.
vals = [zoning[zone_key] for zone_key in potential_zones]


In [76]:
# Pair each selected zone key with its zoning entry; potential_zones is
# rebound from the list of keys to a {key: entry} mapping.
potential_zones = dict(zip(potential_zones, vals))

In [77]:
# chi_addresses is a Spark DataFrame, so chi_addresses['LATITUDE'] is a Column
# expression and cannot be iterated or zipped. Pull the two coordinate columns
# back to the driver and build plain (latitude, longitude) tuples instead.
address_coords = [
    (row["LATITUDE"], row["LONGITUDE"])
    for row in chi_addresses.select("LATITUDE", "LONGITUDE").collect()
]

# Keep every coordinate for which the zone lookup returns a truthy result.
# NOTE(review): this calls utilities.point_lookup once per address on the
# driver; the previous run of this cell was interrupted, so consider sampling
# or distributing the lookup if it remains too slow.
possible_coords = [
    coord
    for coord in address_coords
    if utilities.point_lookup(potential_zones, coord)
]

KeyboardInterrupt: 