In [46]:
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql.types import *

import random

import socket

import pandas as pd
import numpy as np
import json

import utilities

from py2neo import Graph

In [47]:
driver_ip = socket.gethostbyname(socket.gethostname())

In [48]:
conf = pyspark.SparkConf().setAll([('spark.kubernetes.authenticate.caCertFile', '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'), \
                                   ('spark.kubernetes.authenticate.oauthTokenFile','/var/run/secrets/kubernetes.io/serviceaccount/token'), \
                                   ('spark.kubernetes.authenticate.driver.serviceAccountName','spark-driver-sa'), \
                                   ('spark.kubernetes.namespace','spark'), \
                                   ('spark.driver.pod.name','spark-driver'), \
                                   ('spark.executor.instances','16'), \
                                   ('spark.kubernetes.container.image','gcr.io/sarcasm-3wx3ce6drvftuy/spark-v2.4.4-worker:latest'), \
                                   ('spark.driver.host','spark-driver.spark.svc.cluster.local'), \
                                   ('spark.driver.port','29413'), \
                                   ('spark.driver.bindAddress',driver_ip), \
                                   ('spark.executor.memory','6500m'), \
                                   ('spark.executor.cores','1'), \
                                   ('spark.kubernetes.driverEnv.GCS_PROJECT_ID', 'sarcasm-3wx3ce6drvftuy'), \
                                   ('spark.kubernetes.driverEnv.GOOGLE_APPLICATION_CREDENTIALS', '/mnt/secrets/sarc-bucket-sa.json'), \
                                   ('spark.kubernetes.driver.secrets.sarc-bucket-sa','/mnt/secrets'), \
                                   ('spark.kubernetes.executor.secrets.sarc-bucket-sa','/mnt/secrets'), \
                                   ('spark.executorEnv.GCS_PROJECT_ID','sarcasm-3wx3ce6drvftuy'), \
                                   ('spark.executorEnv.GOOGLE_APPLICATION_CREDENTIALS','/mnt/secrets/sarc-bucket-sa.json'), \
                                   ('spark.hadoop.google.cloud.auth.service.account.enable','true'), \
                                   ('spark.hadoop.google.cloud.auth.service.account.json.keyfile','/mnt/secrets/sarc-bucket-sa.json'), \
                                   ('spark.hadoop.fs.gs.project.id','sarcasm-3wx3ce6drvftuy'), \
                                   ('spark.hadoop.fs.gs.system.bucket','sarc-bucket-2wx3ce6drvftuy')])

In [None]:
spark = SparkSession.builder.master("k8s://https://kubernetes.default.svc.cluster.local:443").appName("sarc").config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [21]:
%time addresses = spark.read.csv("gs://sarc-bucket-3wx3ce6drvftuy/address_book.csv", inferSchema=True, sep = ',')


In [22]:
with open('dicts/zoning_polys.json','r') as f:
    zoning = json.load(f)

In [23]:
"""
Allowable zones:

B1 - Neighborhood Shopping District, **Low-Traffic Street, More Storefront-Like**
B2 - Neighborhood Mixed-Use District, "" ""
B3 - Community Shopping District 
C1 - Neighborhood Commercial District
C2 - Motor Vehicle-Related Commercial District
DC - Downtown Core District
DR - Downtown Residential District
DS - Downtown Service District
DX - Downtown Mixed-Use District
PD - Planned Development

"""

'\nAllowable zones:\n\nB1 - Neighborhood Shopping District, **Low-Traffic Street, More Storefront-Like**\nB2 - Neighborhood Mixed-Use District, "" ""\nB3 - Community Shopping District \nC1 - Neighborhood Commercial District\nC2 - Motor Vehicle-Related Commercial District\nDC - Downtown Core District\nDR - Downtown Residential District\nDS - Downtown Service District\nDX - Downtown Mixed-Use District\nPD - Planned Development\n\n'

In [24]:
allowable_zones = ["B1","B2","B3","C1","C2","DC","DR","DS","DX","PD"]

In [25]:
chi_addresses = addresses.filter(addresses["PLACENAME"]=="Chicago")
chi_addresses = chi_addresses[["ADDRDELIV","LATITUDE","LONGITUDE"]]
chi_addresses

Unnamed: 0,ADDRDELIV,LATITUDE,LONGITUDE
233417,7042 NORTH OZARK AVENUE,42.009040,-87.820313
233418,6908 NORTH OWEN AVENUE,42.005777,-87.819278
233419,6947 NORTH OLCOTT AVENUE,42.007494,-87.813372
233430,7420 NORTH ORIOLE AVENUE,42.015760,-87.816638
233431,7401 NORTH OTTAWA AVENUE,42.015236,-87.817216
...,...,...,...
1153713,2901 EAST 104TH STREET,41.706431,-87.553693
1153714,3457 EAST 100TH STREET,41.714011,-87.538661
1153715,4000 EAST 106TH STREET,41.702820,-87.526800
1153716,9054 SOUTH BRANDON AVENUE,41.730346,-87.547176


In [74]:
potential_zones = []
for i in range((len(list(zoning.keys())))):
    
    if zoning[list(zoning.keys())[i]][1][:2] in allowable_zones:
        potential_zones.append(list(zoning.keys())[i])
        
    
    

In [75]:
vals = [zoning[potential_zones[i]] for i in range(len(potential_zones))]


In [76]:
potential_zones = {i:j for (i,j) in zip(potential_zones, vals)}

In [77]:
address_coords = list(zip(chi_addresses['LATITUDE'],chi_addresses['LONGITUDE']))
possible_coords = []

for i in range(len(address_coords)):

    result = utilities.point_lookup(potential_zones,address_coords[i])
    
    if result:
        possible_coords.append(address_coords[i])

KeyboardInterrupt: 

In [47]:
# connect to graph db

uri = "bolt://localhost:7687"
graph = Graph(uri, auth=("neo4j", "password"))



In [3]:
demand = pd.read_csv("demand.csv")

In [13]:
store_matrix = demand[["sales_volume_location_2016","abi","neighborhood_avg_property_value","neighborhood_property_crimes",
                             "surrounding_neighborhood_avg_property_value","surrounding_neighborhood_property_crimes"]]

In [14]:
store_matrix

Unnamed: 0,sales_volume_location_2016,abi,neighborhood_avg_property_value,neighborhood_property_crimes,surrounding_neighborhood_avg_property_value,surrounding_neighborhood_property_crimes
0,1858.0,150832897,7745.637617,5611.0,354.767758,1033.777778
1,5310.0,151026986,396.288139,2539.0,2899.660156,1565.333333
2,796.0,151036118,3678.797562,805.0,1547.565481,1115.555556
3,5310.0,151053519,90.695323,245.0,461.767532,1240.500000
4,13276.0,151054939,3678.797562,805.0,1547.565481,1115.555556
...,...,...,...,...,...,...
1083,2655.0,402005936,3200.595561,1268.0,2499.178164,762.857143
1084,796.0,151602547,163.266502,745.0,261.121950,952.400000
1085,1858.0,611130287,1683.708461,1219.0,3087.969458,783.428571
1086,2920.0,458531563,3678.797562,805.0,1547.565481,1115.555556


In [43]:
property_neighborhoods = pd.read_csv("../data/properties_neighborhood_aggregated.csv")[['neighborhood','unit_zestimate']]
property_neighborhoods.rename(columns={'unit_zestimate':'avg_neighborhood_prop_val'}, inplace=True)
property_neighborhoods

Unnamed: 0,neighborhood,avg_neighborhood_prop_val
0,Albany Park,1508.167781
1,Andersonville,2003.563160
2,Archer Heights,839.866143
3,Armour Square,264.469474
4,Ashburn,153.938723
...,...,...
90,West Ridge,4243.048035
91,West Town,5338.776832
92,Wicker Park,412.878641
93,Woodlawn,1665.015172


In [58]:
property_neighborhoods["surrounding_neighborhood_avg_prop_val"]=np.nan

for i in range(len(property_neighborhoods['neighborhood'])):
    
    try:
        result = pd.DataFrame(graph.run('match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = "{}" return b'.format(property_neighborhoods['neighborhood'][i])). \
                            to_table())

        n_next_door = len(result[0])

        neighboring_means = []

        for j in range(n_next_door):
            neighboring_mean = float(dict(result[0][j])['avg_property_value'])
            neighboring_means.append(neighboring_mean)
            
        surrounding_mean = np.nanmean(neighboring_means)
        property_neighborhoods["surrounding_neighborhood_avg_prop_val"].iloc[i] = surrounding_mean
            
    except:
        property_neighborhoods["surrounding_neighborhood_avg_prop_val"] = np.nan
     
    
    


In [59]:
property_neighborhoods

Unnamed: 0,neighborhood,avg_neighborhood_prop_val,surrounding_neighborhood_avg_prop_val
0,Albany Park,1508.167781,2724.111739
1,Andersonville,2003.563160,2495.194409
2,Archer Heights,839.866143,822.650534
3,Armour Square,264.469474,316.426096
4,Ashburn,153.938723,477.403433
...,...,...,...
90,West Ridge,4243.048035,2462.287584
91,West Town,5338.776832,1391.918080
92,Wicker Park,412.878641,2898.902546
93,Woodlawn,1665.015172,356.648953


In [76]:
crime_neighborhoods = pd.read_csv("../data/crime_neighborhood_aggregated.csv", header=None)
crime_neighborhoods.rename(columns={0:"neighborhood",1:"crime_type",2:"n_property_crimes"}, inplace=True)
crime_neighborhoods = crime_neighborhoods[crime_neighborhoods["crime_type"]=="PROPERTY_CRIME"][["neighborhood","n_property_crimes"]]
crime_neighborhoods = crime_neighborhoods.reset_index()
crime_neighborhoods.drop(['index'], axis=1, inplace=True)
crime_neighborhoods

Unnamed: 0,neighborhood,n_property_crimes
0,Albany Park,977
1,Andersonville,198
2,Archer Heights,365
3,Armour Square,228
4,Ashburn,850
...,...,...
93,West Ridge,1356
94,West Town,1398
95,Wicker Park,1392
96,Woodlawn,982


In [77]:
crime_neighborhoods["surrounding_neighborhood_avg_property_crimes"]=np.nan

for i in range(len(crime_neighborhoods['neighborhood'])):
    
    try:
        result = pd.DataFrame(graph.run('match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = "{}" return b'.format(crime_neighborhoods['neighborhood'][i])). \
                            to_table())

        n_next_door = len(result[0])

        neighboring_means = []

        for j in range(n_next_door):
            neighboring_mean = float(dict(result[0][j])['n_property_crimes'])
            neighboring_means.append(neighboring_mean)
            
        surrounding_mean = np.nanmean(neighboring_means)
        crime_neighborhoods["surrounding_neighborhood_avg_property_crimes"].iloc[i] = surrounding_mean
            
    except:
        crime_neighborhoods["surrounding_neighborhood_avg_property_crimes"].iloc[i] = np.nan
     
    
    


In [78]:
crime_neighborhoods

Unnamed: 0,neighborhood,n_property_crimes,surrounding_neighborhood_avg_property_crimes
0,Albany Park,977,790.500000
1,Andersonville,198,1119.666667
2,Archer Heights,365,859.000000
3,Armour Square,228,638.000000
4,Ashburn,850,1267.500000
...,...,...,...
93,West Ridge,1356,978.000000
94,West Town,1398,1669.300000
95,Wicker Park,1392,1140.666667
96,Woodlawn,982,1260.000000


In [87]:
neighborhood_matrix = pd.merge(crime_neighborhoods, crime_neighborhoods, on="neighborhood")

In [88]:
neighborhood_matrix

Unnamed: 0,neighborhood,n_property_crimes_x,surrounding_neighborhood_avg_property_crimes_x,n_property_crimes_y,surrounding_neighborhood_avg_property_crimes_y
0,Albany Park,977,790.500000,977,790.500000
1,Andersonville,198,1119.666667,198,1119.666667
2,Archer Heights,365,859.000000,365,859.000000
3,Armour Square,228,638.000000,228,638.000000
4,Ashburn,850,1267.500000,850,1267.500000
...,...,...,...,...,...
93,West Ridge,1356,978.000000,1356,978.000000
94,West Town,1398,1669.300000,1398,1669.300000
95,Wicker Park,1392,1140.666667,1392,1140.666667
96,Woodlawn,982,1260.000000,982,1260.000000
