In [215]:
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql.types import *

import random

import socket

import pandas as pd
import numpy as np
import json

import random

from scipy import spatial

import utilities

from py2neo import Graph

In [74]:
# connect to graph db

uri = "bolt://localhost:7687"
graph = Graph(uri, auth=("neo4j", "password"))



In [75]:
demand = pd.read_csv("demand.csv")

In [98]:
store_matrix = demand[["sales_volume_location_2016","abi","neighborhood_avg_property_value","neighborhood_property_crimes",
                             "surrounding_neighborhood_avg_property_value","surrounding_neighborhood_property_crimes","latitude_2017","longitude_2017"]]

In [284]:
store_matrix.to_csv("store_matrix.csv")

In [100]:
property_neighborhoods = pd.read_csv("../data/properties_neighborhood_aggregated.csv")[['neighborhood','unit_zestimate']]
property_neighborhoods.rename(columns={'unit_zestimate':'avg_neighborhood_prop_val'}, inplace=True)
property_neighborhoods

Unnamed: 0,neighborhood,avg_neighborhood_prop_val
0,Albany Park,1508.167781
1,Andersonville,2003.563160
2,Archer Heights,839.866143
3,Armour Square,264.469474
4,Ashburn,153.938723
...,...,...
90,West Ridge,4243.048035
91,West Town,5338.776832
92,Wicker Park,412.878641
93,Woodlawn,1665.015172


In [101]:
property_neighborhoods["surrounding_neighborhood_avg_prop_val"]=np.nan

for i in range(len(property_neighborhoods['neighborhood'])):
    
    try:
        result = pd.DataFrame(graph.run('match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = "{}" return b'.format(property_neighborhoods['neighborhood'][i])). \
                            to_table())

        n_next_door = len(result[0])

        neighboring_means = []

        for j in range(n_next_door):
            neighboring_mean = float(dict(result[0][j])['avg_property_value'])
            neighboring_means.append(neighboring_mean)
            
        surrounding_mean = np.nanmean(neighboring_means)
        property_neighborhoods["surrounding_neighborhood_avg_prop_val"].iloc[i] = surrounding_mean
            
    except:
        property_neighborhoods["surrounding_neighborhood_avg_prop_val"] = np.nan
     
    
    


In [102]:
property_neighborhoods

Unnamed: 0,neighborhood,avg_neighborhood_prop_val,surrounding_neighborhood_avg_prop_val
0,Albany Park,1508.167781,2724.111739
1,Andersonville,2003.563160,2495.194409
2,Archer Heights,839.866143,822.650534
3,Armour Square,264.469474,316.426096
4,Ashburn,153.938723,477.403433
...,...,...,...
90,West Ridge,4243.048035,2462.287584
91,West Town,5338.776832,1391.918080
92,Wicker Park,412.878641,2898.902546
93,Woodlawn,1665.015172,356.648953


In [103]:
property_neighborhoods

Unnamed: 0,neighborhood,avg_neighborhood_prop_val,surrounding_neighborhood_avg_prop_val
0,Albany Park,1508.167781,2724.111739
1,Andersonville,2003.563160,2495.194409
2,Archer Heights,839.866143,822.650534
3,Armour Square,264.469474,316.426096
4,Ashburn,153.938723,477.403433
...,...,...,...
90,West Ridge,4243.048035,2462.287584
91,West Town,5338.776832,1391.918080
92,Wicker Park,412.878641,2898.902546
93,Woodlawn,1665.015172,356.648953


In [104]:
crime_neighborhoods = pd.read_csv("../data/crime_neighborhood_aggregated.csv", header=None)
crime_neighborhoods.rename(columns={0:"neighborhood",1:"crime_type",2:"n_property_crimes"}, inplace=True)
crime_neighborhoods = crime_neighborhoods[crime_neighborhoods["crime_type"]=="PROPERTY_CRIME"][["neighborhood","n_property_crimes"]]
crime_neighborhoods = crime_neighborhoods.reset_index()
crime_neighborhoods.drop(['index'], axis=1, inplace=True)
crime_neighborhoods

Unnamed: 0,neighborhood,n_property_crimes
0,Albany Park,977
1,Andersonville,198
2,Archer Heights,365
3,Armour Square,228
4,Ashburn,850
...,...,...
93,West Ridge,1356
94,West Town,1398
95,Wicker Park,1392
96,Woodlawn,982


In [105]:
crime_neighborhoods["surrounding_neighborhood_avg_property_crimes"]=np.nan

for i in range(len(crime_neighborhoods['neighborhood'])):
    
    try:
        result = pd.DataFrame(graph.run('match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = "{}" return b'.format(crime_neighborhoods['neighborhood'][i])). \
                            to_table())

        n_next_door = len(result[0])

        neighboring_means = []

        for j in range(n_next_door):
            neighboring_mean = float(dict(result[0][j])['n_property_crimes'])
            neighboring_means.append(neighboring_mean)
            
        surrounding_mean = np.nanmean(neighboring_means)
        crime_neighborhoods["surrounding_neighborhood_avg_property_crimes"].iloc[i] = surrounding_mean
            
    except:
        crime_neighborhoods["surrounding_neighborhood_avg_property_crimes"].iloc[i] = np.nan
     
    
    


In [106]:
crime_neighborhoods

Unnamed: 0,neighborhood,n_property_crimes,surrounding_neighborhood_avg_property_crimes
0,Albany Park,977,790.500000
1,Andersonville,198,1119.666667
2,Archer Heights,365,859.000000
3,Armour Square,228,638.000000
4,Ashburn,850,1267.500000
...,...,...,...
93,West Ridge,1356,978.000000
94,West Town,1398,1669.300000
95,Wicker Park,1392,1140.666667
96,Woodlawn,982,1260.000000


In [107]:
neighborhood_matrix = pd.merge(crime_neighborhoods, crime_neighborhoods, on="neighborhood")

In [108]:
neighborhood_matrix

Unnamed: 0,neighborhood,n_property_crimes_x,surrounding_neighborhood_avg_property_crimes_x,n_property_crimes_y,surrounding_neighborhood_avg_property_crimes_y
0,Albany Park,977,790.500000,977,790.500000
1,Andersonville,198,1119.666667,198,1119.666667
2,Archer Heights,365,859.000000,365,859.000000
3,Armour Square,228,638.000000,228,638.000000
4,Ashburn,850,1267.500000,850,1267.500000
...,...,...,...,...,...
93,West Ridge,1356,978.000000,1356,978.000000
94,West Town,1398,1669.300000,1398,1669.300000
95,Wicker Park,1392,1140.666667,1392,1140.666667
96,Woodlawn,982,1260.000000,982,1260.000000


In [285]:
neighborhood_matrix.to_csv("neighborhood_matrix.csv")

In [109]:
addresses = pd.read_csv("../data/address_book.csv")

In [110]:
addresses = addresses[addresses["PLACENAME"]=="Chicago"][["ADDRDELIV","LATITUDE","LONGITUDE"]]

In [111]:
addresses = addresses.reset_index()
addresses.drop(["index"], inplace=True, axis=1)
addresses

Unnamed: 0,ADDRDELIV,LATITUDE,LONGITUDE
0,7042 NORTH OZARK AVENUE,42.009040,-87.820313
1,6908 NORTH OWEN AVENUE,42.005777,-87.819278
2,6947 NORTH OLCOTT AVENUE,42.007494,-87.813372
3,7420 NORTH ORIOLE AVENUE,42.015760,-87.816638
4,7401 NORTH OTTAWA AVENUE,42.015236,-87.817216
...,...,...,...
582671,2901 EAST 104TH STREET,41.706431,-87.553693
582672,3457 EAST 100TH STREET,41.714011,-87.538661
582673,4000 EAST 106TH STREET,41.702820,-87.526800
582674,9054 SOUTH BRANDON AVENUE,41.730346,-87.547176


In [112]:
addresses.shape

(582676, 3)

In [113]:
rand_index = random.sample(range(0, 582676), 1000)

In [114]:
address_sample = addresses.iloc[rand_index]

In [115]:
address_sample

Unnamed: 0,ADDRDELIV,LATITUDE,LONGITUDE
319358,620 NORTH CLARK STREET,41.893005,-87.631551
394901,8626 SOUTH KOSTNER AVENUE,41.735686,-87.731454
540057,9150 SOUTH GREEN STREET,41.727445,-87.645162
566805,11148 SOUTH INDIANA AVENUE,41.691161,-87.618547
277906,2311 WEST WASHINGTON BOULEVARD,41.882793,-87.684535
...,...,...,...
516206,834 WEST 115TH STREET,41.686473,-87.643902
313129,1411 WEST NORTH AVENUE,41.910459,-87.663369
116269,5000 WEST STRONG STREET,41.970893,-87.752858
285764,535 WEST NORTH AVENUE,41.910788,-87.642584


In [146]:
# number of coordinates to be used

address_sample.shape

(1000, 3)

In [199]:
# address coordinates

address_lat = np.array(address_sample["LATITUDE"])
address_long = np.array(address_sample["LONGITUDE"])
address_coords = np.transpose(np.vstack((address_lat,address_long)))


In [117]:
# number of grocery stores to be used

store_matrix.shape

(1088, 8)

In [196]:
# store coordinates

store_matrix_lat = np.array(store_matrix["latitude_2017"])
store_matrix_long = np.array(store_matrix["longitude_2017"])
store_coords = np.transpose(np.vstack((store_matrix_lat,store_matrix_long)))



In [156]:
store_matrix_coords.shape

(1088,)

In [286]:
distance_matrix = scipy.spatial.distance.cdist(store_coords, address_coords)

In [288]:

np.savetxt('distance_matrix.csv', distance_matrix, delimiter=",")

In [241]:
# save as txt file

with open('demand_coef.txt', 'r') as coefs:
    demand_coefs = coefs.readlines()


In [244]:
demand_coefs = [float(demand_coefs[1].split(",")[i].replace("]","").replace("[","").replace("\n","")) for i in range(len(demand_coefs[1].split(",")))]

In [245]:
demand_coefs

[0.9317133788087207,
 -0.05389048407535676,
 -0.21656957369296354,
 0.031990880285334006,
 0.2202127995619555]

In [279]:
d_betas = np.transpose(np.repeat(np.array(demand_coefs),len(store_matrix)).reshape(5,len(store_matrix)))

d_betas.shape


(1088, 5)

In [280]:
l_betas = np.transpose(np.repeat(np.array(demand_coefs[1:]),len(property_neighborhoods)).reshape(4,len(property_neighborhoods))) # we take everything but the 2016 sales covariate


l_betas.shape

(95, 4)

In [289]:
np.savetxt('d_betas.csv', d_betas, delimiter=",")
np.savetxt('l_betas.csv', l_betas, delimiter=",")