In [609]:
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql.types import *

import random

import socket

import pandas as pd
import numpy as np
import json

import random

from scipy import spatial

import utilities

from py2neo import Graph

In [610]:
"""
Creating a matrix of distances from each store (from demand.csv) to each coordinate point (from address_book.csv)

"""

# Pull in all addresses

addresses = pd.read_csv("../data/address_book.csv")

# Filtering for just Chicago, dropping unnecessary columns, and renumbering rows 0..n-1

addresses = (
    addresses.loc[addresses["PLACENAME"] == "Chicago", ["ADDRDELIV", "LATITUDE", "LONGITUDE"]]
    .reset_index(drop=True)
)

# Take a random sample of coordinates to reduce the number of variables of the optimization problem.
# The population size is taken from the filtered frame itself rather than a hard-coded row count,
# so the cell keeps working (and keeps sampling from every row) when the address book changes.
# NOTE(review): the sample is unseeded, so every run draws different points.

SAMPLE_SIZE = 1000

rand_index = random.sample(range(len(addresses)), SAMPLE_SIZE)

# .copy() makes the sample an independent frame, so columns can be added to it later
# without pandas warning about writing to a slice of `addresses`.
address_sample = addresses.iloc[rand_index].copy() # these will become our optimization variables
address_sample.shape

(1000, 3)

In [611]:
# Pull in store-level data from the demand model

demand = pd.read_csv("demand.csv")

# Keep only the model covariates plus the store id and coordinates.
# .copy() makes store_matrix an independent frame, so later column changes on it
# neither touch `demand` nor trigger pandas' "copy of a slice" warning.
store_matrix = demand[["sales_volume_location_2016","abi","neighborhood_avg_property_value","neighborhood_property_crimes",
                             "surrounding_neighborhood_avg_property_value","surrounding_neighborhood_property_crimes","latitude_2017","longitude_2017"]].copy()



In [612]:
# Filter randomly sampled coordinates into an (n_addresses, 2) ndarray of [latitude, longitude]

address_lat = np.array(address_sample["LATITUDE"])
address_long = np.array(address_sample["LONGITUDE"])
address_coords = np.column_stack((address_lat, address_long))

# Filter store coordinates into an (n_stores, 2) ndarray of [latitude, longitude]

store_matrix_lat = np.array(store_matrix["latitude_2017"])
store_matrix_long = np.array(store_matrix["longitude_2017"])
store_coords = np.column_stack((store_matrix_lat, store_matrix_long))


# Run them through scipy's handy pairwise distance function.
# The notebook imports `spatial` via `from scipy import spatial`; the bare name `scipy` is never
# bound, so the call has to go through `spatial` to work on a fresh kernel.
# cdist is called with its default metric, so the values are plain Euclidean distances between
# (lat, long) pairs, i.e. in coordinate degrees. Rows are stores, columns are sampled addresses.

distance_matrix = spatial.distance.cdist(store_coords, address_coords)
distance_matrix.shape

(745, 1000)

In [613]:
""" 

Adding neighborhood labels to each of the randomly selected coordinate points

"""

# Neighborhood polygons used to label each sampled point
with open('dicts/neighborhood_polys.json','r') as f:
    neighborhoods = json.load(f)

# point_lookup is handed (longitude, latitude) pairs, in that order
zipped_coords = list(zip(address_sample["LONGITUDE"],address_sample["LATITUDE"]))

# One lookup result per sampled coordinate, in the same order as address_sample's rows
coord_neighborhood = [utilities.point_lookup(neighborhoods, coord) for coord in zipped_coords]

# .assign returns a new frame instead of writing into address_sample (which was built by slicing
# `addresses`), so pandas does not emit a SettingWithCopyWarning here.
address_sample = address_sample.assign(neighborhood=coord_neighborhood)
address_sample
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,ADDRDELIV,LATITUDE,LONGITUDE,neighborhood
151418,7122 WEST BERWYN AVENUE,41.977301,-87.805723,Norwood Park
378457,6216 SOUTH KEDZIE AVENUE,41.780203,-87.703518,Chicago Lawn
416585,4800 SOUTH FORRESTVILLE AVENUE,41.807493,-87.612935,Grand Boulevard
558723,916 EAST 132ND STREET,41.656055,-87.600133,Riverdale
242290,723 NORTH CENTRAL AVENUE,41.893650,-87.764889,Austin
...,...,...,...,...
397339,4954 SOUTH KARLOV AVENUE,41.802567,-87.726144,Archer Heights
278909,1826 WEST CULLERTON STREET,41.855156,-87.672166,Lower West Side
157617,2500 NORTH MASON AVENUE,41.925789,-87.774843,Belmont Cragin
56354,2222 NORTH MANGO AVENUE,41.921015,-87.769780,Belmont Cragin


In [614]:
# Generating neighborhood-wise avg property values and surrounding avg property values
# (the mean of 'avg_property_value' over the nodes reached by NEXT_TO edges in the graph).
#
# NOTE(review): `graph` is not created in any visible cell; it is assumed to be a connected
# py2neo Graph defined elsewhere -- confirm before a Restart & Run All.

property_neighborhoods = (
    pd.read_csv("../data/properties_neighborhood_aggregated.csv")[['neighborhood','unit_zestimate']]
    .rename(columns={'unit_zestimate':'avg_neighborhood_prop_val'})
)

# The neighborhood name is sent as a query parameter rather than formatted into the Cypher text,
# so a name containing a quote character cannot break or alter the query.
property_neighbor_query = 'match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = $name return b'

surrounding_prop_vals = []

for neighborhood_name in property_neighborhoods['neighborhood']:

    try:
        neighbor_rows = graph.run(property_neighbor_query, {"name": neighborhood_name}).to_table()
        # Each row holds a single value (the neighbor node `b`); read its property value
        neighbor_values = [float(dict(row[0])['avg_property_value']) for row in neighbor_rows]
    except (KeyError, TypeError, ValueError):
        # A neighbor without a usable 'avg_property_value' leaves only THIS neighborhood as NaN.
        # (Previously a single failure reset the entire column to NaN.)
        neighbor_values = []

    # No neighbors (or unusable data) -> NaN; otherwise the NaN-aware mean of the neighbors
    surrounding_prop_vals.append(np.nanmean(neighbor_values) if neighbor_values else np.nan)

# Assign the whole column at once instead of chained `df[col].iloc[i] = ...` writes,
# which are not guaranteed to reach the original frame.
property_neighborhoods["surrounding_neighborhood_avg_prop_val"] = surrounding_prop_vals

property_neighborhoods

Unnamed: 0,neighborhood,avg_neighborhood_prop_val,surrounding_neighborhood_avg_prop_val
0,Albany Park,1508.167781,2724.111739
1,Andersonville,2003.563160,2495.194409
2,Archer Heights,839.866143,822.650534
3,Armour Square,264.469474,316.426096
4,Ashburn,153.938723,477.403433
...,...,...,...
90,West Ridge,4243.048035,2462.287584
91,West Town,5338.776832,1391.918080
92,Wicker Park,412.878641,2898.902546
93,Woodlawn,1665.015172,356.648953


In [615]:
# Generating neighborhood-wise number of property crimes and surrounding number of property crimes
# (the mean of 'n_property_crimes' over the nodes reached by NEXT_TO edges in the graph).
#
# NOTE(review): `graph` is not created in any visible cell; it is assumed to be a connected
# py2neo Graph defined elsewhere -- confirm before a Restart & Run All.

# The file has no header row; columns are named by position
crime_neighborhoods = (
    pd.read_csv("../data/crime_neighborhood_aggregated.csv", header=None)
    .rename(columns={0:"neighborhood",1:"crime_type",2:"n_property_crimes"})
)

# Keep only the property-crime rows and renumber them 0..n-1
crime_neighborhoods = (
    crime_neighborhoods.loc[crime_neighborhoods["crime_type"]=="PROPERTY_CRIME", ["neighborhood","n_property_crimes"]]
    .reset_index(drop=True)
)

# The neighborhood name is sent as a query parameter rather than formatted into the Cypher text,
# so a name containing a quote character cannot break or alter the query.
crime_neighbor_query = 'match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = $name return b'

surrounding_crime_vals = []

for neighborhood_name in crime_neighborhoods['neighborhood']:

    try:
        neighbor_rows = graph.run(crime_neighbor_query, {"name": neighborhood_name}).to_table()
        # Each row holds a single value (the neighbor node `b`); read its crime count
        neighbor_values = [float(dict(row[0])['n_property_crimes']) for row in neighbor_rows]
    except (KeyError, TypeError, ValueError):
        # A neighbor without a usable 'n_property_crimes' leaves this neighborhood as NaN.
        # Only data errors are caught; anything else (e.g. a connection problem) surfaces
        # instead of silently producing NaN.
        neighbor_values = []

    # No neighbors (or unusable data) -> NaN; otherwise the NaN-aware mean of the neighbors
    surrounding_crime_vals.append(np.nanmean(neighbor_values) if neighbor_values else np.nan)

# Assign the whole column at once instead of chained `df[col].iloc[i] = ...` writes,
# which are not guaranteed to reach the original frame.
crime_neighborhoods["surrounding_neighborhood_avg_property_crimes"] = surrounding_crime_vals

crime_neighborhoods
    


Unnamed: 0,neighborhood,n_property_crimes,surrounding_neighborhood_avg_property_crimes
0,Albany Park,977,790.500000
1,Andersonville,198,1119.666667
2,Archer Heights,365,859.000000
3,Armour Square,228,638.000000
4,Ashburn,850,1267.500000
...,...,...,...
93,West Ridge,1356,978.000000
94,West Town,1398,1669.300000
95,Wicker Park,1392,1140.666667
96,Woodlawn,982,1260.000000


In [616]:
# First merge the property and crime matrices (default inner join: neighborhoods present in both)

neighborhood_features = pd.merge(property_neighborhoods, crime_neighborhoods, on="neighborhood")

# Then left-join the features onto the sampled coordinates, so that every sampled point keeps
# its row and points whose neighborhood has no features get NaN feature values.
#
# The result's index carries the position of the matched feature row (NaN when nothing matched);
# the NaN-removal step further down identifies unmatched points from that index.
# This used to be obtained by passing on="neighborhood" together with left_index=True, a
# combination that current pandas releases reject with a MergeError, so the row position is
# now carried through the join as an ordinary column and moved into the index afterwards.

neighborhood_matrix = address_sample.merge(
    neighborhood_features.assign(feature_row=range(len(neighborhood_features))),
    on="neighborhood",
    how="left",
)

# Unnamed index on purpose: a later reset_index() must produce a column called 'index'
neighborhood_matrix.index = pd.Index(neighborhood_matrix.pop("feature_row").to_numpy())

neighborhood_matrix

Unnamed: 0,ADDRDELIV,LATITUDE,LONGITUDE,neighborhood,avg_neighborhood_prop_val,surrounding_neighborhood_avg_prop_val,n_property_crimes,surrounding_neighborhood_avg_property_crimes
63.0,7122 WEST BERWYN AVENUE,41.977301,-87.805723,Norwood Park,3857.573461,3239.727885,488.0,364.250000
18.0,6216 SOUTH KEDZIE AVENUE,41.780203,-87.703518,Chicago Lawn,905.896552,314.315616,1886.0,1587.400000
34.0,4800 SOUTH FORRESTVILLE AVENUE,41.807493,-87.612935,Grand Boulevard,462.859658,194.916032,1031.0,568.571429
71.0,916 EAST 132ND STREET,41.656055,-87.600133,Riverdale,53.221667,74.883084,342.0,865.600000
6.0,723 NORTH CENTRAL AVENUE,41.893650,-87.764889,Austin,515.472553,2333.287671,4146.0,1586.833333
...,...,...,...,...,...,...,...,...
2.0,4954 SOUTH KARLOV AVENUE,41.802567,-87.726144,Archer Heights,839.866143,822.650534,365.0,859.000000
51.0,1826 WEST CULLERTON STREET,41.855156,-87.672166,Lower West Side,3678.797562,1547.565481,805.0,1115.555556
9.0,2500 NORTH MASON AVENUE,41.925789,-87.774843,Belmont Cragin,1413.947595,2173.302625,1532.0,1317.000000
9.0,2222 NORTH MANGO AVENUE,41.921015,-87.769780,Belmont Cragin,1413.947595,2173.302625,1532.0,1317.000000


In [617]:
"""

For certain random samples, the sampled coordinate point will not have socioeconomic features. In these cases, 
we'll have to remove the relevant rows from the neighborhood_matrix and corresponding columns from the distance 
matrix.

"""

# A NaN in neighborhood_matrix's index marks a sampled point that found no neighborhood features.
# Build one boolean mask and use it for both the rows of neighborhood_matrix and the columns of
# distance_matrix, so the two can never be filtered differently.
has_features = ~np.isnan(np.asarray(neighborhood_matrix.index, dtype=float))

# Positional indices, kept under their original names
nan_ix = list(np.flatnonzero(~has_features))
not_null_ix = list(np.flatnonzero(has_features))

# Row i of neighborhood_matrix must describe the same sampled point as column i of distance_matrix
assert len(has_features) == distance_matrix.shape[1], \
    "neighborhood_matrix rows and distance_matrix columns are out of sync"

# Always move the index into an 'index' column (once), not only when something has to be removed:
# the export cell drops 'index' unconditionally and would fail if the column were missing.
if "index" not in neighborhood_matrix.columns:
    neighborhood_matrix = neighborhood_matrix.reset_index()

neighborhood_matrix = neighborhood_matrix[has_features]
distance_matrix = distance_matrix[:, has_features]
    
    
    

In [618]:
# Shapes of distance matrix and neighborhood matrix after removal

for shape_label, shaped_obj in (
    ("distance shape:", distance_matrix),
    ("neighborhood shape:", neighborhood_matrix),
):
    print(shape_label, shaped_obj.shape)

distance shape: (745, 993)
neighborhood shape: (993, 9)


In [626]:
"""

Last, we need to create matrices of demand model coefficients for both constraints of the optimization problem

"""

# Read the coefficient file into a list of lines
with open('demand_coef.txt', 'r') as coef_file:
    coef_lines = coef_file.readlines()

# The coefficients are parsed from the SECOND line of the file: split on commas, strip any
# square brackets and newline characters from each piece, then convert to float.
# (presumably the line is formatted like "[a, b, c, ...]" -- confirm against demand_coef.txt)
demand_coefs = []
for coef_token in coef_lines[1].split(","):
    for unwanted in ("]", "[", "\n"):
        coef_token = coef_token.replace(unwanted, "")
    demand_coefs.append(float(coef_token))

demand_coefs

[0.9317133788087207,
 -0.05389048407535676,
 -0.21656957369296354,
 0.031990880285334006,
 0.2202127995619555]

In [628]:
# Coefficients for the "d" constraint of the optimization problem.
# d_betas is bound to the same list object as demand_coefs (no copy is made), i.e. the full flat
# list of demand-model coefficients.

# Earlier variant, kept for reference: tiled the coefficients into a (len(store_matrix), 5) array.
#d_betas = np.transpose(np.repeat(np.array(demand_coefs),len(store_matrix)).reshape(5,len(store_matrix)))
d_betas = demand_coefs


In [629]:
# Coefficients for the "l" constraint of the optimization problem.

# Earlier variant, kept for reference: tiled the coefficients into a (len(property_neighborhoods), 4) array.
#l_betas = np.transpose(np.repeat(np.array(demand_coefs[1:]),len(property_neighborhoods)).reshape(4,len(property_neighborhoods))) # we take everything but the 2016 sales covariate
# Every coefficient except the first one; per the note above, the first is the 2016 sales covariate.
l_betas = demand_coefs[1:]


[-0.05389048407535676,
 -0.21656957369296354,
 0.031990880285334006,
 0.2202127995619555]

In [622]:
# Writing beta matrices to csv: one file per coefficient vector

for betas_path, betas in (
    ('d_betas.csv', d_betas),
    ('l_betas.csv', l_betas),
):
    np.savetxt(betas_path, betas, delimiter=",")

In [623]:
# Writing final neighborhood matrix to csv

# Only the numeric neighborhood features are exported; identifier and coordinate columns go.
# - drop() returns a new frame that is re-bound to the same name, instead of mutating a frame
#   that may itself be a filtered slice (which triggers pandas' copy-of-a-slice warning).
# - errors="ignore": the 'index' column only exists if the NaN-removal step reset the index,
#   and this also keeps the cell re-runnable after the columns are already gone.
neighborhood_matrix = neighborhood_matrix.drop(
    columns=['index','ADDRDELIV', 'LATITUDE', 'LONGITUDE', 'neighborhood'],
    errors="ignore",
)
neighborhood_matrix.to_csv("neighborhood_matrix.csv", index = False)

In [624]:
# Writing final store matrix to csv

# The store id and coordinates are not exported.
# drop() returns a new frame that is re-bound to the same name rather than mutating in place:
# store_matrix was built by selecting columns from `demand`, and in-place edits on such a frame
# trigger pandas' copy-of-a-slice warning. errors="ignore" keeps the cell re-runnable once the
# columns are already gone.
store_matrix = store_matrix.drop(
    columns=['abi','latitude_2017','longitude_2017'],
    errors="ignore",
)
store_matrix.to_csv("store_matrix.csv", index = False)

In [625]:
# Writing final distance matrix to csv (comma-delimited, one row of distance_matrix per line)

np.savetxt(fname='distance_matrix.csv', X=distance_matrix, delimiter=",")