In [550]:
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql.types import *

import random

import socket

import pandas as pd
import numpy as np
import json

import random

from scipy import spatial

import utilities

from py2neo import Graph

In [551]:
"""
Creating a matrix of distances from each store (from demand.csv) to each coordinate point (from address.csv) 

"""

# Pull in all addresses 

addresses = pd.read_csv("../data/address_book.csv")

# Filtering for just Chicago, dropping unnecessary columns

addresses = addresses[addresses["PLACENAME"]=="Chicago"][["ADDRDELIV","LATITUDE","LONGITUDE"]]
addresses = addresses.reset_index()
addresses.drop(["index"], inplace=True, axis=1)

# Take a random sample of coordinates to reduce the number of variables of the optimization problem

rand_index = random.sample(range(0, 582676), 1000)
address_sample = addresses.iloc[rand_index] # these will become our optimization variables
address_sample.shape

(1000, 3)

In [552]:
# Full in store-level data from the demand model

demand = pd.read_csv("demand.csv")
store_matrix = demand[["sales_volume_location_2016","abi","neighborhood_avg_property_value","neighborhood_property_crimes",
                             "surrounding_neighborhood_avg_property_value","surrounding_neighborhood_property_crimes","latitude_2017","longitude_2017"]]



In [553]:
# Filter randomly sampled coordinates into ndarray

address_lat = np.array(address_sample["LATITUDE"])
address_long = np.array(address_sample["LONGITUDE"])
address_coords = np.transpose(np.vstack((address_lat,address_long)))

# Filter store coordinates into ndarray

store_matrix_lat = np.array(store_matrix["latitude_2017"])
store_matrix_long = np.array(store_matrix["longitude_2017"])
store_coords = np.transpose(np.vstack((store_matrix_lat,store_matrix_long)))


# Run them throw scipy's handy pairwise distance function

distance_matrix = scipy.spatial.distance.cdist(store_coords, address_coords)
distance_matrix.shape

(745, 1000)

In [554]:
""" 

Adding neighborhood labels to each of the randomly selected coordinate points

"""

with open('dicts/neighborhood_polys.json','r') as f:
    neighborhoods = json.load(f)

zipped_coords = list(zip(address_sample["LONGITUDE"],address_sample["LATITUDE"]))

coord_neighborhood = []

for i in range(len(zipped_coords)):
    result = utilities.point_lookup(neighborhoods, zipped_coords[i])
    coord_neighborhood.append(result)
        
address_sample["neighborhood"] = coord_neighborhood
address_sample
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,ADDRDELIV,LATITUDE,LONGITUDE,neighborhood
231813,2432 WEST JACKSON BOULEVARD,41.877739,-87.687585,United Center
357470,5134 SOUTH MERRIMAC AVENUE,41.798698,-87.778621,Garfield Ridge
195370,1614 WEST CARMEN AVENUE,41.974511,-87.670266,Uptown
204854,5557 NORTH SHERIDAN ROAD,41.983490,-87.654605,Edgewater
164799,1830 WEST FOSTER AVENUE,41.976277,-87.675993,Lincoln Square
...,...,...,...,...
189454,1700 NORTH HALSTED STREET,41.912865,-87.648641,Lincoln Park
445554,655 WEST 43RD PLACE,41.815112,-87.643010,New City
19191,6550 NORTH BOSWORTH AVENUE,42.001420,-87.669538,Rogers Park
491844,10326 SOUTH HOMAN AVENUE,41.705308,-87.706505,Mount Greenwood


In [555]:
# Generating neighborhood-wise avg property values and surrounding avg property values

property_neighborhoods = pd.read_csv("../data/properties_neighborhood_aggregated.csv")[['neighborhood','unit_zestimate']]
property_neighborhoods.rename(columns={'unit_zestimate':'avg_neighborhood_prop_val'}, inplace=True)

property_neighborhoods["surrounding_neighborhood_avg_prop_val"]=np.nan

for i in range(len(property_neighborhoods['neighborhood'])):
    
    try:
        result = pd.DataFrame(graph.run('match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = "{}" return b'.format(property_neighborhoods['neighborhood'][i])). \
                            to_table())

        n_next_door = len(result[0])

        neighboring_means = []

        for j in range(n_next_door):
            neighboring_mean = float(dict(result[0][j])['avg_property_value'])
            neighboring_means.append(neighboring_mean)
            
        surrounding_mean = np.nanmean(neighboring_means)
        property_neighborhoods["surrounding_neighborhood_avg_prop_val"].iloc[i] = surrounding_mean
            
    except:
        property_neighborhoods["surrounding_neighborhood_avg_prop_val"] = np.nan

property_neighborhoods        

Unnamed: 0,neighborhood,avg_neighborhood_prop_val,surrounding_neighborhood_avg_prop_val
0,Albany Park,1508.167781,2724.111739
1,Andersonville,2003.563160,2495.194409
2,Archer Heights,839.866143,822.650534
3,Armour Square,264.469474,316.426096
4,Ashburn,153.938723,477.403433
...,...,...,...
90,West Ridge,4243.048035,2462.287584
91,West Town,5338.776832,1391.918080
92,Wicker Park,412.878641,2898.902546
93,Woodlawn,1665.015172,356.648953


In [556]:
# Generating neighborhood-wise number of property crimes and surrounding number of property crimes

crime_neighborhoods = pd.read_csv("../data/crime_neighborhood_aggregated.csv", header=None)
crime_neighborhoods.rename(columns={0:"neighborhood",1:"crime_type",2:"n_property_crimes"}, inplace=True)
crime_neighborhoods = crime_neighborhoods[crime_neighborhoods["crime_type"]=="PROPERTY_CRIME"][["neighborhood","n_property_crimes"]]
crime_neighborhoods = crime_neighborhoods.reset_index()
crime_neighborhoods.drop(['index'], axis=1, inplace=True)

crime_neighborhoods["surrounding_neighborhood_avg_property_crimes"]=np.nan

for i in range(len(crime_neighborhoods['neighborhood'])):
    
    try:
        result = pd.DataFrame(graph.run('match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = "{}" return b'.format(crime_neighborhoods['neighborhood'][i])). \
                            to_table())

        n_next_door = len(result[0])

        neighboring_means = []

        for j in range(n_next_door):
            neighboring_mean = float(dict(result[0][j])['n_property_crimes'])
            neighboring_means.append(neighboring_mean)
            
        surrounding_mean = np.nanmean(neighboring_means)
        crime_neighborhoods["surrounding_neighborhood_avg_property_crimes"].iloc[i] = surrounding_mean
            
    except:
        crime_neighborhoods["surrounding_neighborhood_avg_property_crimes"].iloc[i] = np.nan
     
crime_neighborhoods    
    


Unnamed: 0,neighborhood,n_property_crimes,surrounding_neighborhood_avg_property_crimes
0,Albany Park,977,790.500000
1,Andersonville,198,1119.666667
2,Archer Heights,365,859.000000
3,Armour Square,228,638.000000
4,Ashburn,850,1267.500000
...,...,...,...
93,West Ridge,1356,978.000000
94,West Town,1398,1669.300000
95,Wicker Park,1392,1140.666667
96,Woodlawn,982,1260.000000


In [557]:
# First merge the property and crime matrices

neighborhood_matrix = pd.merge(property_neighborhoods, crime_neighborhoods, on="neighborhood")

# Then merge the sampled coordinates matrix with the neighborhood_matrix such that we are

neighborhood_matrix = address_sample.merge(neighborhood_matrix, on="neighborhood", how="left", left_index=True)

neighborhood_matrix

Unnamed: 0,ADDRDELIV,LATITUDE,LONGITUDE,neighborhood,avg_neighborhood_prop_val,surrounding_neighborhood_avg_prop_val,n_property_crimes,surrounding_neighborhood_avg_property_crimes
82.0,2432 WEST JACKSON BOULEVARD,41.877739,-87.687585,United Center,231.330386,3182.483159,820.0,1970.750000
32.0,5134 SOUTH MERRIMAC AVENUE,41.798698,-87.778621,Garfield Ridge,365.097574,814.302113,740.0,662.200000
83.0,1614 WEST CARMEN AVENUE,41.974511,-87.670266,Uptown,2078.498504,2143.645753,1481.0,936.833333
25.0,5557 NORTH SHERIDAN ROAD,41.983490,-87.654605,Edgewater,1960.401059,2688.941624,1037.0,1091.600000
46.0,1830 WEST FOSTER AVENUE,41.976277,-87.675993,Lincoln Square,3446.683663,2355.732728,841.0,1113.333333
...,...,...,...,...,...,...,...,...
45.0,1700 NORTH HALSTED STREET,41.912865,-87.648641,Lincoln Park,2323.641218,2715.849043,2194.0,1324.875000
59.0,655 WEST 43RD PLACE,41.815112,-87.643010,New City,1129.471660,828.360686,1281.0,1038.333333
72.0,6550 NORTH BOSWORTH AVENUE,42.001420,-87.669538,Rogers Park,1672.914760,3101.724547,1582.0,1196.500000
57.0,10326 SOUTH HOMAN AVENUE,41.705308,-87.706505,Mount Greenwood,389.689347,258.094108,207.0,566.500000


In [558]:
"""

For certain random samples, the sampled coordinate point will not have socioeconomic features. In these cases, 
we'll have to remove the relevant rows from the neighborhood_matrix and corresponding columns from the distance 
matrix.

"""

nan_ix = list(np.argwhere(np.isnan(np.array(neighborhood_matrix.index)))[:,0])
not_null_ix = [z for z in range(distance_matrix.shape[1]) if z not in nan_ix]

if nan_ix:
    
    neighborhood_matrix = neighborhood_matrix.reset_index()
    neighborhood_matrix = neighborhood_matrix[~ neighborhood_matrix['index'].isna()]
    
    distance_matrix = distance_matrix[:,not_null_ix]
    
    
    

In [568]:
# Shapes of distaince matrix and neighborhood matrix after removal

print("distance shape:",distance_matrix.shape)
print("neighborhood shape:",neighborhood_matrix.shape)

distance shape: (745, 993)
neighborhood shape: (993, 5)


In [559]:
"""

Last, we need to create matrices of demand model coefficients for both constraints of the optimization problem

"""

with open('demand_coef.txt', 'r') as coefs:
    demand_coefs = coefs.readlines()

demand_coefs = [float(demand_coefs[1].split(",")[i].replace("]","").replace("[","").replace("\n","")) for i in range(len(demand_coefs[1].split(",")))]    
demand_coefs

[0.9317133788087207,
 -0.05389048407535676,
 -0.21656957369296354,
 0.031990880285334006,
 0.2202127995619555]

In [560]:
# matrix for the "d" constraint of the optimization problem

d_betas = np.transpose(np.repeat(np.array(demand_coefs),len(store_matrix)).reshape(5,len(store_matrix)))
d_betas.shape


(745, 5)

In [561]:
# matrix for the "l" constraint of the optimization problem
l_betas = np.transpose(np.repeat(np.array(demand_coefs[1:]),len(property_neighborhoods)).reshape(4,len(property_neighborhoods))) # we take everything but the 2016 sales covariate

l_betas.shape

(95, 4)

In [562]:
# Writing beta matrices to csv

np.savetxt('d_betas.csv', d_betas, delimiter=",")
np.savetxt('l_betas.csv', l_betas, delimiter=",")

In [563]:
# Writing final neighborhood matrix to csv

neighborhood_matrix.drop(['ADDRDELIV', 'LATITUDE', 'LONGITUDE', 'neighborhood'], axis=1, inplace=True)
neighborhood_matrix.to_csv("neighborhood_matrix_v2.csv")

In [564]:
# Writing final store matrix to csv

store_matrix.drop(['abi','latitude_2017','longitude_2017'], axis=1, inplace=True)
store_matrix.to_csv("store_matrix.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [565]:
# Writing final distance matrix to csv

np.savetxt('distance_matrix.csv', distance_matrix, delimiter=",")