In [323]:
import random

import pandas as pd
import numpy as np
import json

import random

from scipy import spatial

import utilities

from py2neo import Graph

random.seed(10)

In [324]:
"""
Creating a matrix of distances from each store (from demand.csv) to each coordinate point (from address.csv) 

"""
sample_size = 150000

# Pull in all addresses 

addresses = pd.read_csv("../data/address_book.csv")

# Filtering for just Chicago, dropping unnecessary columns

addresses = addresses[addresses["PLACENAME"]=="Chicago"][["ADDRDELIV","LATITUDE","LONGITUDE"]]
addresses = addresses.reset_index()
addresses.drop(["index"], inplace=True, axis=1)

# Take a random sample of coordinates to reduce the number of variables of the optimization problem

rand_index = random.sample(range(0, 582676), sample_size)
address_sample = addresses.iloc[rand_index] # these will become our optimization variables
address_sample.shape

(150000, 3)

In [325]:
# Full in store-level data from the demand model

demand = pd.read_csv("demand.csv")
store_matrix = demand[["sales_volume_location_2016","abi","neighborhood_avg_property_value","neighborhood_property_crimes",
                             "surrounding_neighborhood_avg_property_value","surrounding_neighborhood_property_crimes","latitude_2017","longitude_2017"]]



In [326]:
# Filter randomly sampled coordinates into ndarray

address_lat = np.array(address_sample["LATITUDE"])
address_long = np.array(address_sample["LONGITUDE"])
address_coords = np.transpose(np.vstack((address_lat,address_long)))

# Filter store coordinates into ndarray

store_matrix_lat = np.array(store_matrix["latitude_2017"])
store_matrix_long = np.array(store_matrix["longitude_2017"])
store_coords = np.transpose(np.vstack((store_matrix_lat,store_matrix_long)))


# Run them throw scipy's handy pairwise distance function

distance_matrix = spatial.distance.cdist(store_coords, address_coords)
distance_matrix.shape

(745, 150000)

In [327]:
""" 

Adding neighborhood labels to each of the randomly selected coordinate points

"""

with open('dicts/neighborhood_polys.json','r') as f:
    neighborhoods = json.load(f)

zipped_coords = list(zip(address_sample["LONGITUDE"],address_sample["LATITUDE"]))

coord_neighborhood = []

for i in range(len(zipped_coords)):
    result = utilities.point_lookup(neighborhoods, zipped_coords[i])
    coord_neighborhood.append(result)
        
address_sample["neighborhood"] = coord_neighborhood
address_sample
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,ADDRDELIV,LATITUDE,LONGITUDE,neighborhood
34167,3423 NORTH OCTAVIA AVENUE,41.942233,-87.809053,Dunning
449722,5523 SOUTH UNIVERSITY AVENUE,41.794321,-87.597818,Hyde Park
506002,11800 SOUTH SANGAMON STREET,41.679434,-87.645899,West Pullman
15552,6536 NORTH GLENWOOD AVENUE,42.001044,-87.665930,Rogers Park
216109,1839 SOUTH AVERS AVENUE,41.855934,-87.720879,North Lawndale
...,...,...,...,...
475659,129 EAST 69TH STREET,41.769098,-87.620502,Grand Crossing
364394,5329 SOUTH NEWLAND AVENUE,41.794991,-87.795040,Garfield Ridge
70350,3800 NORTH TROY STREET,41.950410,-87.707141,Irving Park
243032,94 SOUTH MENARD AVENUE,41.879044,-87.770125,Austin


In [328]:
# Connect to graph db

uri = "bolt://localhost:7687"
graph = Graph(uri, auth=("neo4j", "password"))


In [329]:
# Generating neighborhood-wise avg property values and surrounding avg property values

property_neighborhoods = pd.read_csv("../data/properties_neighborhood_aggregated.csv")[['neighborhood','unit_zestimate']]
property_neighborhoods.rename(columns={'unit_zestimate':'avg_neighborhood_prop_val'}, inplace=True)

property_neighborhoods["surrounding_neighborhood_avg_prop_val"]=np.nan

for i in range(len(property_neighborhoods['neighborhood'])):
    
    try:
        result = pd.DataFrame(graph.run('match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = "{}" return b'.format(property_neighborhoods['neighborhood'][i])). \
                            to_table())

        n_next_door = len(result[0])

        neighboring_means = []

        for j in range(n_next_door):
            neighboring_mean = float(dict(result[0][j])['avg_property_value'])
            neighboring_means.append(neighboring_mean)
            
        surrounding_mean = np.nanmean(neighboring_means)
        property_neighborhoods["surrounding_neighborhood_avg_prop_val"].iloc[i] = surrounding_mean
            
    except:
        property_neighborhoods["surrounding_neighborhood_avg_prop_val"] = np.nan

property_neighborhoods        

Unnamed: 0,neighborhood,avg_neighborhood_prop_val,surrounding_neighborhood_avg_prop_val
0,Albany Park,1508.167781,2724.111739
1,Andersonville,2003.563160,2495.194409
2,Archer Heights,839.866143,822.650534
3,Armour Square,264.469474,316.426096
4,Ashburn,153.938723,477.403433
...,...,...,...
90,West Ridge,4243.048035,2462.287584
91,West Town,5338.776832,1391.918080
92,Wicker Park,412.878641,2898.902546
93,Woodlawn,1665.015172,356.648953


In [330]:
# Generating neighborhood-wise number of property crimes and surrounding number of property crimes

crime_neighborhoods = pd.read_csv("../data/crime_neighborhood_aggregated.csv", header=None)
crime_neighborhoods.rename(columns={0:"neighborhood",1:"crime_type",2:"n_property_crimes"}, inplace=True)
crime_neighborhoods = crime_neighborhoods[crime_neighborhoods["crime_type"]=="PROPERTY_CRIME"][["neighborhood","n_property_crimes"]]
crime_neighborhoods = crime_neighborhoods.reset_index()
crime_neighborhoods.drop(['index'], axis=1, inplace=True)

crime_neighborhoods["surrounding_neighborhood_avg_property_crimes"]=np.nan

for i in range(len(crime_neighborhoods['neighborhood'])):
    
    try:
        result = pd.DataFrame(graph.run('match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = "{}" return b'.format(crime_neighborhoods['neighborhood'][i])). \
                            to_table())

        n_next_door = len(result[0])

        neighboring_means = []

        for j in range(n_next_door):
            neighboring_mean = float(dict(result[0][j])['n_property_crimes'])
            neighboring_means.append(neighboring_mean)
            
        surrounding_mean = np.nanmean(neighboring_means)
        crime_neighborhoods["surrounding_neighborhood_avg_property_crimes"].iloc[i] = surrounding_mean
            
    except:
        crime_neighborhoods["surrounding_neighborhood_avg_property_crimes"].iloc[i] = np.nan
     
crime_neighborhoods    
    


Unnamed: 0,neighborhood,n_property_crimes,surrounding_neighborhood_avg_property_crimes
0,Albany Park,977,790.500000
1,Andersonville,198,1119.666667
2,Archer Heights,365,859.000000
3,Armour Square,228,638.000000
4,Ashburn,850,1267.500000
...,...,...,...
93,West Ridge,1356,978.000000
94,West Town,1398,1669.300000
95,Wicker Park,1392,1140.666667
96,Woodlawn,982,1260.000000


In [331]:
# First merge the property and crime matrices

neighborhood_matrix = pd.merge(property_neighborhoods, crime_neighborhoods, on="neighborhood")

# Then merge the sampled coordinates matrix with the neighborhood_matrix such that we are

neighborhood_matrix = address_sample.merge(neighborhood_matrix, on="neighborhood", how="left", left_index=True)

neighborhood_matrix

Unnamed: 0,ADDRDELIV,LATITUDE,LONGITUDE,neighborhood,avg_neighborhood_prop_val,surrounding_neighborhood_avg_prop_val,n_property_crimes,surrounding_neighborhood_avg_property_crimes
22.0,3423 NORTH OCTAVIA AVENUE,41.942233,-87.809053,Dunning,1269.001918,1697.587603,527.0,925.500000
40.0,5523 SOUTH UNIVERSITY AVENUE,41.794321,-87.597818,Hyde Park,311.398813,612.862771,926.0,687.600000
89.0,11800 SOUTH SANGAMON STREET,41.679434,-87.645899,West Pullman,72.620483,100.529277,1072.0,1028.000000
72.0,6536 NORTH GLENWOOD AVENUE,42.001044,-87.665930,Rogers Park,1672.914760,3101.724547,1582.0,1196.500000
61.0,1839 SOUTH AVERS AVENUE,41.855934,-87.720879,North Lawndale,2092.547355,2547.953444,2145.0,2091.200000
...,...,...,...,...,...,...,...,...
35.0,129 EAST 69TH STREET,41.769098,-87.620502,Grand Crossing,269.562012,488.817118,1809.0,1838.142857
32.0,5329 SOUTH NEWLAND AVENUE,41.794991,-87.795040,Garfield Ridge,365.097574,814.302113,740.0,662.200000
41.0,3800 NORTH TROY STREET,41.950410,-87.707141,Irving Park,1683.708461,3087.969458,1219.0,783.428571
6.0,94 SOUTH MENARD AVENUE,41.879044,-87.770125,Austin,515.472553,2333.287671,4146.0,1586.833333


In [332]:
"""

For certain random samples, the sampled coordinate point will not have socioeconomic features. In these cases, 
we'll have to remove the relevant rows from the neighborhood_matrix and corresponding columns from the distance 
matrix.

"""

nan_ix = list(np.argwhere(np.isnan(np.array(neighborhood_matrix.index)))[:,0])
nan_ix1 = list(np.argwhere(np.isnan(np.array(neighborhood_matrix['avg_neighborhood_prop_val'])))[:,0])
nan_ix2 = list(np.argwhere(np.isnan(np.array(neighborhood_matrix['surrounding_neighborhood_avg_prop_val'])))[:,0])
nan_ix3 = list(np.argwhere(np.isnan(np.array(neighborhood_matrix['n_property_crimes'])))[:,0])
nan_ix4 = list(np.argwhere(np.isnan(np.array(neighborhood_matrix['surrounding_neighborhood_avg_property_crimes'])))[:,0])

nan_ix = nan_ix + nan_ix1 + nan_ix2 + nan_ix3 + nan_ix4
nan_ix = set(nan_ix)

not_null_ix = [z for z in range(distance_matrix.shape[1]) if z not in nan_ix]

if nan_ix:
    
    neighborhood_matrix = neighborhood_matrix.reset_index()
    neighborhood_matrix = neighborhood_matrix.iloc[not_null_ix,]
    
    distance_matrix = distance_matrix[:,not_null_ix]
    
    coordinate_matrix = address_sample.iloc[not_null_ix,]
    
    
    
    
    

In [333]:
# Shapes of distaince matrix and neighborhood matrix after removal

print("distance shape:",distance_matrix.shape)
print("neighborhood shape:",neighborhood_matrix.shape)

distance shape: (745, 149123)
neighborhood shape: (149123, 9)


In [334]:
"""

Last, we need to create matrices of demand model coefficients for both constraints of the optimization problem

"""

with open('D_demand_coef.txt', 'r') as D_coefs:
    D_demand_coefs = D_coefs.readlines()

D_demand_coefs = [float(D_demand_coefs[1].split(",")[i].replace("]","").replace("[","").replace("\n","")) for i in range(len(D_demand_coefs[1].split(",")))]    


with open('L_demand_coef.txt', 'r') as L_coefs:
    L_demand_coefs = L_coefs.readlines()

L_demand_coefs = [float(L_demand_coefs[1].split(",")[i].replace("]","").replace("[","").replace("\n","")) for i in range(len(L_demand_coefs[1].split(",")))]    



In [335]:
# Vector of coefficients for the "d" constraint of the optimization problem

d_betas = D_demand_coefs

# Vector of coefficients for the "l" constraint of the optimization problem

l_betas = L_demand_coefs

# Writing beta matrices to csv

np.savetxt('d_betas.csv', d_betas, delimiter=",")
np.savetxt('l_betas.csv', l_betas, delimiter=",")

In [336]:
# Writing final neighborhood matrix to csv

neighborhood_matrix.drop(['index','ADDRDELIV', 'LATITUDE', 'LONGITUDE', 'neighborhood'], axis=1, inplace=True)
neighborhood_matrix = np.array(neighborhood_matrix)
np.savetxt('neighborhood_matrix.csv', neighborhood_matrix, delimiter=",")

In [337]:
# Writing final store matrix to csv

store_matrix.drop(['abi','latitude_2017','longitude_2017'], axis=1, inplace=True)
store_matrix = np.array(store_matrix)
np.savetxt('store_matrix.csv', store_matrix, delimiter=",")

In [338]:
# Writing final distance matrix to csv

np.savetxt('distance_matrix.csv', distance_matrix, delimiter=",")

In [339]:
# Writing final sample coordiante matrix to csv

coordinate_matrix.to_csv("coordinate_matrix.csv", header=True)

In [340]:
# Writing l2 norms to csv

D_length = np.linalg.norm(np.matmul(d_betas,np.matmul(np.transpose(store_matrix), distance_matrix)))
L_length = np.linalg.norm(np.matmul(np.transpose(l_betas),np.transpose(neighborhood_matrix)))
dic = {"D_length":[D_length],"L-length":[L_length]}
l2_norms = pd.DataFrame(dic)
l2_norms.to_csv("l2_norms.csv", index=False)