In [65]:
import sys
sys.path.append('/Users/chrisolen/Documents/uchicago_courses/optimization/project/urban-demand-allocation')

import random

import pandas as pd
import numpy as np
import json

import random

from scipy import spatial

import utilities

from py2neo import Graph

random.seed(10)

In [66]:
# pull in store-level data from the demand model
demand_model = pd.read_csv("../demand_models/demand_model.csv")

# pull in all addresses
addresses = pd.read_csv("../../data/address_book.csv")

# pull in locality shapefiles
with open('../../data/geo_shape_files/neighborhood_reformatted.json','r') as f:
    neighborhoods = json.load(f)
    
# connect to graph db
uri = "bolt://localhost:7687"
graph = Graph(uri, auth=("neo4j", "password"))    

In [67]:
def sample_addresses(addresses, localities, locality_type, sample_size=10000):
    
    # filtering for just Chicago, dropping unnecessary columns
    addresses = addresses[addresses["PLACENAME"]=="Chicago"][["ADDRDELIV","LATITUDE","LONGITUDE"]]
    addresses = addresses.reset_index()
    addresses.drop(["index"], inplace=True, axis=1)
    
    # take a random sample of coordinates to reduce the number of variables of the optimization problem
    rand_index = random.sample(range(0, 582676), sample_size)
    # these will become our optimization variables
    address_sample = addresses.iloc[rand_index] 
    # zip coordinates for locality look up
    zipped_coords = list(zip(address_sample["LONGITUDE"],address_sample["LATITUDE"]))
    
    coord_locality = []
        
    for j in range(len(zipped_coords)):
        result = utilities.point_lookup(localities, zipped_coords[j])
        coord_locality.append(result)
        
    address_sample["{}".format(locality_type)] = coord_locality
    
    return address_sample

def generate_distance_matrix(address_sample, demand_model):

    # filter randomly sampled coordinates into ndarray
    address_lat = np.array(address_sample["LATITUDE"])
    address_long = np.array(address_sample["LONGITUDE"])
    address_coords = np.transpose(np.vstack((address_lat,address_long)))

    # filter store coordinates into ndarray
    demand_model_lat = np.array(demand_model["latitude"])
    demand_model_long = np.array(demand_model["longitude"])
    store_coords = np.transpose(np.vstack((demand_model_lat,demand_model_long)))
    
    # run them throw scipy's handy pairwise distance function
    distance_matrix = spatial.distance.cdist(store_coords, address_coords)
    
    return distance_matrix
    
    
    
    

In [68]:
address_sample = sample_addresses(addresses, neighborhoods, "neighborhood", sample_size=10000)
distance_matrix = generate_distance_matrix(address_sample, demand_model)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [74]:

address_matrix = graph_to_opt_matrix(graph, address_sample, "zestimate", neighborhoods, "neighborhood")
address_matrix = graph_to_opt_matrix(graph, address_matrix, "primary_type", neighborhoods, "neighborhood")
address_matrix = graph_to_opt_matrix(graph, address_matrix, "zestimate", neighborhoods, "neighborhood", edge_relation="NEXT_TO")
address_matrix = graph_to_opt_matrix(graph, address_matrix, "primary_type", neighborhoods, "neighborhood", edge_relation="NEXT_TO")

In [75]:
address_matrix

Unnamed: 0,ADDRDELIV,LATITUDE,LONGITUDE,neighborhood,zestimate,primary_type,zestimate_NEXT_TO,primary_type_NEXT_TO
34167,3423 NORTH OCTAVIA AVENUE,41.942233,-87.809053,Dunning,273667.092694,1426.0,,
449722,5523 SOUTH UNIVERSITY AVENUE,41.794321,-87.597818,Hyde Park,643831.639821,1863.0,,
506002,11800 SOUTH SANGAMON STREET,41.679434,-87.645899,West Pullman,82995.820492,3910.0,,
15552,6536 NORTH GLENWOOD AVENUE,42.001044,-87.665930,Rogers Park,310540.029641,3654.0,,
216109,1839 SOUTH AVERS AVENUE,41.855934,-87.720879,North Lawndale,174195.314741,9029.0,,
...,...,...,...,...,...,...,...,...
278645,1354 NORTH LEAVITT STREET,41.906557,-87.682505,Wicker Park,770058.138376,2340.0,546237.059383,2876.833333
435526,1155 EAST 58TH STREET,41.789249,-87.597676,Hyde Park,643831.639821,1863.0,301787.799402,2149.600000
236146,1114 NORTH CENTRAL PARK AVENUE,41.901417,-87.716774,Humboldt Park,255649.456716,9022.0,411416.449143,4833.000000
462941,8549 SOUTH SEELEY AVENUE,41.737839,-87.673802,Auburn Gresham,105200.677405,7153.0,137678.860480,5471.375000


In [87]:
address_matrix.iloc[800]

ADDRDELIV               2646 WEST CATALPA AVENUE
LATITUDE                                 41.9815
LONGITUDE                               -87.6961
neighborhood                      Lincoln Square
zestimate                                 639063
primary_type                                1892
zestimate_NEXT_TO                         545250
primary_type_NEXT_TO                     2373.44
Name: 56199, dtype: object

In [73]:
def graph_to_opt_matrix(graph, address_frame, feature, localities, locality_type, edge_relation=None):
	
	"""
	:graph: neo4j object from py2neo
	:address_frame: pandas dataframe of features predicting demand (sales) for each relevant address
	:feature: str, location-based feature to be added from the neo4j graph to the address_frame (e.g. avg_property_value)
	:localities: json of locality shape coordinates of search area 
	:locality_type: str, locality type corresponding to the node types to which we're restricting the query (e.g. neighborhood or tract)
	Returns: updated demand dataframe with new feature column
	"""
	
	# pull long_lat coordinates for each relevant address
	address_coordinates = list(zip(address_frame["LONGITUDE"],address_frame["LATITUDE"]))
	# create new column for feature; rename feature if edge relationship True
	if edge_relation:
		modified_feature = feature + "_" + edge_relation
		address_frame[modified_feature] = np.nan
	else:
		address_frame[feature] = np.nan
	# and empty list for those coordinates outside of the immediate search area 
	# (determined by the localities shapefiles)
	outside_search_area = []
	# iterate through lat_long pairs for each address
	for i in range(len(address_coordinates)):

		# get location label based on localities shape file
		point_location = utilities.point_lookup(localities,address_coordinates[i])
		# pull out the feature associated with the locality that the address is located within
		try:
			if not edge_relation:

				
				result = float(dict(pd.DataFrame(graph.run('match (a:{}) \
												where a.name = "{}" return a'.format(locality_type,point_location)). \
												to_table()).iloc[0,0])[feature])
				## coordinates and df indices should be the same ## 
				address_frame[feature].iloc[i] = result
				
			else:
				
				result = pd.DataFrame(graph.run('match (a:{})-[:{}]->(b) \
												where a.name = "{}" \
												return b'.format(locality_type,edge_relation,point_location)). \
												to_table())
				# count number of edge relations returned
				n_edge_relations = len(result[0])
				edge_features = []
				# pull out each of the feature values for each of the edge relations 
				for j in range(n_edge_relations):
					edge_feature = float(dict(result[0][j])[feature])
					edge_features.append(edge_feature)
				# average over edge relation feature values, ignoring any NaNs
				mean_of_edge_features = np.nanmean(edge_features)
		
				## coordinates and df indices should be the same ## 
				demand[modified_feature].iloc[i] = mean_of_edge_features
		
		# it may be that the coordinates don't match any of the localities associated with the search area
		except:
			outside_search_area.append((i, address_coordinates[i]))
	 
	# for the coordinates that lie (usually barely) outside the search area 
	for i in range(len(outside_search_area)): 
	
		point_location = utilities.closest_to(localities,outside_search_area[i][1])
	
		if not edge_relation:
				
			result = float(dict(pd.DataFrame(graph.run('match (a:{}) \
											where a.name = "{}" return a'.format(locality_type,point_location)). \
											to_table()).iloc[0,0])[feature])

			## coordinates and df indices should be the same ## 
			address_frame[feature].iloc[outside_search_area[i][0]] = result
				
		else:
				
			result = pd.DataFrame(graph.run('match (a:{})-[:{}]->(b) \
											where a.name = "{}" \
											return b'.format(locality_type,edge_relation,point_location)). \
											to_table())
			# count number of edge relations returned
			n_edge_relations = len(result[0])
			edge_features = []
			# pull out each of the feature values for each of the edge relations 
			for j in range(n_edge_relations):
				edge_feature = float(dict(result[0][j])[feature])
				edge_features.append(edge_feature)
			# average over edge relation feature values, ignoring any NaNs
			mean_of_edge_features = np.nanmean(edge_features)
		
			## coordinates and df indices should be the same ## 
			address_frame[modified_feature].iloc[outside_search_area[i][0]] = mean_of_edge_features
		
	return address_frame	

In [41]:
property_neighborhoods = pd.read_csv("../../data/properties_neighborhood_aggregated.csv")[['neighborhood','unit_zestimate']]
property_neighborhoods.rename(columns={'unit_zestimate':'avg_neighborhood_prop_val'}, inplace=True)
property_neighborhoods

Unnamed: 0,neighborhood,avg_neighborhood_prop_val
0,Albany Park,1508.167781
1,Andersonville,2003.563160
2,Archer Heights,839.866143
3,Armour Square,264.469474
4,Ashburn,153.938723
...,...,...
90,West Ridge,4243.048035
91,West Town,5338.776832
92,Wicker Park,412.878641
93,Woodlawn,1665.015172


In [11]:
"""

For certain random samples, the sampled coordinate point will not have socioeconomic features. In these cases, 
we'll have to remove the relevant rows from the neighborhood_matrix and corresponding columns from the distance 
matrix.

"""

nan_ix = list(np.argwhere(np.isnan(np.array(neighborhood_matrix.index)))[:,0])
nan_ix1 = list(np.argwhere(np.isnan(np.array(neighborhood_matrix['avg_neighborhood_prop_val'])))[:,0])
nan_ix2 = list(np.argwhere(np.isnan(np.array(neighborhood_matrix['surrounding_neighborhood_avg_prop_val'])))[:,0])
nan_ix3 = list(np.argwhere(np.isnan(np.array(neighborhood_matrix['n_property_crimes'])))[:,0])
nan_ix4 = list(np.argwhere(np.isnan(np.array(neighborhood_matrix['surrounding_neighborhood_avg_property_crimes'])))[:,0])

nan_ix = nan_ix + nan_ix1 + nan_ix2 + nan_ix3 + nan_ix4
nan_ix = set(nan_ix)

not_null_ix = [z for z in range(distance_matrix.shape[1]) if z not in nan_ix]

if nan_ix:
    
    neighborhood_matrix = neighborhood_matrix.reset_index()
    neighborhood_matrix = neighborhood_matrix.iloc[not_null_ix,]
    
    distance_matrix = distance_matrix[:,not_null_ix]
    
    coordinate_matrix = address_sample.iloc[not_null_ix,]
    
    
    
    
    

In [12]:
# Shapes of distaince matrix and neighborhood matrix after removal

print("distance shape:",distance_matrix.shape)
print("neighborhood shape:",neighborhood_matrix.shape)

distance shape: (745, 9937)
neighborhood shape: (9937, 9)


In [13]:
"""

Last, we need to create matrices of demand model coefficients for both constraints of the optimization problem

"""

with open('opt_variables/D_demand_coef.txt', 'r') as D_coefs:
    D_demand_coefs = D_coefs.readlines()

D_demand_coefs = [float(D_demand_coefs[1].split(",")[i].replace("]","").replace("[","").replace("\n","")) for i in range(len(D_demand_coefs[1].split(",")))]    


with open('opt_variables/L_demand_coef.txt', 'r') as L_coefs:
    L_demand_coefs = L_coefs.readlines()

L_demand_coefs = [float(L_demand_coefs[1].split(",")[i].replace("]","").replace("[","").replace("\n","")) for i in range(len(L_demand_coefs[1].split(",")))]    



In [14]:
# Vector of coefficients for the "d" constraint of the optimization problem

d_betas = D_demand_coefs

# Vector of coefficients for the "l" constraint of the optimization problem

l_betas = L_demand_coefs

# Writing beta matrices to csv

np.savetxt('opt_variables/d_betas.csv', d_betas, delimiter=",")
np.savetxt('opt_variables/l_betas.csv', l_betas, delimiter=",")

In [15]:
# Writing final neighborhood matrix to csv

neighborhood_matrix.drop(['index','ADDRDELIV', 'LATITUDE', 'LONGITUDE', 'neighborhood'], axis=1, inplace=True)
neighborhood_matrix = np.array(neighborhood_matrix)
np.savetxt('opt_variables/neighborhood_matrix.csv', neighborhood_matrix, delimiter=",")

In [16]:
# Writing final store matrix to csv

store_matrix.drop(['abi','latitude','longitude'], axis=1, inplace=True)
store_matrix = np.array(store_matrix)
np.savetxt('opt_variables/store_matrix.csv', store_matrix, delimiter=",")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [17]:
# Writing final distance matrix to csv

np.savetxt('opt_variables/distance_matrix.csv', distance_matrix, delimiter=",")

In [18]:
# Writing final sample coordiante matrix to csv

coordinate_matrix.to_csv("opt_variables/coordinate_matrix.csv", header=True)

In [19]:
# Writing l2 norms to csv

D_length = np.linalg.norm(np.matmul(d_betas,np.matmul(np.transpose(store_matrix), distance_matrix)))
L_length = np.linalg.norm(np.matmul(np.transpose(l_betas),np.transpose(neighborhood_matrix)))
dic = {"D_length":[D_length],"L-length":[L_length]}
l2_norms = pd.DataFrame(dic)
l2_norms.to_csv("opt_variables/l2_norms.csv", index=False)

In [45]:
property_neighborhoods

Unnamed: 0,neighborhood,avg_neighborhood_prop_val
0,Albany Park,1508.167781
1,Andersonville,2003.563160
2,Archer Heights,839.866143
3,Armour Square,264.469474
4,Ashburn,153.938723
...,...,...
90,West Ridge,4243.048035
91,West Town,5338.776832
92,Wicker Park,412.878641
93,Woodlawn,1665.015172
