In [92]:
import pandas as pd
import numpy as np

import utilities
import json

from py2neo import Graph

from sklearn.linear_model import LinearRegression

In [93]:

cols=['year',
 'abi',
 'ticker',
 'company',
 'address_line_1',
 'city',
 'zipcode',
 'location_employee_size_code',
 'location_sales_volume_code',
 'primary_naics_code',
 'sic_code',
 'sic6_descriptions_sic',
 'business_status_code',
 'office_size_code',
 'company_holding_status',
 'parent_employee_size_code',
 'parent_sales_volume_code',
 'census_tract',
 'cbsa_code',
 'year_established',
 'employee_size_location',
 'sales_volume_location',
 'parent_actual_employee_size',
 'parent_actual_sales_volume',
 'latitude',
 'longitude']

bus = pd.read_csv("../data/chi_bus.csv", sep='\t', names=cols)

# Filtering for useful features

bus = bus.loc[:,['abi','primary_naics_code','company','year','business_status_code','company_holding_status',
           'census_tract','year_established',
           'employee_size_location','sales_volume_location',
           'latitude','longitude']]
bus.shape

  interactivity=interactivity, compiler=compiler, result=result)


(1144876, 12)

In [94]:
# Understanding business status code

"""
Business Status Code:

1: Headquarter
2: Branch
3: Subsidiary
9: Single Location

"""

bus['business_status_code'].value_counts()

9    1034553
2     101871
3       4241
1       4211
Name: business_status_code, dtype: int64

In [95]:
# Understanding business status holding

"""
Business Status Holding:

I'm gonna assume the 712 are publicly traded and the others aren't?

"""

bus['company_holding_status'].isna().value_counts()

True     1144164
False        712
Name: company_holding_status, dtype: int64

In [96]:
# Understanding company holding status

bus['company_holding_status'].isna().value_counts()

True     1144164
False        712
Name: company_holding_status, dtype: int64

In [97]:
# How many samples don't have naics codes?

bus['primary_naics_code'].isna().value_counts()

False    1143647
True        1229
Name: primary_naics_code, dtype: int64

In [98]:
# Remove naics code nulls and convert to string

bus = bus[~bus['primary_naics_code'].isna()]
bus['primary_naics_code'] = (bus['primary_naics_code'].astype(int)).astype(str)

In [99]:
# Filtering for naics codes

"""
Grocery Store-Related NAICS Codes:

NAICS CODE 445110: Supermarkets and Other Grocery (except Convenience)
NAICS CODE 447110: Gasoline Stations with Convenience Stores
NAICS CODE 445120: Convenience Stores

"""

#groc = bus[(bus['primary_naics_code'].str.contains('445110')) | (bus['primary_naics_code'].str.contains('447110')) | (bus['primary_naics_code'].str.contains('445120'))]
groc = bus[(bus['primary_naics_code'].str.contains('445110'))]
groc.shape

(10469, 12)

In [100]:
# Understanding year distribution 

groc['year'].value_counts()

2013    1430
2014    1257
2012    1213
2010    1125
2015    1122
2011    1109
2009    1107
2016     964
2017     826
2018     316
Name: year, dtype: int64

In [101]:
# Filtering for 2016 and 2017

groc_2016 = groc[groc['year']==2016]
groc_2017 = groc[groc['year']==2017]

# Concatenating 2016 and 2017 samples

demand = pd.merge(left=groc_2017, right=groc_2016, left_on='abi', right_on='abi')
demand.shape

(767, 23)

In [102]:
# Removing additional less-useful features

demand.drop(['primary_naics_code_y','company_y','year_y','business_status_code_y',
            'company_holding_status_y','census_tract_y','year_established_y',
            'company_y','year_y','business_status_code_y',
            'latitude_y','longitude_y'], inplace=True, axis=1)

cols = [demand.columns[i].replace("_x","_2017").replace("_y","_2016") for i in range(len(demand.columns))]
replacing = {i:j for (i,j) in zip(demand.columns,cols)}
demand.rename(columns=replacing, inplace=True)

In [103]:
# Connect to graph db

uri = "bolt://localhost:7687"
graph = Graph(uri, auth=("neo4j", "password"))



In [104]:
# Pulling out coordinates for each store

coordinates = list(zip(demand["longitude_2017"],demand["latitude_2017"]))

# Pulling neighborhood polygons
with open('dicts/neighborhood_polys.json','r') as f:
    neighborhoods = json.load(f)

In [105]:
# Loading average property value for the neighborhood in which the stores is located

demand['neighborhood_avg_property_value'] = np.nan

outside_city = []

for i in range(len(coordinates)):

    point_district = utilities.point_lookup(neighborhoods,coordinates[i])
    
    try:
        result = float(dict(pd.DataFrame(graph.run('match (a:neighborhood) where a.name = "{}" return a'.format(point_district)). \
                            to_table()).iloc[0,0])['avg_property_value'])
        ## coordinates and df indices should be the same ## 
        demand['neighborhood_avg_property_value'].iloc[i] = result
        
    except:
        outside_city.append((i, coordinates[i]))
     
    
    
for i in range(len(outside_city)): # for the few coordinates that lie just outside the city
    
    point_district = utilities.closest_to(neighborhoods,outside_city[i][1])
    
    result = float(dict(pd.DataFrame(graph.run('match (a:neighborhood) where a.name = "{}" return a'.format(point_district)). \
                            to_table()).iloc[0,0])['avg_property_value'])
    
    
    demand['neighborhood_avg_property_value'].iloc[outside_city[i][0]] = result
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [106]:
# Loading number of property crimes per neighborhood into demand model for each store

demand['neighborhood_property_crimes'] = np.nan

outside_city = []

for i in range(len(coordinates)):

    point_district = utilities.point_lookup(neighborhoods,coordinates[i])
    
    try:
        result = float(dict(pd.DataFrame(graph.run('match (a:neighborhood) where a.name = "{}" return a'.format(point_district)). \
                            to_table()).iloc[0,0])['n_property_crimes'])
        
        ## coordinates and df indices should be the same ## 
        demand['neighborhood_property_crimes'].iloc[i] = result
        
    except:
        outside_city.append((i, coordinates[i]))
     
    
for i in range(len(outside_city)): # for the few coordinates that lie just outside the city
    
    point_district = utilities.closest_to(neighborhoods,outside_city[i][1])
    
    result = float(dict(pd.DataFrame(graph.run('match (a:neighborhood) where a.name = "{}" return a'.format(point_district)). \
                            to_table()).iloc[0,0])['n_property_crimes'])
    
    
    demand['neighborhood_property_crimes'].iloc[outside_city[i][0]] = result
    

In [107]:
# Loading average property values of neighborhoods surrounding the neighborhood of the given store

demand['surrounding_neighborhood_avg_property_value'] = np.nan

outside_city = []

for i in range(len(coordinates)):

    point_district = utilities.point_lookup(neighborhoods,coordinates[i])
    
    try:
        result = pd.DataFrame(graph.run('match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = "{}" return b'.format(point_district)). \
                            to_table())

        n_next_door = len(result[0])

        neighboring_means = []

        for j in range(n_next_door):
            neighboring_mean = float(dict(result[0][j])['avg_property_value'])
            neighboring_means.append(neighboring_mean)
            
        surrounding_mean = np.nanmean(neighboring_means)
        
        ## coordinates and df indices should be the same ## 
        demand['surrounding_neighborhood_avg_property_value'].iloc[i] = surrounding_mean
            
    except:
        outside_city.append((i, coordinates[i]))
     
    
    
for i in range(len(outside_city)): # for the few coordinates that lie just outside the city
    
    point_district = utilities.closest_to(neighborhoods,outside_city[i][1])
    
    result = pd.DataFrame(graph.run('match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = "{}" return b'.format(point_district)). \
                            to_table())

    n_next_door = len(result[0])

    neighboring_means = []

    for i in range(n_next_door):
        neighboring_mean = float(dict(result[0][i])['avg_property_value'])
        neighboring_means.append(neighboring_mean)
        
    surrounding_mean = np.nanmean(neighboring_means)
    
    demand['surrounding_neighborhood_avg_property_value'].iloc[outside_city[i][0]] = surrounding_mean
    

In [108]:
# Loading number of property crimes of neighborhoods surrounding the neighborhood of the given store

demand['surrounding_neighborhood_property_crimes'] = np.nan

outside_city = []

for i in range(len(coordinates)):

    point_district = utilities.point_lookup(neighborhoods,coordinates[i])

    try:
        result = pd.DataFrame(graph.run('match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = "{}" return b'.format(point_district)). \
                            to_table())

        n_next_door = len(result[0])

        neighboring_means = []

        for j in range(n_next_door):
            neighboring_mean = float(dict(result[0][j])['n_property_crimes'])
            neighboring_means.append(neighboring_mean)
            
        surrounding_mean = np.nanmean(neighboring_means)
        
        ## coordinates and df indices should be the same ## 
        demand['surrounding_neighborhood_property_crimes'].iloc[i] = surrounding_mean
            
    except:
        outside_city.append((i, coordinates[i]))
        
    
for i in range(len(outside_city)): # for the few coordinates that lie just outside the city
    
    point_district = utilities.closest_to(neighborhoods,outside_city[i][1])
    
    result = pd.DataFrame(graph.run('match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = "{}" return b'.format(point_district)). \
                            to_table())

    n_next_door = len(result[0])

    neighboring_means = []

    for i in range(n_next_door):
        neighboring_mean = float(dict(result[0][i])['n_property_crimes'])
        neighboring_means.append(neighboring_mean)
        
    surrounding_mean = np.nanmean(neighboring_means)
    
    demand['surrounding_neighborhood_property_crimes'].iloc[outside_city[i][0]] = surrounding_mean
    

In [113]:
# Least squares model for 'D' component of objective function:

D_demand = demand[(demand['sales_volume_location_2017'].notna()) &
               (demand['sales_volume_location_2016'].notna()) &
               (demand['neighborhood_property_crimes'].notna()) &
               (demand['neighborhood_avg_property_value'].notna()) &
               (demand['surrounding_neighborhood_avg_property_value'].notna()) &
               (demand['surrounding_neighborhood_property_crimes'].notna())]

D_demand_features = ["sales_volume_location_2016","neighborhood_avg_property_value",
                     "neighborhood_property_crimes",
           "surrounding_neighborhood_avg_property_value","surrounding_neighborhood_property_crimes"]

D_X = np.array(D_demand[D_demand_features])
D_y = np.array(D_demand["sales_volume_location_2017"])


# Least squares model for 'L' component of objective functon

L_demand = demand[(demand['sales_volume_location_2017'].notna()) &
               (demand['neighborhood_property_crimes'].notna()) &
               (demand['neighborhood_avg_property_value'].notna()) &
               (demand['surrounding_neighborhood_avg_property_value'].notna()) &
               (demand['surrounding_neighborhood_property_crimes'].notna())]

L_demand_features = ["neighborhood_avg_property_value",
                     "neighborhood_property_crimes",
           "surrounding_neighborhood_avg_property_value","surrounding_neighborhood_property_crimes"]

L_X = np.array(L_demand[L_demand_features])
L_y = np.array(L_demand["sales_volume_location_2017"])




In [115]:
D_lr = LinearRegression()

D_model = D_lr.fit(D_X,D_y)

L_lr = LinearRegression()

L_model = L_lr.fit(L_X,L_y)


In [116]:
D_coef = list(D_model.coef_)
D_coef

[0.9317133788087207,
 -0.05389048407535676,
 -0.21656957369296354,
 0.031990880285334006,
 0.2202127995619555]

In [117]:
L_coef = list(L_model.coef_)
L_coef

[-0.2547677851422151,
 -0.5263550451979571,
 1.1614176431445236,
 -2.0225396857646305]

In [118]:
# save as txt file

D_model_params = [D_demand_features, L_coef]

with open('D_demand_coef.txt', 'w') as model_text:
    for listitem in D_model_params:
        model_text.write('%s\n' % listitem)
        
        # save as txt file

L_model_params = [L_demand_features, L_coef]

with open('L_demand_coef.txt', 'w') as model_text:
    for listitem in L_model_params:
        model_text.write('%s\n' % listitem)