In [235]:
import pandas as pd
import numpy as np
import json

from py2neo import Graph

from sklearn.linear_model import LinearRegression

import utilities
from demand_models import filter_business_data

ModuleNotFoundError: No module named 'demand_models'

In [228]:
df_types = pd.read_csv('../data/dtypes.csv')['dtypes']
bus = pd.read_csv("../data/chi_bus_cleaned.csv",dtype=df_types.to_dict())

In [229]:
def parse_naics(df_value, naics):
    
    """
    filter provided dataframe for naics codes. 
    mean to be used in df.apply() 
    """

    results = []
    for i in naics:
        
        naics_length = len(i)
        truncated_naics = df_value[:naics_length]
        if truncated_naics == i:
            results.append(True)
        else:
            results.append(False)
            
    return any(results)
    
    

In [230]:
def business_search(years, naics_codes):
    
    """
    :years: list of integer years you would like to select out
    :naics_codes: list of string naics codes you would like to select out,
                will match only up to the length of the code provided
    Returns: filtered dataframe of business data 
    """
    
    assert(isinstance(years,list)), "\
        years argument must be of type list"
    
    assert(all(element for element in [isinstance(i,int) for i in years])), "\
        all years must be of type int"
    
    assert(isinstance(naics_codes,list)), "\
        naics_code argument must be of type list"
    
    assert(all(element for element in [isinstance(i,str) for i in naics_codes])), "\
        all naics_codes must be of type str"
    
    df_types = pd.read_csv('../data/dtypes.csv')['dtypes']
    bus = pd.read_csv("../data/chi_bus_cleaned.csv",dtype=df_types.to_dict())
    
    bus = bus[bus['primary_naics_code'].apply(parse_naics, args=[naics_codes])]
    
    return bus[bus['year'].isin(years)]
    
    
    

In [233]:
bus = business_search([2013,2015,2016], ['445110','335'])

In [234]:
# Understanding year distribution 

bus['year'].value_counts()

2013    1518
2015    1203
2016    1034
Name: year, dtype: int64

In [15]:
# Filtering for 2016 and 2017

groc_2016 = groc[groc['year']==2016]
groc_2017 = groc[groc['year']==2017]

# Concatenating 2016 and 2017 samples

demand = pd.merge(left=groc_2017, right=groc_2016, left_on='abi', right_on='abi')
demand.shape

(767, 23)

In [16]:
# Removing additional less-useful features

demand.drop(['primary_naics_code_y','company_y','year_y','business_status_code_y',
            'company_holding_status_y','census_tract_y','year_established_y',
            'company_y','year_y','business_status_code_y',
            'latitude_y','longitude_y'], inplace=True, axis=1)

cols = [demand.columns[i].replace("_x","_2017").replace("_y","_2016") for i in range(len(demand.columns))]
replacing = {i:j for (i,j) in zip(demand.columns,cols)}
demand.rename(columns=replacing, inplace=True)

In [17]:
# Connect to graph db

uri = "bolt://localhost:7687"
graph = Graph(uri, auth=("neo4j", "password"))



In [18]:
# Pulling out coordinates for each store

coordinates = list(zip(demand["longitude_2017"],demand["latitude_2017"]))

# Pulling neighborhood polygons
with open('dicts/neighborhood_polys.json','r') as f:
    neighborhoods = json.load(f)

In [19]:
# Loading average property value for the neighborhood in which the stores is located

demand['neighborhood_avg_property_value'] = np.nan

outside_city = []

for i in range(len(coordinates)):

    point_district = utilities.point_lookup(neighborhoods,coordinates[i])
    
    try:
        result = float(dict(pd.DataFrame(graph.run('match (a:neighborhood) where a.name = "{}" return a'.format(point_district)). \
                            to_table()).iloc[0,0])['avg_property_value'])
        ## coordinates and df indices should be the same ## 
        demand['neighborhood_avg_property_value'].iloc[i] = result
        
    except:
        outside_city.append((i, coordinates[i]))
     
    
    
for i in range(len(outside_city)): # for the few coordinates that lie just outside the city
    
    point_district = utilities.closest_to(neighborhoods,outside_city[i][1])
    
    result = float(dict(pd.DataFrame(graph.run('match (a:neighborhood) where a.name = "{}" return a'.format(point_district)). \
                            to_table()).iloc[0,0])['avg_property_value'])
    
    
    demand['neighborhood_avg_property_value'].iloc[outside_city[i][0]] = result
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [20]:
# Loading number of property crimes per neighborhood into demand model for each store

demand['neighborhood_property_crimes'] = np.nan

outside_city = []

for i in range(len(coordinates)):

    point_district = utilities.point_lookup(neighborhoods,coordinates[i])
    
    try:
        result = float(dict(pd.DataFrame(graph.run('match (a:neighborhood) where a.name = "{}" return a'.format(point_district)). \
                            to_table()).iloc[0,0])['n_property_crimes'])
        
        ## coordinates and df indices should be the same ## 
        demand['neighborhood_property_crimes'].iloc[i] = result
        
    except:
        outside_city.append((i, coordinates[i]))
     
    
for i in range(len(outside_city)): # for the few coordinates that lie just outside the city
    
    point_district = utilities.closest_to(neighborhoods,outside_city[i][1])
    
    result = float(dict(pd.DataFrame(graph.run('match (a:neighborhood) where a.name = "{}" return a'.format(point_district)). \
                            to_table()).iloc[0,0])['n_property_crimes'])
    
    
    demand['neighborhood_property_crimes'].iloc[outside_city[i][0]] = result
    

In [21]:
# Loading average property values of neighborhoods surrounding the neighborhood of the given store

demand['surrounding_neighborhood_avg_property_value'] = np.nan

outside_city = []

for i in range(len(coordinates)):

    point_district = utilities.point_lookup(neighborhoods,coordinates[i])
    
    try:
        result = pd.DataFrame(graph.run('match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = "{}" return b'.format(point_district)). \
                            to_table())

        n_next_door = len(result[0])

        neighboring_means = []

        for j in range(n_next_door):
            neighboring_mean = float(dict(result[0][j])['avg_property_value'])
            neighboring_means.append(neighboring_mean)
            
        surrounding_mean = np.nanmean(neighboring_means)
        
        ## coordinates and df indices should be the same ## 
        demand['surrounding_neighborhood_avg_property_value'].iloc[i] = surrounding_mean
            
    except:
        outside_city.append((i, coordinates[i]))
     
    
    
for i in range(len(outside_city)): # for the few coordinates that lie just outside the city
    
    point_district = utilities.closest_to(neighborhoods,outside_city[i][1])
    
    result = pd.DataFrame(graph.run('match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = "{}" return b'.format(point_district)). \
                            to_table())

    n_next_door = len(result[0])

    neighboring_means = []

    for i in range(n_next_door):
        neighboring_mean = float(dict(result[0][i])['avg_property_value'])
        neighboring_means.append(neighboring_mean)
        
    surrounding_mean = np.nanmean(neighboring_means)
    
    demand['surrounding_neighborhood_avg_property_value'].iloc[outside_city[i][0]] = surrounding_mean
    

In [22]:
# Loading number of property crimes of neighborhoods surrounding the neighborhood of the given store

demand['surrounding_neighborhood_property_crimes'] = np.nan

outside_city = []

for i in range(len(coordinates)):

    point_district = utilities.point_lookup(neighborhoods,coordinates[i])

    try:
        result = pd.DataFrame(graph.run('match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = "{}" return b'.format(point_district)). \
                            to_table())

        n_next_door = len(result[0])

        neighboring_means = []

        for j in range(n_next_door):
            neighboring_mean = float(dict(result[0][j])['n_property_crimes'])
            neighboring_means.append(neighboring_mean)
            
        surrounding_mean = np.nanmean(neighboring_means)
        
        ## coordinates and df indices should be the same ## 
        demand['surrounding_neighborhood_property_crimes'].iloc[i] = surrounding_mean
            
    except:
        outside_city.append((i, coordinates[i]))
        
    
for i in range(len(outside_city)): # for the few coordinates that lie just outside the city
    
    point_district = utilities.closest_to(neighborhoods,outside_city[i][1])
    
    result = pd.DataFrame(graph.run('match (a:neighborhood)-[:NEXT_TO]->(b) where a.name = "{}" return b'.format(point_district)). \
                            to_table())

    n_next_door = len(result[0])

    neighboring_means = []

    for i in range(n_next_door):
        neighboring_mean = float(dict(result[0][i])['n_property_crimes'])
        neighboring_means.append(neighboring_mean)
        
    surrounding_mean = np.nanmean(neighboring_means)
    
    demand['surrounding_neighborhood_property_crimes'].iloc[outside_city[i][0]] = surrounding_mean
    

In [23]:
# Least squares model for 'D' component of objective function:

D_demand = demand[(demand['sales_volume_location_2017'].notna()) &
               (demand['sales_volume_location_2016'].notna()) &
               (demand['neighborhood_property_crimes'].notna()) &
               (demand['neighborhood_avg_property_value'].notna()) &
               (demand['surrounding_neighborhood_avg_property_value'].notna()) &
               (demand['surrounding_neighborhood_property_crimes'].notna())]

D_demand_features = ["sales_volume_location_2016","neighborhood_avg_property_value",
                     "neighborhood_property_crimes",
           "surrounding_neighborhood_avg_property_value","surrounding_neighborhood_property_crimes"]

D_X = np.array(D_demand[D_demand_features])
D_y = np.array(D_demand["sales_volume_location_2017"])


# Least squares model for 'L' component of objective functon

L_demand = demand[(demand['sales_volume_location_2017'].notna()) &
               (demand['neighborhood_property_crimes'].notna()) &
               (demand['neighborhood_avg_property_value'].notna()) &
               (demand['surrounding_neighborhood_avg_property_value'].notna()) &
               (demand['surrounding_neighborhood_property_crimes'].notna())]

L_demand_features = ["neighborhood_avg_property_value",
                     "neighborhood_property_crimes",
           "surrounding_neighborhood_avg_property_value","surrounding_neighborhood_property_crimes"]

L_X = np.array(L_demand[L_demand_features])
L_y = np.array(L_demand["sales_volume_location_2017"])




In [24]:
D_lr = LinearRegression()

D_model = D_lr.fit(D_X,D_y)

L_lr = LinearRegression()

L_model = L_lr.fit(L_X,L_y)


In [25]:
D_coef = list(D_model.coef_)
D_coef

[0.9317133788087207,
 -0.05389048407535676,
 -0.21656957369296354,
 0.031990880285334006,
 0.2202127995619555]

In [26]:
L_coef = list(L_model.coef_)
L_coef

[-0.2547677851422151,
 -0.5263550451979571,
 1.1614176431445236,
 -2.0225396857646305]

In [27]:
# save as txt file

D_model_params = [D_demand_features, D_coef]

with open('opt_variables/D_demand_coef.txt', 'w') as model_text:
    for listitem in D_model_params:
        model_text.write('%s\n' % listitem)
        
        # save as txt file

L_model_params = [L_demand_features, L_coef]

with open('opt_variables/L_demand_coef.txt', 'w') as model_text:
    for listitem in L_model_params:
        model_text.write('%s\n' % listitem)