# Prepare Phase: New Business Location Predictor

In [1]:
import numpy as np
import pandas as pd
import os

import acquire as a
from env import api

# imports to deal with location data in Python
import geopandas as gpd
from shapely.geometry import Point

# imports to interact with google places API
import requests
import json

In [2]:
sex = a.get_sex_age_data()
sex.head()

Unnamed: 0,geography,total_pop,sex_ratio,old_age_dep_ratio,child_dep_ratio
0,110100,3153,161.7,29.0,6.4
1,110300,3114,99.9,13.0,43.4
2,110500,2430,100.3,25.4,89.8
3,110600,5645,275.1,15.0,11.6
4,110700,1079,129.1,49.2,35.8


In [3]:
race = a.get_race_data()
race.head()

Unnamed: 0,geography,total_hispanic_latino
0,110100,1758
1,110300,1589
2,110500,1982
3,110600,4763
4,110700,933


In [4]:
income = a.get_income_data()
income.head()

Unnamed: 0,geography,household_med_income
0,110100,52659
1,110300,43875
2,110500,10518
3,110600,16712
4,110700,18700


In [5]:
# now merge this all into one df:
df = pd.merge(pd.merge(sex, race, on='geography'), income, on='geography')
df.head()

Unnamed: 0,geography,total_pop,sex_ratio,old_age_dep_ratio,child_dep_ratio,total_hispanic_latino,household_med_income
0,110100,3153,161.7,29.0,6.4,1758,52659
1,110300,3114,99.9,13.0,43.4,1589,43875
2,110500,2430,100.3,25.4,89.8,1982,10518
3,110600,5645,275.1,15.0,11.6,4763,16712
4,110700,1079,129.1,49.2,35.8,933,18700


In [6]:
# testing the get_census_data function:
census = a.get_census_data()
census.head()

Unnamed: 0,geography,total_pop,sex_ratio,old_age_dep_ratio,child_dep_ratio,total_hispanic_latino,household_med_income,centroid_lat,centroid_long
0,110100,3153,161.7,29.0,6.4,1758,52659,29.4261655,-98.4905264
1,110300,3114,99.9,13.0,43.4,1589,43875,29.4143199,-98.4808476
2,110500,2430,100.3,25.4,89.8,1982,10518,29.417071,-98.5090798
3,110600,5645,275.1,15.0,11.6,4763,16712,29.429211,-98.5078374
4,110700,1079,129.1,49.2,35.8,933,18700,29.4389386,-98.5036913


In [7]:
# check some basic stats on this df
census.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 375 entries, 0 to 374
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   geography              375 non-null    object
 1   total_pop              375 non-null    int64 
 2   sex_ratio              375 non-null    object
 3   old_age_dep_ratio      375 non-null    object
 4   child_dep_ratio        375 non-null    object
 5   total_hispanic_latino  375 non-null    int64 
 6   household_med_income   375 non-null    object
 7   centroid_lat           375 non-null    object
 8   centroid_long          375 non-null    object
dtypes: int64(2), object(7)
memory usage: 29.3+ KB


In [8]:
# there are some columns that are wrong dtypes
# sex_ratio, old_age_ratio, child_dep_ratio, 
# centroid_lat and centroid_long should be floats
# household_med_income should be int
census[census.sex_ratio == '-']


Unnamed: 0,geography,total_pop,sex_ratio,old_age_dep_ratio,child_dep_ratio,total_hispanic_latino,household_med_income,centroid_lat,centroid_long
369,980001,0,-,-,-,2,-,29.6858707,-98.5638545
370,980002,0,-,-,-,0,-,29.3764255,-98.5460439
372,980004,0,-,-,-,1,-,29.5369298,-98.4680109
373,980005,0,-,-,-,32,-,29.5782334,-98.7484151


In [9]:
# It appears that four of the geographies associated with military have 
# populations of zero and/or incomplete data.
# Checking the other 98 tracts
census[census.geography.str.startswith('98')]

Unnamed: 0,geography,total_pop,sex_ratio,old_age_dep_ratio,child_dep_ratio,total_hispanic_latino,household_med_income,centroid_lat,centroid_long
369,980001,0,-,-,-,2,-,29.6858707,-98.5638545
370,980002,0,-,-,-,0,-,29.3764255,-98.5460439
371,980003,1059,227.9,0.8,27.7,197,60781,29.3724905,-98.666017
372,980004,0,-,-,-,1,-,29.5369298,-98.4680109
373,980005,0,-,-,-,32,-,29.5782334,-98.7484151
374,980100,943,125.1,7.1,45.5,700,36667,29.3897891,-98.582398


In [10]:
# it appears that the other two military geographies have complete data.
# quick check if there are other tracts with 0 total_pop
census[census.total_pop == 0]

Unnamed: 0,geography,total_pop,sex_ratio,old_age_dep_ratio,child_dep_ratio,total_hispanic_latino,household_med_income,centroid_lat,centroid_long
369,980001,0,-,-,-,2,-,29.6858707,-98.5638545
370,980002,0,-,-,-,0,-,29.3764255,-98.5460439
372,980004,0,-,-,-,1,-,29.5369298,-98.4680109
373,980005,0,-,-,-,32,-,29.5782334,-98.7484151


In [11]:
# just those four so will drop as it is a small number and very difficult
# to impute values for them with any certainty
census = census[census.total_pop != 0].reset_index()
census.shape

(371, 10)

In [12]:
# okay, try to cast dtypes again:
cols = census.columns.tolist()
cols

['index',
 'geography',
 'total_pop',
 'sex_ratio',
 'old_age_dep_ratio',
 'child_dep_ratio',
 'total_hispanic_latino',
 'household_med_income',
 'centroid_lat',
 'centroid_long']

In [13]:
census = census.astype({'sex_ratio':'float','old_age_dep_ratio':'float',
               'child_dep_ratio':'float','household_med_income':'int',
               'centroid_lat':'float', 'centroid_long':'float'})

In [14]:
census.head()

Unnamed: 0,index,geography,total_pop,sex_ratio,old_age_dep_ratio,child_dep_ratio,total_hispanic_latino,household_med_income,centroid_lat,centroid_long
0,0,110100,3153,161.7,29.0,6.4,1758,52659,29.426165,-98.490526
1,1,110300,3114,99.9,13.0,43.4,1589,43875,29.41432,-98.480848
2,2,110500,2430,100.3,25.4,89.8,1982,10518,29.417071,-98.50908
3,3,110600,5645,275.1,15.0,11.6,4763,16712,29.429211,-98.507837
4,4,110700,1079,129.1,49.2,35.8,933,18700,29.438939,-98.503691


In [15]:
census.dtypes

index                      int64
geography                 object
total_pop                  int64
sex_ratio                float64
old_age_dep_ratio        float64
child_dep_ratio          float64
total_hispanic_latino      int64
household_med_income       int64
centroid_lat             float64
centroid_long            float64
dtype: object

In [16]:
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371 entries, 0 to 370
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  371 non-null    int64  
 1   geography              371 non-null    object 
 2   total_pop              371 non-null    int64  
 3   sex_ratio              371 non-null    float64
 4   old_age_dep_ratio      371 non-null    float64
 5   child_dep_ratio        371 non-null    float64
 6   total_hispanic_latino  371 non-null    int64  
 7   household_med_income   371 non-null    int64  
 8   centroid_lat           371 non-null    float64
 9   centroid_long          371 non-null    float64
dtypes: float64(5), int64(4), object(1)
memory usage: 29.1+ KB


In [17]:
# finally looks clean and ready to add search data to.
# time to incorporate this into a function

In [18]:
yoga_coords = a.get_bexar_yoga_studios(api)

In [19]:
yoga_coords

Unnamed: 0,latitude,longitude,geography
0,29.479815,-98.492192,190800
1,29.466683,-98.490809,190400
2,29.614283,-98.312855,310801
3,29.485545,-98.569729,180604
4,29.727999,-98.094677,310503
5,29.610592,-98.491604,191818
6,29.792908,-98.729822,970500
7,29.534977,-98.521877,191101
8,29.521189,-98.709029,181726
9,29.431522,-98.482394,111100


In [20]:
# l.assign_census_tract(yoga_coords)

In [21]:
yoga_coords.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   latitude   20 non-null     float64
 1   longitude  20 non-null     float64
 2   geography  20 non-null     object 
dtypes: float64(2), object(1)
memory usage: 640.0+ bytes


In [22]:
# yoga_coords = yoga_coords[yoga_coords.census_tract != None]

In [23]:
# yoga_coords['geography'] = ((yoga_coords.census_tract.astype(
    # 'float')) * 100).astype('int').astype('str')

In [24]:
yoga_coords

Unnamed: 0,latitude,longitude,geography
0,29.479815,-98.492192,190800
1,29.466683,-98.490809,190400
2,29.614283,-98.312855,310801
3,29.485545,-98.569729,180604
4,29.727999,-98.094677,310503
5,29.610592,-98.491604,191818
6,29.792908,-98.729822,970500
7,29.534977,-98.521877,191101
8,29.521189,-98.709029,181726
9,29.431522,-98.482394,111100


In [25]:
combined = pd.merge(census, yoga_coords, on = 'geography', how= 'left')
combined

Unnamed: 0,index,geography,total_pop,sex_ratio,old_age_dep_ratio,child_dep_ratio,total_hispanic_latino,household_med_income,centroid_lat,centroid_long,latitude,longitude
0,0,110100,3153,161.7,29.0,6.4,1758,52659,29.426165,-98.490526,,
1,1,110300,3114,99.9,13.0,43.4,1589,43875,29.414320,-98.480848,,
2,2,110500,2430,100.3,25.4,89.8,1982,10518,29.417071,-98.509080,,
3,3,110600,5645,275.1,15.0,11.6,4763,16712,29.429211,-98.507837,,
4,4,110700,1079,129.1,49.2,35.8,933,18700,29.438939,-98.503691,,
...,...,...,...,...,...,...,...,...,...,...,...,...
367,366,192100,1948,86.2,27.1,12.9,1361,75843,29.408368,-98.493617,,
368,367,192200,2922,123.7,7.3,32.8,2247,45691,29.347876,-98.464812,,
369,368,192300,5500,93.4,30.1,27.3,1923,83750,29.566457,-98.501561,,
370,371,980003,1059,227.9,0.8,27.7,197,60781,29.372491,-98.666017,,


In [26]:
combined2 = pd.merge(df, yoga_coords, on = 'geography', how= 'outer')
combined2

Unnamed: 0,geography,total_pop,sex_ratio,old_age_dep_ratio,child_dep_ratio,total_hispanic_latino,household_med_income,latitude,longitude
0,110100,3153.0,161.7,29.0,6.4,1758.0,52659,,
1,110300,3114.0,99.9,13.0,43.4,1589.0,43875,,
2,110500,2430.0,100.3,25.4,89.8,1982.0,10518,,
3,110600,5645.0,275.1,15.0,11.6,4763.0,16712,,
4,110700,1079.0,129.1,49.2,35.8,933.0,18700,,
...,...,...,...,...,...,...,...,...,...
377,310503,,,,,,,29.727999,-98.094677
378,970500,,,,,,,29.792908,-98.729822
379,970302,,,,,,,29.796768,-98.754554
380,310502,,,,,,,29.716761,-98.072319


In [27]:
combined['has_yoga'] = combined.latitude >= 0

In [28]:
combined

Unnamed: 0,index,geography,total_pop,sex_ratio,old_age_dep_ratio,child_dep_ratio,total_hispanic_latino,household_med_income,centroid_lat,centroid_long,latitude,longitude,has_yoga
0,0,110100,3153,161.7,29.0,6.4,1758,52659,29.426165,-98.490526,,,False
1,1,110300,3114,99.9,13.0,43.4,1589,43875,29.414320,-98.480848,,,False
2,2,110500,2430,100.3,25.4,89.8,1982,10518,29.417071,-98.509080,,,False
3,3,110600,5645,275.1,15.0,11.6,4763,16712,29.429211,-98.507837,,,False
4,4,110700,1079,129.1,49.2,35.8,933,18700,29.438939,-98.503691,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,366,192100,1948,86.2,27.1,12.9,1361,75843,29.408368,-98.493617,,,False
368,367,192200,2922,123.7,7.3,32.8,2247,45691,29.347876,-98.464812,,,False
369,368,192300,5500,93.4,30.1,27.3,1923,83750,29.566457,-98.501561,,,False
370,371,980003,1059,227.9,0.8,27.7,197,60781,29.372491,-98.666017,,,False


In [29]:
combined[combined.has_yoga == True]

Unnamed: 0,index,geography,total_pop,sex_ratio,old_age_dep_ratio,child_dep_ratio,total_hispanic_latino,household_med_income,centroid_lat,centroid_long,latitude,longitude,has_yoga
6,6,111100,3712,119.3,23.4,17.8,1736,55179,29.440625,-98.488024,29.431522,-98.482394,True
17,17,120701,6597,84.3,20.2,38.1,3044,56332,29.512869,-98.469458,29.509964,-98.469,True
37,37,121204,7878,105.3,26.7,27.0,3909,38662,29.529445,-98.420127,29.520309,-98.431504,True
240,240,180604,6523,119.9,21.2,30.1,4306,47372,29.481271,-98.570633,29.485545,-98.569729,True
241,241,180701,4171,98.2,13.8,16.5,2095,39240,29.500747,-98.570023,29.501858,-98.575409,True
279,279,181726,11131,96.9,7.4,58.8,7405,104746,29.531274,-98.694561,29.521189,-98.709029,True
295,295,181819,7680,97.0,6.2,29.5,3329,62560,29.565075,-98.612493,29.564069,-98.600784,True
305,305,182001,5600,103.0,13.4,46.1,2407,105404,29.566147,-98.662425,29.563809,-98.66156,True
315,315,190400,4270,94.6,16.3,8.2,1149,84821,29.463074,-98.489448,29.466683,-98.490809,True
323,323,190800,1856,77.1,28.7,30.4,470,141750,29.474926,-98.486631,29.479815,-98.492192,True


In [30]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 372 entries, 0 to 371
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  372 non-null    int64  
 1   geography              372 non-null    object 
 2   total_pop              372 non-null    int64  
 3   sex_ratio              372 non-null    float64
 4   old_age_dep_ratio      372 non-null    float64
 5   child_dep_ratio        372 non-null    float64
 6   total_hispanic_latino  372 non-null    int64  
 7   household_med_income   372 non-null    int64  
 8   centroid_lat           372 non-null    float64
 9   centroid_long          372 non-null    float64
 10  latitude               14 non-null     float64
 11  longitude              14 non-null     float64
 12  has_yoga               372 non-null    bool   
dtypes: bool(1), float64(7), int64(4), object(1)
memory usage: 38.1+ KB


In [31]:
census.shape

(371, 10)