In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style('whitegrid')

In this notebook, we outline the process of gathering census data for the training set. We hope that these additional features will improve the accuracy of our model's predictions.

In [5]:

# Load the restaurant training set. NOTE(review): the path is an absolute, machine-specific
# location, so this cell only runs as-is on the original author's machine.
rd_train = pd.read_csv("/Users/dominiquekemp/Documents/GitHub/'Will It Restaurant?'/train_data.csv")



In [26]:
# Read the Census API key from disk. Surrounding whitespace is stripped so that a
# trailing newline in the key file does not become part of the key.
with open("/Users/dominiquekemp/Documents/GitHub/'Will It Restaurant?'/census_APIkey.csv", "r") as file:
    CENSUS_API_KEY = file.read().strip()

In [6]:
import census
from census import Census
# Census client built from the API key read above; it is later passed to
# get_acs_data_for_county_tracts as `census_obj`.
c = Census(CENSUS_API_KEY)
import requests

### Gathering of Census Tracts

In [7]:
#a function for obtaining the census tract of a location
def get_census_tract(lat, lon, timeout=30):
    """
    Look up the census tract containing a latitude/longitude point.

    Queries the Census geocoder's coordinates endpoint and extracts the first
    entry under 'Census Tracts' in the response.

    Args:
        lat: Latitude of the location (sent as the geocoder's "y" parameter).
        lon: Longitude of the location (sent as the geocoder's "x" parameter).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict with the keys "tract", "county", "state" and "geoid", or None
        when the lookup fails for any reason (network error, HTTP error status,
        non-JSON body, or a response with no census tract for the point).
    """
    url = "https://geocoding.geo.census.gov/geocoder/geographies/coordinates"
    params = {
        "x": lon,
        "y": lat,
        "benchmark": "Public_AR_Current",
        "vintage": "Current_Current",
        "format": "json"
    }

    try:
        # The request and JSON decoding live inside the try so that a single
        # failed lookup yields None instead of aborting a whole batch of lookups.
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        tract_info = data['result']['geographies']['Census Tracts'][0]
        return {
            "tract": tract_info['TRACT'],
            "county": tract_info['COUNTY'],
            "state": tract_info['STATE'],
            "geoid": tract_info['GEOID']
        }
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # RequestException: connection/timeout/HTTP status problems.
        # ValueError: body was not valid JSON.
        # KeyError/IndexError/TypeError: JSON lacked the expected tract structure.
        return None

In [8]:
# Look up the census tract for every row of the training set, in row order.
# The coordinate array is extracted once up front; rebuilding it inside the loop
# would copy both columns on every iteration.
coordinates = rd_train[['latitude', 'longitude']].values
all_tracts = []
for lat, lon in coordinates:
    # get_census_tract returns None when a lookup fails, so all_tracts may contain None.
    all_tracts.append(get_census_tract(lat, lon))
    

In [9]:
#consolidate identical tracts
# Failed lookups (None entries) are skipped. Tracts are keyed on their full GEOID
# rather than the bare tract number, because a tract number is only unique within
# a county; keying on it alone would discard same-numbered tracts elsewhere.
unique_tracts = []
seen_geoids = set()
for tract in all_tracts:
    if tract is None:
        continue
    geoid = tract['geoid']
    if geoid not in seen_geoids:
        seen_geoids.add(geoid)
        unique_tracts.append(tract)

# Tract numbers of the retained tracts, in first-seen order.
tract_nos = [tract['tract'] for tract in unique_tracts]

In [10]:
# Parallel lists: counties[k] and tracts[k] describe the k-th unique tract.
counties = [entry['county'] for entry in unique_tracts]
tracts = [entry['tract'] for entry in unique_tracts]

In [11]:
#write all of the tract information to a file
# Each tract dict is written as its str() form, one per line.
with open("/Users/dominiquekemp/Documents/GitHub/'Will It Restaurant?'/census_tracts.csv", 'w+') as out_file:
    out_file.writelines(str(entry) + "\n" for entry in unique_tracts)

### Gathering of ACS Data for 2021

In [12]:
#define a function for producing the desired census data 
def get_acs_data_for_county_tracts(census_obj, county_fips, tract_codes, variables, year=2021, state_fips='42'):
    """
    Retrieves ACS5 data for a list of county tracts and specified variables.

    One request is issued per tract; county_fips[k] is the county that
    tract_codes[k] belongs to.

    Args:
        census_obj: A Census object instantiated with your API key.
        county_fips: A list of county FIPS codes, parallel to tract_codes.
        tract_codes: A list of tract codes.
        variables: A list or tuple of ACS variables to retrieve.
        year: The year of the ACS data (optional, defaults to 2021).
        state_fips: FIPS code of the state that all tracts belong to
            (optional, defaults to '42').

    Returns:
        A pandas DataFrame containing the requested data, with the rows
        returned for each tract appended in the order of tract_codes.

    Raises:
        ValueError: If county_fips has fewer entries than tract_codes.
    """
    if len(county_fips) < len(tract_codes):
        raise ValueError(
            f"county_fips has {len(county_fips)} entries but tract_codes has "
            f"{len(tract_codes)}; every tract needs a matching county."
        )

    all_data = []
    for county, tract in zip(county_fips, tract_codes):
        # The state value is interpolated from a variable rather than a quoted
        # literal: reusing the enclosing quote character inside an f-string
        # replacement field is a syntax error before Python 3.12.
        geo_filter = {'for': f'tract:{tract}', 'in': f'state:{state_fips} county:{county}'}

        data = census_obj.acs5.get(variables, geo_filter, year=year)

        all_data.extend(data)

    return pd.DataFrame(all_data)

The census variables are defined below:





Median Household Income (B19013_001E)

Poverty Rate (% Below poverty level) (B17001_002E / B17001_001E)

Education level (% with Bachelor's Degree or Higher) (B15003_022E/B15003_001E)

Race/Ethnicity data:

Total Population B02001_001E

pct White B02001_002E / tot pop

pct Black B02001_003E / tot pop

pct Asian B02001_005E / tot pop

pct Hispanic/Latino B03003_003E / tot pop (ethnicity tracked differently)

Immigration data: % Foreign Born Immigrant B05002_013E / B05002_001E

Age data: Median Age B01002_001E 

Neighborhood turnover: B07003_017E / B07003_001E

In [13]:
# ACS variables to request. The order is significant: it is kept exactly as
# originally written because later cells depend on the resulting column layout.
variables_to_get = (
    "B19013_001E",  # median household income
    'B17001_002E',  # poverty rate numerator
    'B17001_001E',  # poverty rate denominator
    'B15003_001E',  # education level denominator
    'B15003_022E',  # education level numerator
    'B02001_001E',  # total population
    'B02001_002E',  # White
    'B02001_003E',  # Black
    'B02001_005E',  # Asian
    'B03003_003E',  # Hispanic/Latino
    'B05002_013E',  # foreign born numerator
    'B05002_001E',  # foreign born denominator
    'B01002_001E',  # median age
    'B07003_017E',  # neighborhood turnover numerator
    'B07003_001E',  # neighborhood turnover denominator
)

# One API request per unique tract, using the client `c` created earlier.
acs_data = get_acs_data_for_county_tracts(
    c,
    counties,
    tracts,
    variables_to_get,
)


Let us save this dataframe to a CSV file in order to view its initial form.

In [14]:
# Save the raw, uncleaned API pull. NOTE: the cleaned frame is later written to this
# same path, which overwrites this file.
acs_data.to_csv("/Users/dominiquekemp/Documents/GitHub/'Will It Restaurant?'/census_data.csv", index = False)

### Data Cleaning

In [16]:
#clean the dataframe of eccentric values
# -666666666 is treated as a missing-value sentinel and replaced with NaN.
acs_data.replace(-666666666.0, np.nan, inplace=True)

# Drop tracts in which every count variable is exactly zero. The columns are
# selected by name (all requested variables except median household income,
# B19013_001E, and median age, B01002_001E) instead of by position, so the check
# does not silently target the wrong columns if the column order ever changes.
count_columns = [
    'B17001_002E', 'B17001_001E', 'B15003_001E', 'B15003_022E', 'B02001_001E',
    'B02001_002E', 'B02001_003E', 'B02001_005E', 'B03003_003E', 'B05002_013E',
    'B05002_001E', 'B07003_017E', 'B07003_001E',
]

if not acs_data.empty:
    # NaN == 0 is False, so a row containing a missing count is never dropped here.
    all_counts_zero = (acs_data[count_columns] == 0).all(axis=1)
    # Drop in place (rather than rebinding) so acs_data stays the same object.
    acs_data.drop(index=acs_data.index[all_counts_zero], inplace=True)

In [17]:
#reorganize the census dataframe with relabeled columns and desired percentages for various categories
# Map each raw ACS variable code to a readable column label.
census_dict = {
    "B19013_001E": 'Median Household Income',
    "B17001_002E": "Poverty Rate Num",
    "B17001_001E": "Poverty Rate Denom",
    "B15003_022E": 'Education Level Num',
    "B15003_001E": "Education Level Denom",
    "B02001_001E": "Total Population",
    "B02001_002E": "% White",
    "B02001_003E": "% Black",
    "B02001_005E": "% Asian",
    "B03003_003E": "% Hispanic/Latino",
    "B05002_013E": "Foreign Born Immigrant Num",
    "B05002_001E": "Foreign Born Immigrant Denom",
    "B01002_001E": "Median Age",
    "B07003_017E": "Neighborhood Turnover Num",
    "B07003_001E": "Neighborhood Turnover Denom",
}

acs_data.rename(columns = census_dict, inplace = True)

# [numerator column, denominator column] pairs; each numerator column is
# overwritten with numerator / denominator.
perc_list = [
    ['Poverty Rate Num', 'Poverty Rate Denom'],
    ['Education Level Num', 'Education Level Denom'],
    ['% White', 'Total Population'],
    ['% Black', 'Total Population'],
    ['% Asian', 'Total Population'],
    ['% Hispanic/Latino', 'Total Population'],
    ['Foreign Born Immigrant Num', 'Foreign Born Immigrant Denom'],
    ['Neighborhood Turnover Num', 'Neighborhood Turnover Denom'],
]

for numerator, denominator in perc_list:
    acs_data[numerator] = acs_data[numerator].div(acs_data[denominator])

# Denominator columns are removed once used; 'Total Population' is kept because
# it remains a feature in its own right.
used_denominators = [denominator for _, denominator in perc_list if denominator != 'Total Population']
acs_data.drop(columns = used_denominators, inplace = True)
    

    
 

In [18]:
# The former numerator columns now hold ratios, so relabel them accordingly.
ratio_labels = {
    'Poverty Rate Num': "Poverty Rate %",
    'Education Level Num': 'Education Level %',
    'Foreign Born Immigrant Num': 'Foreign Born Immigrant %',
    'Neighborhood Turnover Num': 'Neighborhood Turnover %',
}
acs_data.rename(columns = ratio_labels, inplace = True)

In [19]:
# Display the cleaned frame. The "%" columns hold plain ratios (numerator / denominator);
# they are not scaled to 100.
acs_data

Unnamed: 0,Median Household Income,Poverty Rate %,Education Level %,Total Population,% White,% Black,% Asian,% Hispanic/Latino,Foreign Born Immigrant %,Median Age,Neighborhood Turnover %,state,county,tract
0,34928.0,0.166205,0.070009,3620.0,0.008011,0.952486,0.000000,0.024862,0.107459,36.3,0.000000,42,101,007000
1,117904.0,0.178880,0.390262,5090.0,0.651473,0.096267,0.068566,0.246955,0.109430,31.5,0.003249,42,101,014400
2,43438.0,0.299709,0.110174,2746.0,0.051712,0.571741,0.250910,0.153314,0.322651,46.9,0.000000,42,101,027401
3,14962.0,0.696408,0.238164,2771.0,0.360881,0.452905,0.066041,0.093107,0.075424,22.0,0.000000,42,101,014700
4,99125.0,0.044559,0.301229,3299.0,0.758715,0.148530,0.049409,0.013640,0.064262,34.9,0.002475,42,101,001800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389,67083.0,0.035491,0.237960,1437.0,0.862909,0.069589,0.004871,0.038274,0.019485,40.4,0.000000,42,101,021900
390,42038.0,0.278310,0.075342,5523.0,0.012131,0.950570,0.000000,0.003440,0.147384,30.2,0.000000,42,101,026500
391,37521.0,0.219468,0.104737,2784.0,0.060345,0.875359,0.057830,0.000359,0.276221,40.2,0.000000,42,101,006100
392,23931.0,0.570098,0.052780,3870.0,0.280879,0.045995,0.006718,0.840568,0.116021,27.8,0.000000,42,101,019502


### File Uploading

In [20]:
# Order the rows by tract code; this sorts acs_data in place.
acs_data.sort_values(by = "tract", inplace = True)

In [21]:
# Persist the cleaned, sorted frame. This is the same path the raw pull was saved to
# earlier, so that earlier file is overwritten.
acs_data.to_csv("/Users/dominiquekemp/Documents/GitHub/'Will It Restaurant?'/census_data.csv", index = False)

In [22]:
#append column of census tract numbers to restaurant dataframe
# all_tracts is aligned row-for-row with rd_train. A failed lookup is stored as None
# there, so such rows get a missing tract instead of raising a TypeError.
rd_train["census_tract"] = [tract['tract'] if tract is not None else None for tract in all_tracts]

In [23]:
# Save the full training set, now including the census_tract column.
rd_train.to_csv("/Users/dominiquekemp/Documents/GitHub/'Will It Restaurant?'/train_data_with_census.csv", index = False)

In [24]:
# Keep only the rows whose `range` value equals the string '1', then save that subset.
# NOTE(review): the comparison is against a string; if `range` was parsed as a
# numeric column this selects no rows. Confirm the column's dtype.
is_level1 = rd_train.range == '1'
rd_train = rd_train.loc[is_level1]

rd_train.to_csv(
    "/Users/dominiquekemp/Documents/GitHub/'Will It Restaurant?'/level1_data_w_censustract.csv",
    index = False,
)