# Needed libraries

In [1]:
import censusgeocode as cg
import pandas as pd
import os
import numpy as np

## Using census geocode

Use the census geocode library to find census tracts.  You need either a street, city, and state, or a street, zipcode and state.  The return value is a list of dictionaries.  There's lots of data in the response, but we're only really interested in the census tract information.  First two digits are the state code.  Next three are the county code.  Next six are local tract identification.  The last digit is the "block" within the local tracts, and I forget what the remaining digits represent.  We don't need them for finding the tract data, so they'll eventually be discarded.

#### Example 1

In [2]:
street = "360 Beacon Street"
city = "Somerville"
state = "MA"

# cg.address(street = "",city = , state = "", zipcode = "")
output1 = cg.address(street, city, state)

In [3]:
output1[0]["geographies"]["2010 Census Blocks"][0]["GEOID"]

'250173510003004'

#### Example 2

In [4]:
street = "338 Beacon Street"
city = "Somerville"
state = "MA"

# cg.address(street = "",city = , state = "", zipcode = "")
output2 = cg.address(street, city, state)

In [5]:
output2[0]["geographies"]["2010 Census Blocks"][0]["GEOID"]

'250173510003003'

## Location probabilities

## Function

Since the output type of the geocoder contains several things, we'll use the following function to extract the census tract data.  You need to feed in a street and a city, or a street and a zipcode.  The preference is for zipcode, but city is fine.

In [7]:
def extract_address_range(geocode_data):
    """This is a helper function to extract the address ranges returned for a
    geocode query.  The intent is to store the data to obviate/simplify
    future searches."""
    
    # Getting the min and max address numbers for the block found
    fromaddress = geocode_data[0]["addressComponents"]["fromAddress"]
    toaddress = geocode_data[0]["addressComponents"]["toAddress"]
    
    toaddress = geocode_data[0]["addressComponents"]["toAddress"]
    
    # completing the city, zipcode, and state data in case it was missing
    city = geocode_data[0]["addressComponents"]["city"]
    zipcode = geocode_data[0]["addressComponents"]["zip"]
    state = geocode_data[0]["addressComponents"]["state"]

    return [fromaddress, toaddress, city, zipcode, state]

def extract_block(street, city = None, zipcode = None, state = "MA",fail = True):
    """This function extracts the block number for an address after passing it
    to the census geocode function for identification.  It returns a string.  If
    the address can't be found or is otherwise dysfunctional, the function can be
    toggled to fail, or to return even probabilities (representing no information)."""
    
    try:
        # if the user gives the zipcode format
        if street and zipcode:
            # acquire the geocode response
            output = cg.address(street = street, zipcode = zipcode, state = state)

            # acquire the block number from the response
            block_data =  output[0]["geographies"]["2010 Census Blocks"][0]["GEOID"]

            # turn the block into the format needed for the location data lookup
            block = block_data[:11] + block_data[-1]

            # acquire the complete address data
            address_data = extract_address_range(output)

            return block, address_data

        # if the user gives the city format
        elif street and city:
            # acquire the geocode response
            output = cg.address(street = street, city = city, state = state)

            # acquire the block number from the response
            block_data =  output[0]["geographies"]["2010 Census Blocks"][0]["GEOID"]

            # turn the block into the format needed for the location data lookup
            block = block_data[:11] + block_data[-1]

            # acquire the complete address data
            address_data = extract_address_range(output)

            return block, address_data

    except:
        pass
    
    # if the user doesn't provide enough information, or the address couldn't be found
    # either give a failure value, None, or even probabilities if the data is cleaned
    # properly and you assume the address just couldn't be found.
    if fail == True:
        return None, None
    else:
        return np.ones(shape = (1,6))/6, None

In [8]:
num, data = extract_block(street = "350 Beacon Street", zipcode = "02143")
num, data

('250173510004', ['340', '368', 'SOMERVILLE', '02143', 'MA'])

In [9]:
# should be used to compile a database to obviate censusgeocode
data

['340', '368', 'SOMERVILLE', '02143', 'MA']

In [90]:
# the file with the extra dec10 listing has absolute counts.
loc_data = pd.read_stata("blkgrp_over18_race_dec10.dta")

In [91]:
loc_data.head()

Unnamed: 0,GEOID10_BlkGrp,State_FIPS10,County_FIPS10,Tract_FIPS10,BlkGrp_FIPS10,Total_Pop,Hispanic_Total,Non_Hispanic_Total,NH_White_alone,NH_Black_alone,NH_AIAN_alone,NH_API_alone,NH_Other_alone,NH_Mult_Total,NH_White_Other,NH_Black_Other,NH_AIAN_Other,NH_Asian_HPI,NH_API_Other,NH_Asian_HPI_Other
0,10010201001,1,1,20100,1,523,13,510,441,55,4,3,0,7,0,0,0,0,0,0
1,10010201002,1,1,20100,2,882,15,867,759,89,7,6,0,6,0,0,0,0,0,0
2,10010202001,1,1,20200,1,664,23,641,218,413,1,4,1,4,0,0,0,0,0,0
3,10010202002,1,1,20200,2,900,38,862,414,442,1,1,2,2,0,0,0,0,0,0
4,10010203001,1,1,20300,1,1859,42,1817,1460,314,5,9,0,29,1,0,0,2,0,0


In [44]:
string = '250173510003004'
loc_data.query('State_FIPS10 == "25" & Tract_FIPS10 == "351000"')

Unnamed: 0,GEOID10_BlkGrp,State_FIPS10,County_FIPS10,Tract_FIPS10,BlkGrp_FIPS10,Total_Pop,Hispanic_Total,Non_Hispanic_Total,NH_White_alone,NH_Black_alone,NH_AIAN_alone,NH_API_alone,NH_Other_alone,NH_Mult_Total,NH_White_Other,NH_Black_Other,NH_AIAN_Other,NH_Asian_HPI,NH_API_Other,NH_Asian_HPI_Other
93108,250173510001,25,17,351000,1,1151,61,1090,946,43,0,72,10,19,2,0,0,0,1,0
93109,250173510002,25,17,351000,2,595,27,568,510,6,0,44,0,8,0,0,0,0,0,0
93110,250173510003,25,17,351000,3,664,33,631,472,24,0,125,0,10,1,0,0,0,2,0
93111,250173510004,25,17,351000,4,1728,84,1644,1378,33,0,189,3,41,0,0,0,0,2,0
93112,250173510005,25,17,351000,5,957,40,917,755,36,7,101,8,10,0,0,0,0,0,0
93113,250173510006,25,17,351000,6,947,35,912,794,12,0,94,1,11,2,0,0,0,0,0


# Function

In [126]:
def loc_lookup(tract_id):
    row =  loc_data.query("GEOID10_BlkGrp == '{}'".format(tract_id))
    return row.iloc[0,[8,9,11,10,13,6]]

In [127]:
loc_lookup(num)

NH_White_alone    1378
NH_Black_alone      33
NH_API_alone       189
NH_AIAN_alone        0
NH_Mult_Total       41
Hispanic_Total      84
Name: 93111, dtype: object

In [128]:
output.values

array([1378, 33, 0, 189, 3, 41, 84], dtype=object)

# Function

In [114]:
def prob_convert(nums,rental = False):
    data = np.array(nums)
    total = np.sum(nums)
    
    # implement rental adjustment in the future
    
    return data / total

In [115]:
prob_convert(output.values)

array([0.7974537037037037, 0.019097222222222224, 0.0, 0.109375,
       0.001736111111111111, 0.023726851851851853, 0.04861111111111111],
      dtype=object)

# Surname probabilities

In [None]:
sur_import = pd.read_csv("Names_2010Census.csv")

The data has strings "(S)" we need to get rid of.  We use the following function to iterate over the data frame and 

In [174]:
# function to convert needed values to floats in our imported data
def convert(value):
    try:
        val = float(value)
    except:
        if value == "(S)":
            val = 0
        else:
            val = value
    return val

sur_data = sur_import.applymap(convert)

In [175]:
sur_data.head()

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
0,SMITH,1.0,2442977.0,828.19,828.19,70.9,23.11,0.5,0.89,2.19,2.4
1,JOHNSON,2.0,1932812.0,655.24,1483.42,58.97,34.63,0.54,0.94,2.56,2.36
2,WILLIAMS,3.0,1625252.0,550.97,2034.39,45.75,47.68,0.46,0.82,2.81,2.49
3,BROWN,4.0,1437026.0,487.16,2521.56,57.95,35.6,0.51,0.87,2.55,2.52
4,JONES,5.0,1425470.0,483.24,3004.8,55.19,38.48,0.44,1.0,2.61,2.29


In [176]:
def sur_lookup(name):
    data = sur_data.query("name == '{}'".format(name.upper()))
    nums = data.iloc[0,[5,6,7,8,9,10]].values
    return nums

In [177]:
sur_lookup("william")

array([35.2, 53.62, 4.2, 1.05, 2.38, 3.56], dtype=object)

In [178]:
def bisg(name, street, city = None, zipcode = None, state = "MA"):
    
    # find the tract, extract the ethnic counts, and turn them into location probabilities
    tract, _ = extract_block(street, city, zipcode, state)
    loc_nums = loc_lookup(tract)
    loc_probs = prob_convert(loc_nums)
    
    # find the sur_name counts, and turn them into surname probabilities
    sur_nums = sur_lookup(name)
    print(sur_nums)
    sur_probs = prob_convert(sur_nums)
    
    # combine the probabilities, and scale them to have probability 1
    joint = sur_probs * loc_probs
    marginal = np.sum(joint)
    post = joint / marginal
    
    # probabilities for white, black, API, AIAN, Multi-racial, and hispanic, in that order.
    bisg_probs = post
    
    return bisg_probs

In [179]:
bisg(name = "palin", street = "350 Beacon Street", city = "Somerville")

[77.84 12.25 0.78 1.34 1.67 6.12]


array([0.9895364977166499, 0.003729321293967937, 0.0013599914536963596,
       0.0, 0.000631655235616536, 0.004742534300069356], dtype=object)