# Census Reporter API Query

In [1]:
import requests
import urllib
import json
import pandas as pd

def get_table_data(table_ids):
    api_url = 'https://api.censusreporter.org/1.0/data/show/latest?'
    params = {'table_ids':','.join(table_ids),
             'geo_ids':'16000US3651000,860|16000US3651000',
              'primary_geo_id':'16000US3651000'}
    params_enc = urllib.parse.urlencode(params)
    data = json.loads(requests.get(api_url + params_enc).text)
    return data

def get_table_as_json(table_ids):
    api_url = 'https://api.censusreporter.org/1.0/data/show/latest?'
    params = {'table_ids':','.join(table_ids),
             'geo_ids':'16000US3651000,860|16000US3651000',
              'primary_geo_id':'16000US3651000'}
    params_enc = urllib.parse.urlencode(params)
    data = requests.get(api_url + params_enc).text
    return data

In [15]:
d = get_table_data(['B01001'])

In [16]:
# Adapted from https://gist.github.com/JoeGermuska/1ed425c068d540326854
def prep_for_pandas(json_data,include_moe=False):
    """Given a dict of dicts as they come from a Census Reporter API call, set it up to be amenable to pandas.DataFrame.from_dict"""
    result = {}
    for geoid, tables in json_data.items():
        flat = {}
        for table, values in tables.items():
            for kind, columns in values.items():
                if kind == 'estimate':
                    flat.update(columns)
                elif kind == 'error' and include_moe:
                    renamed = dict((k+"_moe",v) for k,v in columns.items())
                    flat.update(renamed)
        result[geoid] = flat
    return result

def expand_column_names(col_dict):
    level_range = list(sorted(set(coldata['indent'] for colkey, coldata in col_dict.items())))
    max_level, min_level = max(level_range), min(level_range)
    curr_level = min(level_range)
    # loop through columns one at a time.
    # at each step, if we have increased the indent level,
    # add to the column prefix
    prefix = []
    out_names = []
    for colkey, coldata in sorted(col_dict.items()):
        clean_name = coldata['name'].strip(':')
        if coldata['indent'] == min_level:
            prefix = [clean_name]
            out_names.append(' '.join(prefix))
        elif coldata['indent'] > prev_level and coldata['indent'] != max_level:
            prefix.append(clean_name)
            out_names.append(' '.join(prefix))
        elif coldata['indent'] <= prev_level: # gone down a step
            prefix.pop() # remove the last item
            prefix.append(clean_name)
            out_names.append(' '.join(prefix))
        else:
            out_names.append(' '.join(prefix + [clean_name]))
        prev_level = coldata['indent']
    return out_names

In [17]:
# BUILD PANDAS DATAFRAME FROM CensusReporter TABLEID
def dataframe_from_json(table_name):
    d = get_table_data([table_name])
    df = pd.DataFrame.from_dict(prep_for_pandas(d['data']),orient='index')

    columns_to_names = dict(zip(sorted(d['tables'][table_name]['columns'].keys()), expand_column_names(d['tables'][table_name]['columns'])))
    new_columns = [columns_to_names[colname] for colname in df.columns]
    df.columns = new_columns
    new_index = [rowname.split('US')[-1] for rowname in df.index]
    df.index = new_index
    df.index.name='ZIP/Loc Code'
    return df

# EXAMPLE OF FUNCTION CALL => Insert specific Table ID (table_name) from CensusReporter
df_race = dataframe_from_json('B25006') # B01001
df_age  = dataframe_from_json('B01001') 
#df.columns = zip(sorted(d['tables']['B01001']['columns'].keys()), expand_column_names(d['tables']['B01001']['columns']))

In [19]:
df_age['Total 62 to 64 years']

Unnamed: 0_level_0,Total 62 to 64 years,Total 62 to 64 years
ZIP/Loc Code,Unnamed: 1_level_1,Unnamed: 2_level_1
3651000,141112.0,115401.0
07002,1052.0,1239.0
07008,263.0,283.0
07020,212.0,80.0
07024,687.0,599.0
07036,776.0,793.0
07064,36.0,38.0
07077,70.0,45.0
07202,472.0,271.0
07206,356.0,125.0


In [None]:
df_race.to_csv(path_or_buf="/Users/danny/Documents/courses/spring-2017/big-data/bigDataProject/data/census_race.csv")
df_age.to_csv(path_or_buf="/Users/danny/Documents/courses/spring-2017/big-data/bigDataProject/data/census_age.csv")

In [None]:
d['tables']['B01001']['columns']

def create_column_multiindex(col_dict):
    index_range = list(sorted(set(coldata['indent'] for colkey, coldata in col_dict.items())))
    levels = []
    for level in index_range:
        # get all the labels with the given index
        curr_level = []
        for colkey, coldata in col_dict.items():
            if coldata['indent'] == level:
                curr_level.append(coldata['name'])
        
        levels.append(list(set(curr_level)))
            
    return pd.MultiIndex.from_product(levels, names=[str(i) for i in range(len(levels))])

create_column_multiindex(d['tables']['B01001']['columns'])

In [None]:
columns_to_names = {col : d['tables']['B01001']['columns'][col]['name'] for col in d['tables']['B01001']['columns'].keys()}
for colkey in sorted(d['data']['86000US07036']['B01001']['estimate'].keys()):
    print columns_to_names[colkey], d['data']['86000US07036']['B01001']['estimate'][colkey]

In [None]:
d['data']['86000US07002']

In [None]:
# QUERY TABLE DATA & LOAD IT INTO A PANDAS DATAFRAME 
tbl_id = 'B01001'

df = pd.DataFrame(columns=create_column_multiindex(d['tables']['B01001']['columns']))

rows = []
for location_code in d['data']:
    if len(location_code.split('US')[-1]) == 5:
        zipcode = location_code.split('US')[-1]
        row = {'zip':zipcode}
        male_idx = list(sorted(d['data'][location_code][tbl_id]['estimate'].keys()))
        
        for colkey in sorted(d['data'][location_code][tbl_id]['estimate'].keys()):
            
            row[columns_to_names[colkey]] = d['data'][location_code][tbl_id]['estimate'][colname]
            #print columns_to_names[colname], "\t", d['data'][location_code][tbl_id]['estimate'][colname]
        rows.append(row)

# NYC Age & Male/Female Demographics by zip code         
df = pd.DataFrame(rows).groupby('zip').sum()
df.columns = [df.columns[-1]] + list(df.columns[:-1])
#df = df.transpose()
#df.tail()

In [None]:
df.T.loc['07036']

In [None]:
# NYC Male/Female Dempgraphics by zip code
df = df.transpose()
df[['Female','Male']]

In [None]:
zip_codes = [x.split('US')[-1] for x in list((d['data']).iterkeys())]
print zip_codes

In [None]:
print d['data']