# Collect data

### Cesus dot gov

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import requests
import json
import time

# Import API key
from config import news_api_key, census_api_key

### NAICS codes table

    2012~ : there are sub categories
    Before 2012, the number of employeements is available in total, no sub categories.
   
    For sub category bar chart, data from 2012 will be displayed.
    
    For timeline chart, data from 1986, number of employ will be displayed.

In [2]:
# NAICS2017 codes and labels:  CBP data of 2017~2018 refer to NICS2017 table for business labels
url = "https://api.census.gov/data/2018/cbp/variables/NAICS2017.json"
result = requests.get(url).json()
naics_items = result["values"]["item"]
naics2017 = { ky: naics_items[ky] for ky in naics_items.keys() if len(ky)==2}

# NAICS2012 codes and labels : CBP data of 2012~2016 refer to NICS2012 table for business labels
url = "https://api.census.gov/data/2016/cbp/variables/NAICS2012.json"
result = requests.get(url).json()
naics_items = result["values"]["item"]
naics2012 = { ky: naics_items[ky] for ky in naics_items.keys() if len(ky)==2}

# Write the tables to json files
with open("naics2017.json", "w") as outfile:  
    json.dump(naics2017, outfile)

with open("naics2012.json", "w") as outfile:  
    json.dump(naics2012, outfile)

### County codes and names

In [3]:
# NC County names and codes
url = "https://api.census.gov/data/2018/cbp?get=NAME&for=county:*&in=state:37&key="+census_api_key
result = requests.get(url).json()

In [6]:
df = pd.DataFrame(result, columns=result[0])
county_df=df.drop(0).set_index("county").drop(["state"], axis=1)
county_df.head()

Unnamed: 0_level_0,NAME
county,Unnamed: 1_level_1
79,"Greene County, North Carolina"
87,"Haywood County, North Carolina"
121,"Mitchell County, North Carolina"
41,"Chowan County, North Carolina"
45,"Cleveland County, North Carolina"


In [7]:
# Write the table to json file
#county_df.to_json("county_codes.json", orient="table", index=False)
county_df.to_json("county_codes.json", orient="records")

### Set the variables, keys for API calls


In [83]:
## The quary variables vary in years

def set_url(year):
    cbp_url = f'https://api.census.gov/data/{year}/cbp?get='
    if (year >= 2017):
        variables = "NAICS2017,NAME,EMP"
    elif (year >= 2012):
        variables = "NAICS2012,GEO_TTL,EMP"
    elif (year > 2007):
        variables = "NAICS2007_TTL,GEO_TTL,EMP"
    elif (year > 2002):
        variables = "NAICS2002_TTL,GEO_TTL,EMP"
    elif (year > 1997):
        variables = "NAICS1997_TTL,GEO_TTL,EMP"
    else:
        variables = "GEO_TTL,EMP"
        
    return cbp_url+variables+"&for=county:*&in=state:37&key="+census_api_key


### Perform API calls

In [84]:
#query_string = "?get=NAICS2007_TTL,GEO_TTL,EMP,LFO_TTL,ESTAB&for=us:*&key=" + census_api_key
#url = "https://api.census.gov/data/2018/cbp?get=NAICS2017,GEO_ID,NAME,EMP&for=county:*&in=state:37&key="

url = set_url(2012)

try:
    response = requests.get(url)
    print(response)
    census_data = response.json()
    #print(json.dumps(census_data, indent=4))
except:
    print(f"Found error")    

<Response [200]>


In [70]:
df = pd.DataFrame(census_data, columns=census_data[0])
emp_df=df.drop(0).sort_values(by="EMP", ignore_index=True, ascending=False)
emp_df.head()

Unnamed: 0,NAICS2012,GEO_TTL,EMP,state,county
0,32311,"Wake County, North Carolina",999,37,183
1,236118,"Mecklenburg County, North Carolina",999,37,119
2,4461,"Cumberland County, North Carolina",999,37,51
3,446,"Cumberland County, North Carolina",999,37,51
4,42,"Durham County, North Carolina",9980,37,63


In [78]:
emp_df["NAICS2012"].astype("string")

0         32311
1        236118
2          4461
3           446
4            42
          ...  
80414     44411
80415    444110
80416    445292
80417    444120
80418     81331
Name: NAICS2012, Length: 80419, dtype: string

In [76]:
emp_df.dtypes

NAICS2012    object
GEO_TTL      object
EMP          object
state        object
county       object
dtype: object