# Collect data

### Census dot gov

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import requests
import json
import time

# Import API key
from config import census_api_key

### NAICS codes table

    2012~ : sub-categories are available.
    Before 2012, only the total number of employees is available; there are no sub-categories.
   
    For the sub-category bar chart, data from 2012 onward will be displayed.
    
    For the timeline chart, the number of employees from 1986 onward will be displayed.

In [2]:
# NAICS2017 codes and labels: CBP data of 2017~2018 refer to the NAICS2017 table for business labels.
# Only entries whose key is exactly two characters long are kept.
# NOTE(review): any code whose key is not two characters long (e.g. a hyphenated range)
# is dropped by this filter -- confirm that this is intended.
url = "https://api.census.gov/data/2018/cbp/variables/NAICS2017.json"
result = requests.get(url).json()
naics_items = result["values"]["item"]
naics2017 = {code: label for code, label in naics_items.items() if len(code) == 2}

# NAICS2012 codes and labels: CBP data of 2012~2016 refer to the NAICS2012 table for business labels.
url = "https://api.census.gov/data/2016/cbp/variables/NAICS2012.json"
result = requests.get(url).json()
naics_items = result["values"]["item"]
naics2012 = {code: label for code, label in naics_items.items() if len(code) == 2}

# Persist both lookup tables as JSON files
with open("naics2017.json", "w") as outfile:
    outfile.write(json.dumps(naics2017))

with open("naics2012.json", "w") as outfile:
    outfile.write(json.dumps(naics2012))

### County codes and names

* From census,

In [7]:
# NC county names and codes from the Census CBP 2018 endpoint
# (all counties within state:37); the response is parsed as JSON.
url = "https://api.census.gov/data/2018/cbp?get=NAME&for=county:*&in=state:37&key="+census_api_key
result = requests.get(url).json()

In [8]:
# The first row of the response supplies the column names; that row and the
# "state" column are then removed from the county table.
df = pd.DataFrame(result, columns=result[0])
county_df = df.drop(index=0).drop(columns=["state"])
county_df.head()

Unnamed: 0,NAME,county
1,"Greene County, North Carolina",79
2,"Haywood County, North Carolina",87
3,"Mitchell County, North Carolina",121
4,"Chowan County, North Carolina",41
5,"Cleveland County, North Carolina",45


In [9]:
# Write the county table to a JSON file (column-oriented layout).
# Alternative layout that was tried and left disabled:
#county_df.to_json("county_codes.json", orient="table", index=False)
county_df.to_json("county_codes.json", orient="columns")

* From NC map GeoJSON

In [None]:
# Save the county properties from NC map geojson to a json file
def retrieve_county_Geojson():
    """Download the NC county GeoJSON and save each feature's properties.

    Writes a JSON array to ``county_codes2.json`` holding the ``properties``
    entry of every feature in the downloaded document. Returns nothing.
    """
    geojson_url = "https://opendata.arcgis.com/datasets/d192da4d0ac249fa9584109b1d626286_0.geojson"
    features = requests.get(geojson_url).json()['features']

    # Only the "properties" entry of each feature is kept.
    county_properties = [feature['properties'] for feature in features]

    with open("county_codes2.json", "w") as ft:
        json.dump(county_properties, ft)

## Combine the county codes

In [11]:
# GeoJSON codes: load the saved feature properties and keep only the
# county name and the two county-number columns.
gct_df = pd.read_json("county_codes2.json")
gct_n_df = gct_df.loc[:, ['CountyName', 'SAP_CNTY_NBR', 'CNTY_NBR']]
gct_n_df.head()

Unnamed: 0,CountyName,SAP_CNTY_NBR,CNTY_NBR
0,Camden,15,14
1,Gates,37,36
2,Iredell,49,48
3,Wilkes,97,96
4,Union,90,89


In [10]:
# Census codes: reload the county table saved to county_codes.json.
# NOTE(review): read_json infers dtypes by default, so zero-padded county
# codes may come back as integers -- confirm against the file contents.
cct_df = pd.read_json("county_codes.json")
cct_df.head()

Unnamed: 0,NAME,county
1,"Greene County, North Carolina",79
2,"Haywood County, North Carolina",87
3,"Mitchell County, North Carolina",121
4,"Chowan County, North Carolina",41
5,"Cleveland County, North Carolina",45


In [17]:
# Derive the short county name from the Census NAME column by cutting at
# " County". Taking only the first whitespace-separated word would truncate
# multi-word names (e.g. "New Hanover County, North Carolina" -> "New"),
# which then could not match on CountyName in the merge that follows.
name = [nm.partition(" County")[0] for nm in cct_df['NAME'].values]
cct_df['CountyName'] = name
cct_df.head()

Unnamed: 0,NAME,county,CountyName
1,"Greene County, North Carolina",79,Greene
2,"Haywood County, North Carolina",87,Haywood
3,"Mitchell County, North Carolina",121,Mitchell
4,"Chowan County, North Carolina",41,Chowan
5,"Cleveland County, North Carolina",45,Cleveland


In [33]:
# Join the GeoJSON and Census tables on the short county name. With an inner
# join, a county that is missing from either table is absent from the result.
combined = pd.merge(gct_n_df, cct_df, how='inner', on='CountyName').rename(
    columns={"CNTY_NBR" : "Geo_NBR", "county":"Census_NBR"}
)
combined.head()

Unnamed: 0,CountyName,SAP_CNTY_NBR,Geo_NBR,NAME,Census_NBR
0,Camden,15,14,"Camden County, North Carolina",29
1,Gates,37,36,"Gates County, North Carolina",73
2,Iredell,49,48,"Iredell County, North Carolina",97
3,Wilkes,97,96,"Wilkes County, North Carolina",193
4,Union,90,89,"Union County, North Carolina",179


In [34]:
# Keep the county name plus both numbering schemes, indexed by county name.
new_df = combined[['CountyName', 'Geo_NBR', 'Census_NBR']].set_index('CountyName')
new_df.head()

Unnamed: 0_level_0,Geo_NBR,Census_NBR
CountyName,Unnamed: 1_level_1,Unnamed: 2_level_1
Camden,14,29
Gates,36,73
Iredell,48,97
Wilkes,96,193
Union,89,179


In [37]:
# Write the combined codes to a JSON file with one entry per index value
# (the index is the county name).
#  new_df.to_dict('index')
new_df.to_json("combined_county_codes.json", orient="index")

## Set variables, url for API calls

In [2]:
# Reload the saved NAICS tables and list their codes (two-character keys only)

with open("naics2017.json", "r") as fileobj:
    naics2017 = json.loads(fileobj.read())

print(naics2017.keys())

with open("naics2012.json", "r") as fileobj:
    naics2012 = json.loads(fileobj.read())

print(naics2012.keys())

dict_keys(['00', '11', '21', '22', '23', '42', '51', '52', '53', '54', '55', '56', '61', '62', '71', '72', '81', '95', '99'])
dict_keys(['00', '11', '21', '22', '23', '42', '51', '52', '53', '54', '55', '56', '61', '62', '71', '72', '81', '95', '99'])


In [3]:
## Set a naics code query to restrict the API call only for 2-digit business codes
def set_naics_query(year):
    """Build the query-string fragment that restricts a call to the listed NAICS codes.

    For years from 2017 on, the fragment repeats ``&NAICS2017=<code>``;
    for 2012~2016 it repeats ``&NAICS2012=<code>``. For earlier years an
    empty string is returned.
    """
    # The NAICS codes for higher level business
    sector_codes = (
        '00', '11', '21', '22', '23', '42', '51', '52', '53', '54',
        '55', '56', '61', '62', '71', '72', '81', '95', '99',
    )

    # Pick the parameter name once instead of re-testing the year per code.
    if year >= 2017:
        parameter = "NAICS2017"
    elif year >= 2012:
        parameter = "NAICS2012"
    else:
        return ""

    return "".join(f'&{parameter}={code}' for code in sector_codes)


## Set a url for given year to retrieve employee data for all counties.
def set_url(year):
    """Return the CBP query URL for all counties in state 37 for the given year.

    The requested variables depend on the year, and from 2012 on the URL is
    extended with the NAICS code filter built by ``set_naics_query``.
    """
    # The query variables vary by year
    if year >= 2017:
        variables = "NAME,EMP"
    elif year >= 2012:
        variables = "GEO_TTL,EMP"
    else:
        # Fallback for years that match none of the vintages below
        variables = "GEO_TTL,EMP"
        for last_year_before, vintage in ((2007, "NAICS2007"), (2002, "NAICS2002"), (1997, "NAICS1997")):
            if year > last_year_before:
                variables = f"{vintage}_TTL,GEO_TTL,EMP"
                break

    url = f'https://api.census.gov/data/{year}/cbp?get={variables}&for=county:*&in=state:37&key=' + census_api_key

    # From 2012, NAICS codes have subcategories of 2~6 digits. We collect only the 2-digit codes.
    if year >= 2012:
        url += set_naics_query(year)

    return url

In [15]:
# Preview the query URL built for 2014
set_url(2014)

'https://api.census.gov/data/2012/cbp?get=GEO_TTL,EMP&for=county:*&in=state:37&key=c27d20165731bd731fe0b28ba84169ac2877e759&NAICS2012=00&NAICS2012=11&NAICS2012=21&NAICS2012=22&NAICS2012=23&NAICS2012=42&NAICS2012=51&NAICS2012=52&NAICS2012=53&NAICS2012=54&NAICS2012=55&NAICS2012=56&NAICS2012=61&NAICS2012=62&NAICS2012=71&NAICS2012=72&NAICS2012=81&NAICS2012=95&NAICS2012=99'

### Perform API calls

* By year

In [4]:
# Fetch the county-level data for 2014 and parse the JSON body.
url = set_url(2014)

try:
    response = requests.get(url)
    print(response)
    census_data = response.json()
except (requests.RequestException, ValueError) as err:
    # Network failures raise RequestException; a body that is not JSON raises
    # ValueError. In either case census_data is not (re)assigned by this cell.
    print(f"Found error: {err}")

<Response [200]>


In [5]:
# The first row of the response supplies the column names; that row and the
# "state" column are removed from the employment table.
df = pd.DataFrame(census_data, columns=census_data[0])
emp_df = df.drop(index=0).drop(columns="state")
emp_df

Unnamed: 0,GEO_TTL,EMP,NAICS2012,county
1,"Alamance County, North Carolina",51102,00,001
2,"Alamance County, North Carolina",0,11,001
3,"Alamance County, North Carolina",47,21,001
4,"Bladen County, North Carolina",0,51,017
5,"Alamance County, North Carolina",102,22,001
...,...,...,...,...
1722,North Carolina -- Statewide,110011,56,999
1723,North Carolina -- Statewide,0,61,999
1724,North Carolina -- Statewide,1574,62,999
1725,North Carolina -- Statewide,0,71,999


In [18]:
# Select the rows whose county code is '183'
emp_df.loc[emp_df['county']=='183']

Unnamed: 0,NAME,EMP,NAICS2017,county
833,"Wake County, North Carolina",9419,53,183
834,"Wake County, North Carolina",24891,42,183
835,"Wake County, North Carolina",57526,54,183
836,"Wake County, North Carolina",27811,51,183
837,"Wake County, North Carolina",26709,52,183
838,"Wake County, North Carolina",478998,0,183
839,"Wake County, North Carolina",174,11,183
840,"Wake County, North Carolina",225,21,183
841,"Wake County, North Carolina",3219,22,183
842,"Wake County, North Carolina",30205,23,183


### Perform API calls

* By county, collect data for all years

In [19]:
# Function: County data for all years
# Returns the total number of employees of the given county for each year 1986~2017
def county_all_years(county):
    """Collect the yearly EMP value of one county (state 37) from the CBP API.

    Args:
        county: County code placed in the ``for=county:`` predicate, e.g. '183'.

    Returns:
        A list with exactly one entry per year from 1986 through 2017, in
        order. Each entry is the second field of the first data row of that
        year's response, or None when that year's request or parsing failed,
        so positions always line up with the years.
    """
    census = []
    for year in range(1986, 2018):

        cbp_url = f'https://api.census.gov/data/{year}/cbp?get='
        if (year >= 2017):
            variables = "NAICS2017,EMP"
        elif (year >= 2012):
            variables = "NAICS2012_TTL,EMP"
        elif (year > 2007):
            variables = "NAICS2007_TTL,EMP"
        elif (year > 2002):
            variables = "NAICS2002_TTL,EMP"
        elif (year > 1997):
            variables = "NAICS1997_TTL,EMP"
        else:
            variables = "GEO_TTL,EMP"

        url = cbp_url+variables+f'&for=county:{county}&in=state:37&key='+census_api_key
        if (year >= 2017):
            # Restrict the 2017 query to the '00' NAICS code
            url = url + "&NAICS2017=00"

        try:
            # A timeout keeps one stalled request from hanging the whole loop.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            year_data = response.json()
            # Row 0 is the header row; EMP is requested as the second variable.
            census.append(year_data[1][1])
        except (requests.RequestException, ValueError, LookupError, TypeError) as err:
            print(f"Found error for year {year}: {err}")
            # Keep a placeholder so the list stays aligned with the year range
            # instead of silently shifting later years one position earlier.
            census.append(None)

    return census

In [20]:
# Yearly EMP values for county code '183'
emp_ct = county_all_years('183')
emp_ct

['171707',
 '180159',
 '188429',
 '199991',
 '214939',
 '215685',
 '220516',
 '227468',
 '243889',
 '255873',
 '272570',
 '288340',
 '300548',
 '317994',
 '332744',
 '355071',
 '340064',
 '340802',
 '347541',
 '355516',
 '369539',
 '383282',
 '389698',
 '375345',
 '365534',
 '373012',
 '394910',
 '409573',
 '425283',
 '431040',
 '453333',
 '468449']

In [21]:
# Pair each collected EMP value with its year (1986 through 2017) and the
# county code; the three columns must have the same length.
emp_ct_df = pd.DataFrame(
    {
        "County": ["183" for _ in emp_ct],
        "Year": np.arange(1986, 2018),
        "EMP": emp_ct,
    }
)
emp_ct_df

Unnamed: 0,County,Year,EMP
0,183,1986,171707
1,183,1987,180159
2,183,1988,188429
3,183,1989,199991
4,183,1990,214939
5,183,1991,215685
6,183,1992,220516
7,183,1993,227468
8,183,1994,243889
9,183,1995,255873


In [2]:
# State-level query (state:37) against the 2018 CBP endpoint, requesting the
# NAICS2017 label, the NAICS2017 code and EMP.
url ="https://api.census.gov/data/2018/cbp?get=NAICS2017_LABEL,NAICS2017,EMP&for=state:37&key="+census_api_key

In [3]:
# Fetch the prepared url and parse the JSON body.
try:
    response = requests.get(url)
    print(response)
    census_data = response.json()
except (requests.RequestException, ValueError) as err:
    # Network failures raise RequestException; a body that is not JSON raises
    # ValueError. In either case census_data is not (re)assigned by this cell.
    print(f"Found error: {err}")

<Response [200]>


In [4]:
# Build a DataFrame from the response, using its first row as the column names.
# The whole response is passed as data, so row 0 still holds the header values.
df = pd.DataFrame(census_data, columns=census_data[0])
df.head()

Unnamed: 0,NAICS2017_LABEL,NAICS2017,EMP,state
0,NAICS2017_LABEL,NAICS2017,EMP,state
1,Architectural and structural metals manufacturing,3323,11663,37
2,Logging,113310,2751,37
3,"Fishing, hunting and trapping",114,31,37
4,"Electric power generation, transmission and di...",2211,18817,37
