# 2.2 Industry Definition Data

by Constantin Knoll, Christopher Mosch, Rohan Thavarajah

## Summary

**Goal** - Scrape the Census Bureau's webpage for the industry definitions of the North American Industry Classification System (NAICS 2002) and output a list of dictionaries. Each dictionary will have a high-level parent NAICS code and the nouns in all the definitions of its lower-level children.

**NAICS overview** - The NAICS 2002 is a hierarchical classification of industry. Each 2 digit naics is comprised of a set of 3 digit naics, each 3 digit naics of 4 digit naics and so on. The Census Bureau lays out their website to reflect this. If you are interested in finding the definition of a 6 digit naics, you must first select its parent and drill down to it. Therefore we have 4 steps:

- Step 1 - step through every tier of Census Bureau definitions and compile a list of urls to terminal definition pages
- Step 2 - from each page fetch title, conceptual definition and items 
- Step 3 - set the definition of parents = the sum of definitions of descendants
- Step 4 - construct a noun dictionary for each 3-digit naics

**Output signature** - We wish to construct a list of dictionaries. Each dictionary will reflect a 3 digit NAICS and have the following signature:
![Image](Data/Images/2.2 naics_get_nouns_output_signature.png?raw=true)

![Image](Data/Images/Workflow_2.2.png?raw=true)

## Table of Contents

* <a href='#Step1'>Step1</a>
* <a href='#Step2'>Step2</a>
* <a href='#Step3'>Step3</a>
* <a href='#Step4'>Step4</a>


In [None]:
%matplotlib inline
import numpy as np
import scipy as sp
import pandas as pd
import time
import string
import re
import collections
import json
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests
from unidecode import unidecode
from collections import defaultdict
from pattern.en import tag


<a id='Step1'></a>
**Step 1 - step through every tier of Census Bureau definitions and compile a list of urls to terminal definition pages**

In [154]:
# tier1 - first page: fetch the NAICS 2002 index and read one entry per table row
tier1_req = requests.get("http://www.census.gov/cgi-bin/sssd/naics/naicsrch?chart=2002")
tier1_soup = BeautifulSoup(tier1_req.text, "html.parser")
sector = []
description = []
url = []
table_raw = tier1_soup.findAll("tr")
# drop the first <tr> of the page (presumably the table header - confirm against the page)
table_raw_drp_frst_rw = table_raw[1:]

for row in table_raw_drp_frst_rw:
    try:
        # Resolve all three values before appending any of them, so a row
        # that fails part-way cannot leave the three lists with different
        # lengths (which would misalign every later row once zipped).
        cell = row.find("td")
        anchor = cell.find("a")
        row_sector = cell.get_text().strip()
        row_description = anchor.get("title").strip()
        row_url = anchor.get("href").strip()
    except AttributeError:
        # find()/get() returned None: the row lacks a cell, a link, a title
        # or an href, so it is not a usable entry - skip it.
        continue
    sector.append(row_sector)
    description.append(row_description)
    url.append(row_url)

# list(...) keeps tier1 a real list on Python 3 as well as Python 2
tier1 = list(zip(sector, description, url))
fields = ['sector1','description1','url1']
tier1_table = [dict(zip(fields, row)) for row in tier1]

In [155]:
# tier2 - step into second set of pages
# list_of_links collects one (naics code, link) tuple per linked table row
list_of_links = []
for i in tier1_table:
    time.sleep(1)  # wait one second before each request
    tier2_req = requests.get("http://www.census.gov/"+i['url1'])
    tier2_soup = BeautifulSoup(tier2_req.text, "html.parser")

    for row in tier2_soup.findAll("tr"):
        anchor = row.find("a")
        href = anchor.get("href") if anchor is not None else None
        if href is None:
            # A row without a link has nothing to follow; skip it instead of
            # aborting the whole crawl with an AttributeError (same
            # best-effort handling as the tier1 rows).
            continue
        link_to_append = href.strip()
        # what remains of the href after removing the fixed prefix and
        # suffix is used as the naics code
        naics_to_append = link_to_append
        naics_to_append = naics_to_append.replace("/cgi-bin/sssd/naics/naicsrch?code=","")
        naics_to_append = naics_to_append.replace("&search=2002 NAICS Search","")
        list_of_links.append((naics_to_append, link_to_append))
    

<a id='Step2'></a>
**Step 2 - from each page fetch title, conceptual definition and items**
- we are careful not to fetch the "exclusions" listed in each definition

In [83]:
url_trial = ('311320','cgi-bin/sssd/naics/naicsrch?code=311320&search=2002%20NAICS%20Search')


def fetch_ind_definition(input_tuple):
    """Download one NAICS definition page and extract its title and text.

    input_tuple = tuple
        input_tuple[0] = naics code (string); its length selects the parser
        input_tuple[1] = url path, appended to "http://www.census.gov/"
    output = tuple
        tuple[0] = naics code (input_tuple[0], returned unchanged)
        tuple[1] = naics industry title
        tuple[2] = conceptual definition (string)
        tuple[3] = constituent items (string; "" unless the code has 6 digits)

    Sleeps one second before each request. Prints a message when the code
    parsed from the page differs from the input code; the result is still
    returned in that case.
    """
    input_naics = input_tuple[0]
    url = "http://www.census.gov/"+input_tuple[1]

    # data fetch
    time.sleep(1)
    tier3_req = requests.get(url)
    tier3_soup = BeautifulSoup(tier3_req.text, "html.parser")

    if len(input_naics)==2:
        # naics industry code and title: heading is split on the first "--"
        naics_stuff = unidecode(tier3_soup.findAll("h3")[3].get_text().strip()).split('--',1)
        naics_code = naics_stuff[0].replace("Sector ","")
        naics_title = naics_stuff[1]
        # conceptual definition: flatten line breaks, collapse runs of spaces
        conceptual_def = tier3_soup.findAll("div", { "class" : "inside" })[1].get_text()
        conceptual_def = conceptual_def.replace('\n', ' ')
        conceptual_def = conceptual_def.replace('\r', ' ')
        conceptual_def = unidecode(conceptual_def)
        regex_space=re.compile(r"\ {2,}")
        conceptual_def = re.sub(regex_space, ' ', conceptual_def)
        # keep only the text after the "The Sector as a Whole " marker
        conceptual_def = conceptual_def.split("The Sector as a Whole ")[1]
        # constituent_items
        constituent_items = ""
    elif len(input_naics)==6:
        # naics industry code and title: heading is split on the first space
        naics_stuff = unidecode(tier3_soup.findAll("h3")[3].get_text().strip()).split(' ',1)
        naics_code = naics_stuff[0]
        naics_title = naics_stuff[1]
        # conceptual definition: the text node right after the heading
        conceptual_def = tier3_soup.findAll("div", { "class" : "inside" })[1].h3.next_sibling.strip()
        # constituent_items: seventh cell of every table row except the first
        table_rows = tier3_soup.findAll("tr")[1:]
        items_list = [row.findAll("td")[6].get_text().strip() for row in table_rows]
        # every item is prefixed with one space, so a non-empty result starts with " "
        constituent_items = "".join(" " + item for item in items_list)
    else:
        # naics industry code and title: these pages use the fifth <h3>
        naics_stuff = unidecode(tier3_soup.findAll("h3")[4].get_text().strip()).split(' ',1)
        naics_code = naics_stuff[0]
        naics_title = naics_stuff[1]
        # conceptual definition
        conceptual_def = tier3_soup.findAll("h3")[4].next_sibling.strip()
        if conceptual_def=="See industry description for":
            # the page only points to another description, so there is no text of its own
            conceptual_def = ""
        # constituent_items
        constituent_items = ""

    # sanity check (range sectors such as 31-33 trigger this message)
    if input_naics!=naics_code:
        # single-argument print(...) prints the same text on Python 2 and Python 3
        print("input and output naics conflict: " + input_naics + " vs. " + naics_code)

    return (input_naics, naics_title, conceptual_def, constituent_items)

fetch_ind_definition(url_trial)

('311320',
 'Chocolate and Confectionery Manufacturing from Cacao Beans',
 u'This industry comprises establishments primarily engaged in shelling, roasting, and grinding cacao beans and making chocolate cacao products and chocolate confectioneries.',
 u' Baking chocolate made from cacao beans Candy bars, chocolate (including chocolate covered), made from cacao beans Candy, chocolate, made from cacao beans Chocolate (e.g.,  coatings, instant, liquor, syrups) made from cacao beans Chocolate bars made from cocoa beans Chocolate, confectionery, made from cacao beans Coatings, chocolate, made from cacao beans Cocoa (e.g., instant, mix, mixed with other ingredients, powder drink, powdered) made from cacao beans Cocoa butter made from cocoa beans Confectionery chocolate made from cacao beans Cooking chocolate made from cacao beans Drink powdered mixes, cocoa, made from cacao Fudge, chocolate, made from cacao beans Granola bars and clusters, chocolate, made from cacao beans Liquor, chocolate, 

In [161]:
# tier3 - step into third set of pages
# each scraped tuple is paired with its field names, one entry per link
fields = ['naics_code', 'naics_title', 'conceptual_def', 'constituent_items']
tier3_defs = []
for link in list_of_links:
    definition = fetch_ind_definition(link)
    tier3_defs.append(zip(fields, definition))

input and output naics conflict: 31 vs. 31-33
input and output naics conflict: 44 vs. 44-45
input and output naics conflict: 48 vs. 48-49


In [168]:
# turn every sequence of (field, value) pairs into a dictionary
tier3_dict = [dict(pairs) for pairs in tier3_defs]

In [173]:
# save to json
# "with" closes the file even if json.dump raises part-way through
with open("data/industry definitions/naics_raw_web.json","w") as fd:
    json.dump(tier3_dict, fd)

In [3]:
# reload json: read back the definitions saved as JSON
with open("data/industry definitions/naics_raw_web.json", "r") as saved_defs:
    tier3_dict = json.load(saved_defs)
# display one record as a spot check
tier3_dict[37]

{u'conceptual_def': u'This U.S. industry comprises establishments primarily engaged in growing mushrooms under cover in mines underground, or in other controlled environments.',
 u'constituent_items': u' Mushroom farming Mushroom spawn farming Shitake mushroom farming',
 u'naics_code': u'111411',
 u'naics_title': u'Mushroom Production'}

<a id='Step3'></a>
**Step 3 - set the definition of parents = the sum of definitions of descendants**
- we do this as generally as possible so as to allow 4-digit or 5-digit NAICS definitions to be compiled in the same way.
- 3 digit naics are chosen to facilitate comparisons with the USPTO baseline 

In [61]:
def groupby_parent(input, aggregate_var, aggregate_tiers, output_lvl):
    """Aggregate multi-leveled naics data up to a high-level parent.

    input = list of dictionaries (eg tier3_dict); every dict needs the keys
            'naics_code' and aggregate_var
    aggregate_var = string - key in input whose values are grouped
    aggregate_tiers = list of ints - which naics tiers to include in grouping,
                      e.g. [4,6] means group only 4 and 6 digit naics
    output_lvl = int - the desired number of digits for the high-level parent
    output = dict mapping each parent code (the first output_lvl characters of
             'naics_code') to the grouped values, concatenated in input order
             with a single space appended after each value
    """
    grouped = {}
    for record in input:
        code = record['naics_code']
        value = record[aggregate_var]
        # only codes whose length is one of the requested tiers take part
        if len(code) not in aggregate_tiers:
            continue
        parent = code[:output_lvl]
        grouped[parent] = grouped.get(parent, "") + value + " "
    return grouped


In [None]:
#exception_list = ['31-33','44-45','48-49']
#tier3_dict2 = []
#for i in tier3_dict:
#    if i['naics_code'] not in exception_list:
#        tier3_dict2.append(i)
        

In [85]:
# grab 3 digit data: one {3 digit parent: concatenated text} dict per field
agg_naics_code = groupby_parent(tier3_dict, 'naics_code', [3,4,5,6], 3)
agg_naics_title = groupby_parent(tier3_dict, 'naics_title', [3,4,5,6], 3)
agg_conceptual_def = groupby_parent(tier3_dict, 'conceptual_def', [3,4,5,6], 3)
# constituent items are aggregated from 6 digit naics only
agg_constituent_items = groupby_parent(tier3_dict, 'constituent_items', [6], 3)

# consolidate
ds = [agg_naics_code, agg_naics_title, agg_conceptual_def, agg_constituent_items]

txt_data_split = []
fields = ['naics_code_parent','agg_naics_code','agg_naics_title','agg_conceptual_def','agg_constituent_items']
# iterating the dict directly yields its keys on Python 2 and Python 3 alike
for k in agg_naics_code:
    # agg_constituent_items only has entries for parents with at least one
    # 6 digit child, so fall back to "" instead of raising KeyError when a
    # parent is missing from one of the aggregates.
    values = [k] + [agg.get(k, "") for agg in ds]
    txt_data_split.append(dict(zip(fields,values)))

In [117]:
# display one consolidated record as a spot check
txt_data_split[3]

{'agg_conceptual_def': u"The Postal Service subsector includes the activities of the National Post Office and its subcontractors in delivering letters and small parcels, normally without pick-up at the sender's location. These articles can be described as those that can be handled by one person without using special equipment. This allows the collection, pick-up, and delivery operations to be done with limited labor costs and minimal equipment. Sorting and transportation activities, where necessary, are generally mechanized. The restriction to small parcels distinguishes these establishments from those in the transportation industries.   This industry comprises establishments primarily engaged in operating the National Postal Service. Establishments primarily engaged in performing one or more postal services, such as sorting, routing, and/or delivery, on a contract basis (except the bulk transportation of mail) are included in this industry. ",
 'agg_constituent_items': u' Postal deliv

<a id='Step4'></a>
**Step 4 - construct a noun dictionary for each 3-digit naics**

In [None]:
# for consistency this is almost identical to Chris's code. tocorpus is modified
def get_nouns(text):
    """Return the tokens of the lower-cased text that are tagged exactly 'NN'.

    The text is lower-cased, then tokenized and tagged by pattern.en's tag().
    Tokens are returned in the order they appear, duplicates included.
    """
    nouns = []
    for token, pos in tag(text.lower(), tokenize=True):
        if pos == 'NN':
            nouns.append(token)
    return nouns

# for each distinct noun in a list of nouns: count its occurrences in the list
def tocorpus(nouns):
    """Count how many times each noun occurs.

    nouns = iterable of hashable items (e.g. the list from get_nouns)
    output = defaultdict(int) mapping noun -> number of occurrences; looking
             up a noun that never occurred gives 0
    """
    # Counter does the tallying; wrapping it keeps the defaultdict(int) return type
    return defaultdict(int, collections.Counter(nouns))

In [119]:
# consolidate strings
# one record per parent naics: its code, its descendants' codes, and the noun
# counts over title + conceptual definition + constituent items
fields = ['naics_code','agg_naics_code','noun_dict']
naics_nouns = []
for entry in txt_data_split:
    parent_code = entry['naics_code_parent']
    member_codes = entry['agg_naics_code']
    combined_text = " ".join([entry['agg_naics_title'],
                              entry['agg_conceptual_def'],
                              entry['agg_constituent_items']])
    # dict(...) turns the defaultdict returned by tocorpus into a plain dict
    noun_counts = dict(tocorpus(get_nouns(combined_text)))
    record = [parent_code, member_codes, noun_counts]
    naics_nouns.append(dict(zip(fields, record)))

In [120]:
# display one noun-dictionary record as a spot check
naics_nouns[3]

{'agg_naics_code': u'491 4911 49111 491110 ',
 'naics_code': u'491',
 'noun_dict': {u'basis': 3,
  u'collection': 1,
  u'contract': 3,
  u'delivery': 4,
  u'equipment': 2,
  u'industry': 2,
  u'labor': 1,
  u'location': 1,
  u'mail': 1,
  u'office': 1,
  u'person': 1,
  u'post': 1,
  u'restriction': 1,
  u'sender': 1,
  u'service': 9,
  u'subsector': 1,
  u'transportation': 3}}

In [127]:
# save to json
# "with" closes the file even if json.dump raises part-way through
with open("data/industry definitions/naics_nouns.json","w") as fd:
    json.dump(naics_nouns, fd)