# collect-employee-data
sarora@air.org<br>
Most recent updates made January 2019<br>

## Description
* First, go through a list of firm names and, via Google's Custom Search API, fetch LinkedIn and other employee data. Save these search results in MongoDB.
* Next parse these results for an estimate of the number of employees a firm has.  
* Finally write output to file

In [None]:
import csv
import re
import pprint
from googleapiclient.discovery import build
import textdistance
import collections
from config import connection_string
from config import username
from config import password
from config import authSource
from config import authMechanism
import pymongo
from urllib.parse import urlparse
from statistics import median
import textdistance
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [2]:
# input data: CSV with one row per firm.  Later cells read row[0] as the firm
# name and row[1] as the firm's url.
f_in = '/Users/sarora/dev/EAGER/data/orgs/depth0.csv'
# newline='' is what the csv module asks for when handed a file object, so that
# line breaks embedded in quoted fields are parsed correctly
with open(f_in, 'r', newline='') as f:
    reader = csv.reader(f)
    firm_url_list = list(reader)

# output data to mongodb (connection settings are imported from the config module)
mdbc = pymongo.MongoClient(connection_string, username=username, password=password, authSource=authSource, authMechanism=authMechanism)
db = mdbc["EAGER"]
col = db["employeeResults"]

# pretty-printer kept around for ad-hoc inspection of search results
pp = pprint.PrettyPrinter()

# destination of the final firm/employee table
f_out = '/Users/sarora/dev/EAGER/data/orgs/emps/depth0+emps.csv'

In [3]:
from enum import Enum 
class FirmSize (Enum):
    """Firm size category derived from an employee count.

    For MICRO_FIRM through US_MEDIUM_FIRM the member value is the exclusive
    upper bound on head-count for that category.  LARGE_FIRM is the
    catch-all for everything at or above the US_MEDIUM_FIRM bound, and
    UNDEFINED is used when no usable count is available.
    """
    UNDEFINED = -1
    MICRO_FIRM = 10
    SMALL_FIRM = 50
    EU_MEDIUM_FIRM = 250
    US_MEDIUM_FIRM = 500
    LARGE_FIRM = 999999

    @classmethod
    def classify_state (cls, size):
        """Map an employee count to its FirmSize category.

        A falsy size (None, 0) yields UNDEFINED.  Otherwise the first
        category whose bound is strictly greater than size is returned,
        falling back to LARGE_FIRM.
        """
        if not size:
            return cls.UNDEFINED
        bounded = (cls.MICRO_FIRM, cls.SMALL_FIRM, cls.EU_MEDIUM_FIRM, cls.US_MEDIUM_FIRM)
        for bucket in bounded:
            if size < bucket.value:
                return bucket
        return cls.LARGE_FIRM

    def __sub__(self, other):
        """Return how many categories apart two members are (0 for a non-FirmSize operand)."""
        if other.__class__ is not self.__class__:
            return 0
        # position in definition order, UNDEFINED first
        ordered = list(FirmSize)
        return abs(ordered.index(self) - ordered.index(other))

    # Ordering compares the underlying bound values and is only defined
    # between two FirmSize members.
    def __ge__(self, other):
        if other.__class__ is not self.__class__:
            return NotImplemented
        return self.value >= other.value

    def __gt__(self, other):
        if other.__class__ is not self.__class__:
            return NotImplemented
        return self.value > other.value

    def __le__(self, other):
        if other.__class__ is not self.__class__:
            return NotImplemented
        return self.value <= other.value

    def __lt__(self, other):
        if other.__class__ is not self.__class__:
            return NotImplemented
        return self.value < other.value
    
# spot-check the classifier with a missing count and a few head-counts
for sample_size in (None, 5, 55, 116, 255, 600):
    print (FirmSize.classify_state (sample_size))

# subtraction gives the number of categories separating two size states
category_gap = FirmSize.US_MEDIUM_FIRM - FirmSize.MICRO_FIRM
print (category_gap)

FirmSize.UNDEFINED
FirmSize.MICRO_FIRM
FirmSize.EU_MEDIUM_FIRM
FirmSize.EU_MEDIUM_FIRM
FirmSize.US_MEDIUM_FIRM
FirmSize.LARGE_FIRM
3


## Get employment data from Google CSE API
Data is pulled from LinkedIn specifically and from the general web (Bloomberg, Forbes, Wikipedia, etc.) in two separate searches.

In [None]:
# 1. Search LinkedIn

# set up google search api 
# NOTE(review): the developer key is committed in source; consider rotating it
# and loading it from the config module like the MongoDB credentials.
service = build("customsearch", "v1",
                developerKey="AIzaSyCGSAZr9pYt_ALZBryuVwwpOjxCR4k8-TQ") 

for row in firm_url_list:
    firm = row[0]
    url = row[1]
    # Strip punctuation and legal-form suffixes from the firm name.  The raw
    # string avoids the invalid "\." escape warning, and the trailing \b stops
    # a suffix from being cut out of a longer word (e.g. " inc" in "Incubator").
    firm_clnd = re.sub(r'\.|,| (?:corporation|incorporated|llc|inc|international|gmbh|ltd)\b', '', firm, flags=re.IGNORECASE).rstrip()
    # if cleaning leaves almost nothing, search on the original name instead
    if len(firm_clnd) <= 2:
        firm_clnd = firm

    # "see all" is the marker the parsing cell uses to recognise LinkedIn queries
    search_term = firm_clnd + ' "see all"'

    # very short names are ambiguous, so add the firm's bare url to the query
    if len(firm_clnd) < 5:
        url_clnd = re.sub(r'(http(s)?://)|/', '', url).lstrip()
        search_term = search_term + ' ' + url_clnd

    print("Working on ", search_term)

    try:
        results = service.cse().siterestrict().list(
            q=search_term,
            cx='007721750960636651249:ad5w3ishg9q',
        ).execute()

        # tag the raw response with the firm so it can be looked up later
        results['firm_name'] = firm
        # pp.pprint (results)

        # insert into mongo
        # NOTE(review): Collection.insert is deprecated in PyMongo 3 and gone in
        # PyMongo 4 -- confirm the installed version before upgrading.
        try:
            col.insert(results, check_keys=False)
        except Exception as e:
            print('\tCannot add search result into mongodb!')
            print(e)

    except Exception as e:
        print('\tCannot search url!')
        print(e)

In [None]:
# 2. Search other sites

# build the Custom Search client used for the general-web queries
service = build("customsearch", "v1",
                developerKey="AIzaSyCGSAZr9pYt_ALZBryuVwwpOjxCR4k8-TQ") 

for row in firm_url_list:
    firm, url = row[0], row[1]

    search_term = firm + " number of \"employees\""
    print("Working on ", search_term)

    # run the query; on any failure report it and move on to the next firm
    try:
        request = service.cse().siterestrict().list(
            q=search_term,
            cx='007721750960636651249:cqqnoxcjdrs',
        )
        results = request.execute()
        # tag the raw response with the firm so it can be looked up later
        results['firm_name'] = firm
    except Exception as e:
        print('\tCannot search url!')
        print(e)
        continue

    # store the tagged response in mongo
    try:
        col.insert(results, check_keys=False)
    except Exception as e:
        print('\tCannot add search result into mongodb!')
        print(e)

## Create tf-idf matrix for firm names
Used below to weight matches from non-LinkedIn results.

In [4]:
# standard firm cleaning regex

def clean_firm_name (firm):
    """Return firm with punctuation and legal-form suffixes removed.

    Removes every '.', ',' and '&', plus the space-prefixed suffixes
    corporation, incorporated, llc, inc, international, gmbh and ltd
    (case-insensitive), then strips trailing whitespace.  A suffix is only
    removed when it is a whole word, so "Foo Incubator" keeps its second
    word instead of becoming "Fooubator".
    """
    # raw string: "\." in a normal string literal is an invalid escape sequence
    firm_clnd = re.sub(r'\.|,|&| (?:corporation|incorporated|llc|inc|international|gmbh|ltd)\b', '', firm, flags=re.IGNORECASE)
    return firm_clnd.rstrip()

# one cleaned name per firm; the set drops names that clean to the same text
clnd_firm_names = list(set([clean_firm_name(rec[0]) for rec in firm_url_list ]))
# pp.pprint (corpus)

# each cleaned firm name is treated as one document
# NOTE(review): TfidfVectorizer's default token pattern only keeps tokens of
# two or more characters, so single-letter name parts get no column here.
ubv = TfidfVectorizer(min_df=0., max_df=1.)
ubv_matrix = ubv.fit_transform(clnd_firm_names)

ubv_matrix = ubv_matrix.toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() when the installed version provides it
if hasattr(ubv, 'get_feature_names_out'):
    vocab = list(ubv.get_feature_names_out())
else:
    vocab = ubv.get_feature_names()
# rows: cleaned firm names, columns: vocabulary words, cells: tf-idf weights
ubv_df = pd.DataFrame(ubv_matrix, columns=vocab)
ubv_df.index = clnd_firm_names

ubv_df

Unnamed: 0,22nd,3d,3m,66,ab,abb,abbott,abbvie,abengoa,ablexis,...,zena,zeno,zephyr,zero,zih,ziptronix,zoetis,zon,zygo,zyvex
Fairchild Semiconductor,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Molecular Rebar Design,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Kimberly-Clark Worldwide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Phoseon Technology,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Lake Lite,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pfizer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ALSTOM Technology,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Masimo Semiconductor,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GOLBA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MicroContinuum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Parse results from mongo
* Look for employment data in linkedin and general web results
* Choose which employee figure to use for a given firm (generally the maximum of the LinkedIn figure and the figure from other sources)
* Correct for subsidiaries by examining domain similarity 
* Write the output to file

In [5]:
# Jaro-Winkler similarity that a title word must exceed to count as a match
# for a firm-name word.
FUZZY_THRESHOLD = .90 # fuzzy match threshold for cleaned firm name word vis-a-vis title result word from bloomberg, etc.
# Minimum share of the firm name's total word weight that must be matched in
# a result title for that result to be used.
MATCH_THRESHOLD = .50 # overall match threshold for firm words in the search result title words

# pattern matches
# li_p: any standalone run of digits; applied to LinkedIn snippets
li_p = re.compile(r'\b\d+\b')
# gw_p1: "<digits> employees", optionally with "consolidated" or
# "(consolidated)" in between; group 1 is the digits
gw_p1 = re.compile(r'(\d+)\s?(\(?consolidated\)?\s)?employees', re.IGNORECASE)
# gw_p2: "employees", then an optional '.' or ':', an optional whitespace
# character and an optional '~', then digits; group 1 is the digits
# NOTE(review): the "+?" after the group looks redundant -- confirm before simplifying
gw_p2 = re.compile(r'employees[\.:]?\s?~?(\d+)+?', re.IGNORECASE)

def ngrams(string):
    """Split string into its word tokens (maximal runs of word characters)."""
    return re.findall(r'\w+', string)

def get_domain (url):
    """Return the lower-cased network location of url without a leading "www.".

    url must include a scheme ("http://...") for urlparse to recognise the
    host; otherwise the result is an empty string.
    """
    domain = urlparse(url.lower()).netloc
    # str.strip('www.') removes any run of 'w' and '.' characters from BOTH
    # ends ("www.wikipedia.org" -> "ikipedia.org", "example.tw" -> "example.t"),
    # so drop only a literal leading "www." instead.
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    return domain

def parse_linkedin_results (res):
    """Return the employee count found in the top-ranked LinkedIn result.

    res is one stored search response.  Results whose link contains
    "/showcase/", "/in/" or "/.../" are ignored.  For each remaining result
    the firm label is the title text before the first '|', and the employee
    count is the first standalone number in the snippet once ',' and '.' have
    been removed (0 if the snippet has no number).

    Returns None when res has no 'items' or every item was ignored; otherwise
    the count of the best-ranked remaining result.
    """
    if 'items' not in res:
        return None

    # firm label -> {"emps", "link", "rank"}.  Kept as a mapping so selection
    # can later use something other than rank order.
    li_dict = collections.defaultdict(dict)

    rank = 1
    for item in res["items"]:
        # a missing field is treated as empty text instead of crashing on None
        link = item.get("link") or ""
        if "/showcase/" in link or "/in/" in link or "/.../" in link:
            continue

        snippet = item.get("snippet") or ""
        title = item.get("title") or ""  # formatted "company name | LinkedIn"
        li_firm = title.split('|')[0].rstrip()

        # Keep only the first (best-ranked) hit per firm label; letting a later
        # duplicate overwrite it would discard the top result.
        if li_firm not in li_dict:
            snippet_clnd = re.sub("[,.]", "", snippet)
            emps = li_p.findall(snippet_clnd)
            li_dict[li_firm]["emps"] = int(emps[0]) if emps else 0
            li_dict[li_firm]["link"] = link
            li_dict[li_firm]["rank"] = rank
        rank += 1

    if not li_dict:
        return None

    # currently selects by search-engine rank
    match_key = min(li_dict, key=lambda firm: li_dict[firm]['rank'])
    return li_dict[match_key]["emps"]
    
def _firm_word_weights (firm_name_clnd, firm_words):
    """Pair each firm-name word with its tf-idf weight from ubv_df (1 when it has none)."""
    weights = []
    for fw in firm_words:
        try:
            tf_idf_score = ubv_df.loc[firm_name_clnd][fw.lower()]
        except KeyError:
            # no row for this firm or no column for this word
            tf_idf_score = 1
            print ('\tCannot get tf_idf score for fw:', fw)
        weights.append((fw, tf_idf_score))
    return weights


def parse_general_web_results (res, firm_name):
    """Collect employee counts for firm_name from one general-web search response.

    A result is used only if its title matches the firm name well enough:
    each word of the cleaned firm name longer than two characters that fuzzy
    matches a title word (Jaro-Winkler above FUZZY_THRESHOLD) contributes its
    tf-idf weight, and the matched share of the total weight must reach
    MATCH_THRESHOLD.  Forbes results are used only when the title contains
    "on the Forbes"; Bloomberg stock-news results are always skipped.

    For each accepted result the snippet is searched with gw_p1 and gw_p2,
    and the largest count seen per result domain is kept.

    Returns None when res has no 'items'; otherwise the per-domain counts as
    a sorted list (possibly empty).
    """
    if 'items' not in res:
        return None

    firm_name_clnd = clean_firm_name(firm_name)
    # the weights depend only on the firm, so look them up once, not per result
    word_weights = _firm_word_weights(firm_name_clnd, ngrams(firm_name_clnd))

    # the small starting value keeps the ratio defined for a name with no words
    # NOTE(review): words of one or two characters add to this total but can
    # never be matched below, which lowers the achievable ratio for names such
    # as "D-Wave Systems" -- confirm this is intended.
    total_rate = 0.001
    for _, weight in word_weights:
        total_rate += weight

    match_dict = collections.defaultdict(int) # key is domain, value is count of employees

    for item in res["items"]:
        # a missing field is treated as empty text instead of crashing on None
        link = item.get("link") or ""
        snippet = item.get("snippet") or ""
        title = item.get("title") or ""

        snippet_clnd = re.sub("[,.\n\r\t]", "", snippet)
        title_clnd = re.split("[|:-]", title)[0] # title delimiters for hoovers, wikipedia, bloomberg, forbes
        title_clnd = title_clnd.replace('Company Profile', '') # for hoovers

        # ignore forbes articles
        if re.search (r'forbes\.com', link, re.IGNORECASE):
            if re.search ('on the Forbes', title_clnd):
                title_clnd = re.split ("on the Forbes", title_clnd)[0]
            else:
                continue

        # ignore news articles from bloomberg
        if re.search (r'www\.bloomberg\.com/research/stocks/news/', link, re.IGNORECASE):
            continue

        title_words = [word.lower() for word in title_clnd.split()]

        # weight of the firm-name words that fuzzy match some title word
        match = 0.0
        for fw, weight in word_weights:
            if len(fw) <= 2:
                continue
            fw_lower = fw.lower()
            if any(textdistance.jaro_winkler(fw_lower, tw) > FUZZY_THRESHOLD for tw in title_words):
                match += weight

        if match/total_rate < MATCH_THRESHOLD:
            continue

        domain = get_domain(link)

        # check for number of employees with two different regex patterns and
        # keep the largest figure seen for this domain
        for pattern in (gw_p1, gw_p2):
            emp_match = pattern.search(snippet_clnd)
            if emp_match:
                match_dict[domain] = max(match_dict[domain], int(emp_match.group(1)))

    return sorted(match_dict.values())

In [6]:
# try the employee-count patterns on hand-written snippets
# (the last one keeps its comma, so only the digits before it are captured)
pattern_samples = [
    (gw_p1, "Nippon has 254 consolidated employees"),
    (gw_p1, "Nippon has 254 (consolidated) employees"),
    (gw_p1, "Nippon has 254 employees"),
    (gw_p2, "Nippon has employees: 3000"),
    (gw_p2, "Number of employees. 34,500"),
]
for pattern, text in pattern_samples:
    print (re.search(pattern, text))

# the bloomberg news filter should fire on a stock-news article url
bloomberg_news_url = 'https://www.bloomberg.com/research/stocks/news/article.asp?docKey=600-201810271000KRTRIB__BUSNEWS_50545_12615-1&ex=true&ticker=ALQA'
if re.search (r'www.bloomberg.com/research/stocks/news/', bloomberg_news_url, re.IGNORECASE):
    print ('Match')

<_sre.SRE_Match object; span=(11, 37), match='254 consolidated employees'>
<_sre.SRE_Match object; span=(11, 39), match='254 (consolidated) employees'>
<_sre.SRE_Match object; span=(11, 24), match='254 employees'>
<_sre.SRE_Match object; span=(11, 26), match='employees: 3000'>
<_sre.SRE_Match object; span=(10, 23), match='employees. 34'>
Match


In [7]:
# fuzzy-match a firm word against the words of a sample title
title_sample = ['Achushnet', 'Company']
similarities = [textdistance.jaro_winkler('Achushnet'.lower(), word.lower()) for word in title_sample]
match_fuzzy = [score for score in similarities if score > 0.9]
print ('Match' if match_fuzzy else 'Not a match')

# similarity between a misspelt and a correctly spelt name
print (textdistance.jaro_winkler('Achushnet', 'Acushnet'))

Match
0.9203703703703703


In [13]:
# read the stored search results back from mongo and derive, per firm, the
# LinkedIn count, the median general-web count and the resulting size state

header_row = ["firm_name", "url", "li_emps", "median_gw_emps", "emp_diff", "max_emps", "size_state", "state_diff"]
output = [header_row]

for row in firm_url_list:
    firm_name, url = row[0], row[1]

    print("Working on ", firm_name)
    # exact, case-insensitive match on the stored firm name
    name_pattern = re.compile('^' + re.escape(firm_name) + '$', re.IGNORECASE)
    results = col.find({"firm_name": name_pattern}).sort("depth", pymongo.ASCENDING)

    li_emps = None
    median_gw_emps = None
    for result in results: # should just be two
        search_terms = result['queries']['request'][0]['searchTerms']
        # queries without "see all" are the general-web searches
        if not re.search ("see all", search_terms):
            gw_emps = parse_general_web_results (result, firm_name)
            if gw_emps:
                print('\t', gw_emps)
                median_gw_emps = median(gw_emps)
            continue
        ret_emps = parse_linkedin_results(result)
        if ret_emps:
            li_emps = ret_emps

    # rules for what constitutes a small, medium, and large firm
    li_state = FirmSize.classify_state (li_emps)
    gw_state = FirmSize.classify_state (median_gw_emps)

    if li_emps and median_gw_emps:
        # both sources available: keep the larger, record the disagreement
        max_emps = max (li_emps, median_gw_emps)
        emp_diff = abs(li_emps - median_gw_emps)
    else:
        # at most one source available: use whichever exists
        max_emps = li_emps if li_emps else median_gw_emps
        emp_diff = None

    state_diff = li_state - gw_state
    size_state = max(li_state, gw_state)

    record = [firm_name, url, li_emps, median_gw_emps, emp_diff, max_emps, size_state, state_diff]
    output.append(record)
    # progress line: everything in the record except the url
    print ('\t', firm_name, *record[2:])


Working on  Two Blades Foundation
	 Two Blades Foundation 6 None None 6 FirmSize.MICRO_FIRM 1
Working on  3M Innovative Properties Company
	 3M Innovative Properties Company 60593 None None 60593 FirmSize.LARGE_FIRM 5
Working on  Advanced Aqua Group
	 Advanced Aqua Group 3 None None 3 FirmSize.MICRO_FIRM 1
Working on  ABB AB
	 ABB AB 100487 None None 100487 FirmSize.LARGE_FIRM 5
Working on  AbbVie Inc.
	 [29000, 29000, 29000, 30000]
	 AbbVie Inc. 26260 29000.0 2740.0 29000.0 FirmSize.LARGE_FIRM 0
Working on  Google Inc.
	 [85050]
	 Google Inc. 8 85050 85042 85050 FirmSize.LARGE_FIRM 4
Working on  Abengoa Bioenergy New Technologies
	 [154]
	 Abengoa Bioenergy New Technologies 5799 154 5645 5799 FirmSize.LARGE_FIRM 2
Working on  Ablexis
	 Ablexis 2 None None 2 FirmSize.MICRO_FIRM 1
Working on  SII Semiconductor Corporation
	 SII Semiconductor Corporation 3 None None 3 FirmSize.MICRO_FIRM 1
Working on  ACACIA RESEARCH GROUP LLC
	 [13]
	 ACACIA RESEARCH GROUP LLC 2 13 11 13 FirmSize.SMALL_

	 [943]
	 Arrowhead Center 9 943 934 943 FirmSize.LARGE_FIRM 4
Working on  ASCENT SOLAR TECHNOLOGIES
	 [71, 150]
	 ASCENT SOLAR TECHNOLOGIES 82 110.5 28.5 110.5 FirmSize.EU_MEDIUM_FIRM 0
Working on  ASM America
	 [1503, 1670]
	 ASM America 1236 1586.5 350.5 1586.5 FirmSize.LARGE_FIRM 0
Working on  ASM IP HOLDING B.V.
	 ASM IP HOLDING B.V. None None None None FirmSize.UNDEFINED 0
Working on  ASM IP Holdings LLC
	 ASM IP Holdings LLC 14940 None None 14940 FirmSize.LARGE_FIRM 5
Working on  ASML Netherlands B.V.
	 ASML Netherlands B.V. 18233 None None 18233 FirmSize.LARGE_FIRM 5
Working on  Astech
	 [12, 4437]
	 Astech 3 2224.5 2221.5 2224.5 FirmSize.LARGE_FIRM 4
Working on  Advanced Cell Technology
	 Advanced Cell Technology 7 None None 7 FirmSize.MICRO_FIRM 1
Working on  AstenJohnson
	 [80]
	 AstenJohnson 528 80 448 528 FirmSize.LARGE_FIRM 2
Working on  ASTUTE MEDICAL
	 ASTUTE MEDICAL 59 None None 59 FirmSize.EU_MEDIUM_FIRM 3
Working on  Astex Pharmaceuticals
	 [136]
	 Astex Pharmaceutic

	 Braun Intertec Geothermal 796 None None 796 FirmSize.LARGE_FIRM 5
Working on  Brewer Science Inc.
	 [300]
	 Brewer Science Inc. 358 300 58 358 FirmSize.US_MEDIUM_FIRM 0
Working on  Bridgelux
	 Bridgelux 148 None None 148 FirmSize.EU_MEDIUM_FIRM 3
Working on  Bridgestone Corporation
	 [142669]
	 Bridgestone Corporation 1241 142669 141428 142669 FirmSize.LARGE_FIRM 0
Working on  Brightfield Transportation Solutions
	 Brightfield Transportation Solutions 6 None None 6 FirmSize.MICRO_FIRM 1
Working on  BRIGHTLEAF TECHNOLOGIES INC.
	 BRIGHTLEAF TECHNOLOGIES INC. 31 None None 31 FirmSize.SMALL_FIRM 2
Working on  BROADCOM CORPORATION
	 [14000, 15000]
	 BROADCOM CORPORATION 12 14500.0 14488.0 14500.0 FirmSize.LARGE_FIRM 3
Working on  Brother International Corporation
	 [340, 7100, 33118]
	 Brother International Corporation 4 7100 7096 7100 FirmSize.LARGE_FIRM 4
Working on  Bruin Biometrics
	 Bruin Biometrics 24 None None 24 FirmSize.SMALL_FIRM 2
Working on  Brookhaven Science Associates LLC 

	 Courtagen Life Sciences 16 None None 16 FirmSize.SMALL_FIRM 2
Working on  COVERIS FLEXIBLES US LLC
	 [12]
	 COVERIS FLEXIBLES US LLC 14 12 2 14 FirmSize.SMALL_FIRM 0
Working on  Chevron Phillips Chemical Company LP
	 [600, 5000, 5000]
	 Chevron Phillips Chemical Company LP 3923 5000 1077 5000 FirmSize.LARGE_FIRM 0
Working on  CP KELCO APS
	 CP KELCO APS 1249 None None 1249 FirmSize.LARGE_FIRM 5
Working on  CP Kelco U.S.
	 CP Kelco U.S. 1249 None None 1249 FirmSize.LARGE_FIRM 5
Working on  Community Power Corporation
	 Community Power Corporation 15 None None 15 FirmSize.SMALL_FIRM 2
Working on  Cedar Ridge Research
	 Cedar Ridge Research 9 None None 9 FirmSize.MICRO_FIRM 1
Working on  Cree
	 [6387]
	 Cree 3034 6387 3353 6387 FirmSize.LARGE_FIRM 0
Working on  Crestovo LLC
	 Crestovo LLC None None None None FirmSize.UNDEFINED 0
Working on  Centre de Recherche Industrielle du Quebec
	 Centre de Recherche Industrielle du Quebec 125 None None 125 FirmSize.EU_MEDIUM_FIRM 3
Working on  Cris

	 Pioneer Hi-Bred International 6 None None 6 FirmSize.MICRO_FIRM 1
Working on  D-Wave Systems Inc.
	Cannot get tf_idf score for fw: D
	Cannot get tf_idf score for fw: D
	Cannot get tf_idf score for fw: D
	Cannot get tf_idf score for fw: D
	Cannot get tf_idf score for fw: D
	Cannot get tf_idf score for fw: D
	Cannot get tf_idf score for fw: D
	Cannot get tf_idf score for fw: D
	Cannot get tf_idf score for fw: D
	 D-Wave Systems Inc. 184 None None 184 FirmSize.EU_MEDIUM_FIRM 3
Working on  Dymax Corporation
	 [59]
	 Dymax Corporation 191 59 132 191 FirmSize.EU_MEDIUM_FIRM 0
Working on  Dynamic Solutions Worldwide
	 Dynamic Solutions Worldwide 23 None None 23 FirmSize.SMALL_FIRM 2
Working on  Easel Biotechnologies
	 Easel Biotechnologies 311 None None 311 FirmSize.US_MEDIUM_FIRM 4
Working on  The Eastern Co.
	 The Eastern Co. 27 None None 27 FirmSize.SMALL_FIRM 2
Working on  Eastman Chemical Company
	 [14000, 14000, 14000]
	 Eastman Chemical Company 8938 14000 5062 14000 FirmSize.LARGE_FI

	 [858, 78150, 79000]
	 FUJIFILM Corporation 108 78150 78042 78150 FirmSize.LARGE_FIRM 2
Working on  FUJIFILM Dimatix
	 FUJIFILM Dimatix 413 None None 413 FirmSize.US_MEDIUM_FIRM 4
Working on  Fujitsu Limited
	 [140000]
	 Fujitsu Limited 2070 140000 137930 140000 FirmSize.LARGE_FIRM 0
Working on  Fujitsu Semiconductor Limited
	 Fujitsu Semiconductor Limited 167 None None 167 FirmSize.EU_MEDIUM_FIRM 3
Working on  Fuji Xerox Co.
	 [8023]
	 Fuji Xerox Co. 314 8023 7709 8023 FirmSize.LARGE_FIRM 1
Working on  FULL CIRCLE BIOCHAR
	 FULL CIRCLE BIOCHAR 2 None None 2 FirmSize.MICRO_FIRM 1
Working on  Furukawa Electric Co.
	 [45425, 51925, 51925]
	 Furukawa Electric Co. 215 51925 51710 51925 FirmSize.LARGE_FIRM 2
Working on  fybr
	 fybr 23 None None 23 FirmSize.SMALL_FIRM 2
Working on  Galectin Therapeutics
	 [7]
	 Galectin Therapeutics 54 7 47 54 FirmSize.EU_MEDIUM_FIRM 2
Working on  Galemed Corporation
	 Galemed Corporation 33 None None 33 FirmSize.SMALL_FIRM 2
Working on  GangaGen
	 GangaGen

	 HGST NETHERLANDS B.V. 4913 None None 4913 FirmSize.LARGE_FIRM 5
Working on  HIQ SOLAR
	 HIQ SOLAR 15 None None 15 FirmSize.SMALL_FIRM 2
Working on  Hitachi High-Technologies Corporation
	 [10898, 10898]
	 Hitachi High-Technologies Corporation 62 10898.0 10836.0 10898.0 FirmSize.LARGE_FIRM 2
Working on  Hitachi
	 [303887, 335244]
	 Hitachi 10457 319565.5 309108.5 319565.5 FirmSize.LARGE_FIRM 0
Working on  Hitachi Metals
	 [2103, 30390]
	 Hitachi Metals 276 16246.5 15970.5 16246.5 FirmSize.LARGE_FIRM 1
Working on  The Henry M. Jackson Foundation for the Advancement of Military Medicine
	Cannot get tf_idf score for fw: M
	Cannot get tf_idf score for fw: M
	Cannot get tf_idf score for fw: M
	Cannot get tf_idf score for fw: M
	Cannot get tf_idf score for fw: M
	Cannot get tf_idf score for fw: M
	Cannot get tf_idf score for fw: M
	Cannot get tf_idf score for fw: M
	Cannot get tf_idf score for fw: M
	Cannot get tf_idf score for fw: M
	 [250]
	 The Henry M. Jackson Foundation for the Advance

	 [12000]
	 International Technology Center 10 12000 11990 12000 FirmSize.LARGE_FIRM 3
Working on  ITN Energy Systems
	 ITN Energy Systems 28 None None 28 FirmSize.SMALL_FIRM 2
Working on  Industrial Technology Research Institute
	 [6000]
	 Industrial Technology Research Institute 6 6000 5994 6000 FirmSize.LARGE_FIRM 4
Working on  Illinois Tool Works
	 [400, 50000, 50000, 50000]
	 Illinois Tool Works 5604 50000.0 44396.0 50000.0 FirmSize.LARGE_FIRM 0
Working on  Ivoclar Vivadent AG
	 Ivoclar Vivadent AG 1269 None None 1269 FirmSize.LARGE_FIRM 5
Working on  JOLED Inc.
	 JOLED Inc. 4 None None 4 FirmSize.MICRO_FIRM 1
Working on  JAC Products Inc.
	 [36, 6511]
	 JAC Products Inc. 629 3273.5 2644.5 3273.5 FirmSize.LARGE_FIRM 0
Working on  Janssen Biotech
	 [1200, 28000]
	 Janssen Biotech 17858 14600.0 3258.0 17858 FirmSize.LARGE_FIRM 0
Working on  The Jackson Laboratory
	 [1350, 2100]
	 The Jackson Laboratory None 1725.0 None 1725.0 FirmSize.LARGE_FIRM 5
Working on  J. E. WHITE
	Cannot get

	 [100, 78277, 78277]
	 MAHLE International GmbH 8 78277 78269 78277 FirmSize.LARGE_FIRM 4
Working on  Mainstream Engineering Corp.
	 Mainstream Engineering Corp. 86 None None 86 FirmSize.EU_MEDIUM_FIRM 3
Working on  Manta Instruments
	 Manta Instruments 8 None None 8 FirmSize.MICRO_FIRM 1
Working on  Marathon Equipment Company
	 Marathon Equipment Company 44 None None 44 FirmSize.SMALL_FIRM 2
Working on  Marine Biotech Inc.
	 [36]
	 Marine Biotech Inc. 9 36 27 36 FirmSize.SMALL_FIRM 1
Working on  Marine Polymer Technologies
	 [35]
	 Marine Polymer Technologies 34 35 1 35 FirmSize.SMALL_FIRM 0
Working on  Markel Corporation
	 [15600, 15600, 15600, 15600]
	 Markel Corporation 2385 15600.0 13215.0 15600.0 FirmSize.LARGE_FIRM 0
Working on  Masimo Semiconductor
	 [3500]
	 Masimo Semiconductor 1966 3500 1534 3500 FirmSize.LARGE_FIRM 0
Working on  Materia
	 [120]
	 Materia 81 120 39 120 FirmSize.EU_MEDIUM_FIRM 0
Working on  Johnson Matthey PLC
	 [12214, 12306, 12348, 14130]
	 Johnson Matthey

	 Nthdegree Technologies Worldwide Inc. 19 None None 19 FirmSize.SMALL_FIRM 2
Working on  New England Biolabs
	 [30, 350]
	 New England Biolabs 490 190.0 300.0 490 FirmSize.US_MEDIUM_FIRM 1
Working on  NEC Corporation
	 [190, 100914, 109390]
	 NEC Corporation 8430 100914 92484 100914 FirmSize.LARGE_FIRM 0
Working on  NeoPhotonics Corporation
	 [1783]
	 NeoPhotonics Corporation 526 1783 1257 1783 FirmSize.LARGE_FIRM 0
Working on  NeoVision LLC
	 NeoVision LLC 3 None None 3 FirmSize.MICRO_FIRM 1
Working on  Nestec S. A.
	Cannot get tf_idf score for fw: S
	Cannot get tf_idf score for fw: A
	Cannot get tf_idf score for fw: S
	Cannot get tf_idf score for fw: A
	Cannot get tf_idf score for fw: S
	Cannot get tf_idf score for fw: A
	Cannot get tf_idf score for fw: S
	Cannot get tf_idf score for fw: A
	Cannot get tf_idf score for fw: S
	Cannot get tf_idf score for fw: A
	Cannot get tf_idf score for fw: S
	Cannot get tf_idf score for fw: A
	Cannot get tf_idf score for fw: S
	Cannot get tf_idf sc

	 Pardev None None None None FirmSize.UNDEFINED 0
Working on  Parion Sciences
	 Parion Sciences 19 None None 19 FirmSize.SMALL_FIRM 2
Working on  Parker-Hannifin Corporation
	 Parker-Hannifin Corporation 19843 None None 19843 FirmSize.LARGE_FIRM 5
Working on  PAX Scientific
	 [974]
	 PAX Scientific 10 974 964 974 FirmSize.LARGE_FIRM 3
Working on  The Paymaster Corporation
	 The Paymaster Corporation 2 None None 2 FirmSize.MICRO_FIRM 1
Working on  Professional Compounding Centers of America
	 Professional Compounding Centers of America None None None None FirmSize.UNDEFINED 0
Working on  PDF Solutions
	 PDF Solutions 479 None None 479 FirmSize.US_MEDIUM_FIRM 4
Working on  Pegatron Corporation
	 [5646, 177948]
	 Pegatron Corporation 5060 91797.0 86737.0 91797.0 FirmSize.LARGE_FIRM 0
Working on  PELLION TECHNOLOGIES
	 PELLION TECHNOLOGIES 22 None None 22 FirmSize.SMALL_FIRM 2
Working on  Pendar Technologies
	 Pendar Technologies 25 None None 25 FirmSize.SMALL_FIRM 2
Working on  Pentron Cl

	 [400]
	 Roche Molecular Systems 70192 400 69792 70192 FirmSize.LARGE_FIRM 1
Working on  Roche Diabetes Care
	 [350]
	 Roche Diabetes Care 213 350 137 350 FirmSize.US_MEDIUM_FIRM 1
Working on  Roche Innovation Center Copenhagen A/S
	Cannot get tf_idf score for fw: A
	Cannot get tf_idf score for fw: S
	Cannot get tf_idf score for fw: A
	Cannot get tf_idf score for fw: S
	Cannot get tf_idf score for fw: A
	Cannot get tf_idf score for fw: S
	Cannot get tf_idf score for fw: A
	Cannot get tf_idf score for fw: S
	Cannot get tf_idf score for fw: A
	Cannot get tf_idf score for fw: S
	Cannot get tf_idf score for fw: A
	Cannot get tf_idf score for fw: S
	Cannot get tf_idf score for fw: A
	Cannot get tf_idf score for fw: S
	Cannot get tf_idf score for fw: A
	Cannot get tf_idf score for fw: S
	Cannot get tf_idf score for fw: A
	Cannot get tf_idf score for fw: S
	Cannot get tf_idf score for fw: A
	Cannot get tf_idf score for fw: S
	 Roche Innovation Center Copenhagen A/S 16 None None 16 FirmSize.S

	 SK Hynix Inc. 143 None None 143 FirmSize.EU_MEDIUM_FIRM 3
Working on  Skidmore
	 [30, 246]
	 Skidmore 1409 138.0 1271.0 1409 FirmSize.LARGE_FIRM 2
Working on  SK Telecom Co.
	 SK Telecom Co. 8145 None None 8145 FirmSize.LARGE_FIRM 5
Working on  Schlumberger Technology Corporation
	 [1957, 100000]
	 Schlumberger Technology Corporation 36 50978.5 50942.5 50978.5 FirmSize.LARGE_FIRM 3
Working on  Smart Planet Technologies
	 Smart Planet Technologies 18 None None 18 FirmSize.SMALL_FIRM 2
Working on  Surebeam Corporation
	 [130]
	 Surebeam Corporation 3 130 127 130 FirmSize.EU_MEDIUM_FIRM 2
Working on  Smith & Nephew
	 [1849, 15000, 15000, 15933]
	 Smith & Nephew 9975 15000.0 5025.0 15000.0 FirmSize.LARGE_FIRM 0
Working on  SNS NANO FIBER TECHNOLOGY
	 SNS NANO FIBER TECHNOLOGY 8 None None 8 FirmSize.MICRO_FIRM 1
Working on  SB ENERGY
	 SB ENERGY 94 None None 94 FirmSize.EU_MEDIUM_FIRM 3
Working on  SOL-ELECTRICA
	 SOL-ELECTRICA None None None None FirmSize.UNDEFINED 0
Working on  Sola U.S

	 [81519, 394998, 394998, 400875]
	 Tata Consultancy Services Limited 380007 394998.0 14991.0 394998.0 FirmSize.LARGE_FIRM 0
Working on  Tyco Electronics Corporation
	 [69000]
	 Tyco Electronics Corporation 24819 69000 44181 69000 FirmSize.LARGE_FIRM 0
Working on  Tyco Electronics U.K. LTD
	 Tyco Electronics U.K. LTD 24819 None None 24819 FirmSize.LARGE_FIRM 5
Working on  EXOS LLC
	 [551, 5000]
	 EXOS LLC 471 2775.5 2304.5 2775.5 FirmSize.LARGE_FIRM 1
Working on  TECNIUM
	 TECNIUM 13 None None 13 FirmSize.SMALL_FIRM 2
Working on  TEKNOR APEX COMPANY
	 [450]
	 TEKNOR APEX COMPANY 769 450 319 769 FirmSize.LARGE_FIRM 1
Working on  Tokyo Electron Limited
	 [11946, 11946, 12304]
	 Tokyo Electron Limited 2343 11946 9603 11946 FirmSize.LARGE_FIRM 0
Working on  Tela Innovations
	 Tela Innovations 15 None None 15 FirmSize.SMALL_FIRM 2
Working on  Teledyne Scientific & Imaging
	 Teledyne Scientific & Imaging 210 None None 210 FirmSize.EU_MEDIUM_FIRM 3
Working on  Cetac Technologies Inc.
	 Cetac 

	 [202797, 204700, 205000]
	 United Technologies Corporation 7892 204700 196808 204700 FirmSize.LARGE_FIRM 0
Working on  Hamilton Sundstrand Corporation
	 [800, 17158]
	 Hamilton Sundstrand Corporation 2490 8979.0 6489.0 8979.0 FirmSize.LARGE_FIRM 0
Working on  UVIC INDUSTRY PARTNERSHIPS INC.
	 UVIC INDUSTRY PARTNERSHIPS INC. 26 None None 26 FirmSize.SMALL_FIRM 2
Working on  UWM Research Foundation
	 UWM Research Foundation 63 None None 63 FirmSize.EU_MEDIUM_FIRM 3
Working on  Vadient Optics
	 Vadient Optics 2 None None 2 FirmSize.MICRO_FIRM 1
Working on  Abbott Cardiovascular Systems Inc.
	 [652]
	 Abbott Cardiovascular Systems Inc. 706 652 54 706 FirmSize.LARGE_FIRM 0
Working on  Vascular BioSciences
	 Vascular BioSciences 9 None None 9 FirmSize.MICRO_FIRM 1
Working on  Vaxiion Therapeutics
	 Vaxiion Therapeutics 5 None None 5 FirmSize.MICRO_FIRM 1
Working on  VEECO PRECISION SURFACE PROCESSING LLC
	 [90]
	 VEECO PRECISION SURFACE PROCESSING LLC 67 90 23 90 FirmSize.EU_MEDIUM_FIRM 0


In [15]:
# detach the header row from the collected rows, then build the table
headers = output[0]
del output[0]
output_df = pd.DataFrame(data=output, columns=headers)
output_df

Unnamed: 0,firm_name,url,li_emps,median_gw_emps,emp_diff,max_emps,size_state,state_diff
0,Two Blades Foundation,2blades.org/,6.0,,,6.0,FirmSize.MICRO_FIRM,1
1,3M Innovative Properties Company,3m.com/,60593.0,,,60593.0,FirmSize.LARGE_FIRM,5
2,Advanced Aqua Group,aadvancedaqua.com/,3.0,,,3.0,FirmSize.MICRO_FIRM,1
3,ABB AB,abb.com/,100487.0,,,100487.0,FirmSize.LARGE_FIRM,5
4,AbbVie Inc.,abbvie.com/,26260.0,29000.0,2740.0,29000.0,FirmSize.LARGE_FIRM,0
5,Google Inc.,abc.xyz/,8.0,85050.0,85042.0,85050.0,FirmSize.LARGE_FIRM,4
6,Abengoa Bioenergy New Technologies,abengoa.com/,5799.0,154.0,5645.0,5799.0,FirmSize.LARGE_FIRM,2
7,Ablexis,ablexis.com/,2.0,,,2.0,FirmSize.MICRO_FIRM,1
8,SII Semiconductor Corporation,ablic.com/en/semicon/,3.0,,,3.0,FirmSize.MICRO_FIRM,1
9,ACACIA RESEARCH GROUP LLC,acaciaresearch.com/,2.0,13.0,11.0,13.0,FirmSize.SMALL_FIRM,1


In [20]:
# correct for subsidiaries by mapping to base domains, i.e., make employee measures consistent using the largest entity
def get_firm_base_domain (url):
    """Return the lower-cased host of a firm url without a leading "www.".

    url may be given without a scheme (e.g. "abb.com/"); "http://" is
    prepended in that case so that urlparse can recognise the host.
    """
    fixed_url = url.lower()
    # only add a scheme when one is missing; doubling it would break the parse
    if not fixed_url.startswith(('http://', 'https://')):
        fixed_url = 'http://' + fixed_url
    domain = urlparse(fixed_url).netloc
    # str.strip('www.') removes any run of 'w' and '.' characters from BOTH
    # ends ("www.wow.com" -> "ow.com"), so drop only a literal leading "www."
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    return domain

# NOTE: output_copy is another name for output_df, not a copy, so the columns
# added below also appear on output_df.
output_copy = output_df
output_copy['base_domain'] = output_copy['url'].apply(get_firm_base_domain)
# domains shared by more than one firm name are treated as one corporate group
grouped = output_copy.groupby('base_domain')['firm_name'].nunique()
dups = list(grouped[grouped > 1].index)
output_copy['subsidiary_adjust'] = 0

for dup in dups:
    same_domain = output_copy['base_domain'] == dup
    # get max employee size and state
    # FirmSize members order through their own comparison methods, so take the
    # builtin max over a plain list of them
    max_size = max(output_copy.loc[same_domain, 'size_state'].tolist())
    # Series.max() skips missing values; the builtin max over raw values would
    # return NaN whenever the group's first row has no employee figure
    max_emp = output_copy.loc[same_domain, 'max_emps'].max()
    # set all size values to max_size
    output_copy.loc[same_domain, 'size_state'] = max_size
    output_copy.loc[same_domain, 'max_emps'] = max_emp
    output_copy.loc[same_domain, 'subsidiary_adjust'] = 1

In [21]:
# write the final table to f_out as CSV, leaving out the DataFrame index column
output_copy.to_csv(f_out, index=False)