In [1]:
# Intended to assess patent intensity and R&D intensity for Web of Innovation panel
# Sanjay K Arora
# June 2019

import pprint
import sys
import pprint
import csv
import pandas as pd
import re
import os
from collections import defaultdict 

In [17]:
# set this home data dir
BASE_DATA_DIR = "/Users/kg284kt/dev/EAGER/data/"

in_path = BASE_DATA_DIR + 'orgs/parsed_page_output/' # where files are stored
out_file = BASE_DATA_DIR + 'analysis/measures/keyword_counts_v2.csv'

files = os.listdir(in_path)
files = [item for item in files if item.endswith('.txt')]

pp = pprint.PrettyPrinter()

In [3]:
#generate pooled text and list of processed documents 
combined_text = []
documents_dict = {}
for i in range(len(files)):
    with open(in_path + files[i], 'r') as my_file:
        text = my_file.readlines() # this is each separate paragraph including headers etc.
        for k in ["\n", "\t", " "]: # remove the end of sentence tags
            text = [item.strip(k) for item in text]
        text = list(filter(lambda x : not x in [' ', '', '\t'],text)) #remove the paragraphs that are just blank space
        documents_dict[files[i]] = text #make a list of lists

In [4]:
pp.pprint (documents_dict['PolyOne.txt'][0:10])

['EN',
 'English',
 '简体中文',
 'pусский',
 '日本語',
 '+1-866-737-2066',
 '+1-866-737-2066',
 'COMPANY',
 '|',
 'INVESTORS']


In [5]:
# pattern regex to remove unwanted words that show up in topic models
p = re.compile(r"(\(\)|''|``|\"|null|ul|li|ol|^\.|^:|^/|\\|--|cooki|'s|corpor|busi|inc\.|ltd|co\.|compan|keyboard|product|technolog)", flags=re.IGNORECASE)

def encode_item(text):
    '''
    Remove characters with encoding problems 
    '''
    clean = ""
    for item in text.split(" "):
        try:
            clean += item.encode('ascii','ignore').decode('utf-8') + " "
        except: 
            pass
    return clean.rstrip()

def remove_dups(text):
    '''
    Deal with endcoding and return stemmed/tokenized text
    '''
    seen = {}
    running_text = ''
    for ac in text:
        if ac not in seen:
            seen[ac] = 1
            running_text = running_text + ' ' + ac
        
    return running_text

In [6]:
def process_doc_list(dict_of_docs):
    '''
    input: lists of documents (which are individually lists of paragraphs)
    output: returns a test/train split of paragraphs and stemmed documents
    '''
    combined_text = []
    dict_of_processed_docs = {}
    print ("Working on " + str(len(dict_of_docs.keys())) + " firm documents")

    for key in dict_of_docs:
        document = dict_of_docs[key]
        # print ("Working on doc " + key)

        joined = remove_dups(document)
        
        dict_of_processed_docs[key] = encode_item (joined)
    
    return dict_of_processed_docs

In [7]:
all_docs_dict = process_doc_list (documents_dict)
pp.pprint (all_docs_dict['PolyOne.txt'])

Working on 1142 firm documents
(' EN English  p  +1-866-737-2066 COMPANY | INVESTORS CAREERS GLOBAL US Poly '
 'One PRODUCTS DISTRIBUTION INKS WILFLEX NON-INKS Wilflex Epic Non-Phthalate '
 'Plastisol Inks Wilflex Oasis Water Based Inks Wilflex One Non-Plastisol Inks '
 'Wilflex Originals Wilflex Equipment Solutions Wilflex Compliance Information '
 'ORIGINALS SOLUTIONS INFORMATION SOFTWARE TIPS DISTRIBUTORS ZODIAC AQUARIUS '
 'AQUARIUS INK Aquarius Discharge Ink Aquarius High Mesh Aquarius High Solid '
 'Ink Aquarius Soft Base Ink Aquarius Special Effect Compliance Screen '
 'Printing Inks Distributors AQUARIUS MESH AQUARIUS EFFECT COMPLIANCE '
 'COMPOSITES FIBER-COMPOSITES Short Fiber-Reinforced Thermoplastic Composites '
 'reSound Natural Fiber Reinforced Thermoplastic Composites OnForce Long Fiber '
 'Reinforced Composites TECHNOLOGIES Pultrusion Technologies Continuous '
 'Filament Winding Panels GlasArmor Ballistic Resistant Panels Markers and '
 'Delineators Pullwinding Technolo

In [24]:
# iterate through files and count regex 
patent_pattern = re.compile(r'patent', re.IGNORECASE)
rd_pattern = re.compile(r'lab |laboratory|research|development|R&D|researcher|scientist', re.IGNORECASE)
product_pattern = re.compile(r'buy|order|sale|sell|brand|model|version|catalog|product', re.IGNORECASE)
trial_pattern = re.compile(r'experimental|experimentation|expirement|exploratory|pilot|prelim|preliminary|provisional|tentative|test|testing|trial|in-process', re.IGNORECASE)
demo_pattern = re.compile(r'demo|affirmation|confirmation|exhibition|exposition|expression|illustration|presentation|proof|showing|substantiation|test|validation', re.IGNORECASE)
manufacturing_pattern = re.compile(r'manufacture|manufacturing|production|producing|produce|assemble|assembly|fabricate|prefabricate|machine|mold|prefab|process|construct|weld|engineer', re.IGNORECASE)
investment_pattern = re.compile(r'investor|invest|angel|investment firm|private equity|bank|adventure capitalist|blind pool|bridge loan|buyout|corporate venture capital|corporate venturing|deal flow|debt financing|direct financing|drive-by deal|due diligence|equity financing|financier|full ratchet|fund of funds|institutional investors|IRR|lead investor|leveraged buy-out|LBO|liquidity event|lock-up period|management buy-in|management buy-out|master limited partnership|mezzanine debt|mezzanine financing|mezzanine level|owner-employee|pitch|portfolio company|private equity|private placement|raising capital|recapitalization|resyndication|risk capital|investment company|secondary purchase|silent partner|startup|syndication|term sheet|turnaround|underwriter|vulture capitalist', re.IGNORECASE)
university_pattern = re.compile(r'university|college|institute|academy', re.IGNORECASE)
partnership_pattern = re.compile(r'partner|stakeholder|distributor|collaboration|collaborator|alliance|joint venture|agreement|supplier|parts manufacturer', re.IGNORECASE)
greenness_pattern = re.compile(r'green|sustainable|sustainability|eco-friendly|environmentally friendly|envonmentally conscious|renewable|renew|clean air|clean coal|clean energy|clean fuel|clean technology|cleantech|climate change|global warming|rising temperature', re.IGNORECASE)
customization_pattern = re.compile(r'custom |customize|customization|made to order|made-to-order|bespoke|tailor', re.IGNORECASE)
awards_pattern = re.compile(r'award|prize|ceritifcation|certify|certified|grant|sbir|sbtt', re.IGNORECASE)
membership_pattern = re.compile(r'member|membership|affiliation|association|club|participation', re.IGNORECASE)
customer_pattern = re.compile(r'customer|consumer|end user|purchaser|client|clientele|customer-base|buyer', re.IGNORECASE)

search_str = "My 11 Char String laboratory research R&D work scientist patents product trial demo manufacture venture capital investment university partner climate change customization award membership award customer"

pp_res = re.findall(patent_pattern, search_str)
print(len(pp_res))
rdp_res = re.findall(rd_pattern, search_str)
print(len(rdp_res))
prod_res = re.findall(product_pattern, search_str)
print(len(prod_res))
trial_res = re.findall(trial_pattern, search_str)
print(len(trial_res))
demo_res = re.findall(demo_pattern, search_str)
print(len(demo_res))
manu_res = re.findall(manufacturing_pattern, search_str)
print(len(manu_res))
venture_res = re.findall(venture_pattern, search_str)
print(len(venture_res))
inv_res = re.findall(investment_pattern, search_str)
print(len(inv_res))
uni_res = re.findall(university_pattern, search_str)
print(len(uni_res))
partner_res = re.findall(partnership_pattern, search_str)
print(len(partner_res))
green_res = re.findall(greenness_pattern, search_str)
print(len(green_res))
customization_res = re.findall(customization_pattern, search_str)
print(len(customization_res))
awards_res = re.findall(awards_pattern, search_str)
print(len(awards_res))
member_res = re.findall(membership_pattern, search_str)
print(len(member_res))
customer_res = re.findall(customer_pattern, search_str)
print(len(customer_res))

1
4
1
1
1
1
1
1
1
1
1
1
2
1
1


In [11]:
res_df = pd.DataFrame (all_docs_dict.keys(),index=(range(len(all_docs_dict.keys()))), columns=["Name"])
res_df['patent_pattern'] = 0
res_df['rd_pattern'] = 0
res_df['product_pattern'] = 0
res_df['trial_pattern'] = 0
res_df['demo_pattern'] = 0
res_df['manufacturing_pattern'] = 0
res_df['venture_pattern'] = 0
res_df['investment_pattern'] = 0
res_df['university_pattern'] = 0
res_df['partnership_pattern'] = 0
res_df['greenness_pattern'] = 0
res_df['customization_pattern'] = 0
res_df['awards_pattern'] = 0
res_df['membership_pattern'] = 0
res_df['customer_pattern'] = 0

display (res_df.head())

Unnamed: 0,Name,patent_pattern,rd_pattern,product_pattern,trial_pattern,demo_pattern,manufacturing_pattern,venture_pattern,investment_pattern,university_pattern,partnership_pattern,greenness_pattern,customization_pattern,awards_pattern,membership_pattern,customer_pattern
0,Chevron USA.txt,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Roche Diagnostics.txt,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Smart Planet Technologies.txt,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,SRG Global.txt,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,PolyOne.txt,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
print (res_df.shape[0])
print (res_df.iloc[0]['Name'])

1142
Chevron USA.txt


In [13]:
def count_regex ():
    regex_matches = {}
    for i in range(0, res_df.shape[0]):
        key = (res_df.iloc[i]['Name'])
        print ("Working on doc " + key)
        document = all_docs_dict[key]

        pp_res = re.findall(patent_pattern, document)
        # print(len(pp_res)) # len = 4 
        rdp_res = re.findall(rd_pattern, document)
        # print(len(rdp_res)) # len = 4 
        prod_res = re.findall(product_pattern, document)
        # print(len(prod_res)) # len = 4 
        trial_res = re.findall(trial_pattern, document)
        # print(len(trial_res)) # len = 4 
        demo_res = re.findall(demo_pattern, document)
        # print(len(demo_res)) # len = 4 
        manu_res = re.findall(manufacturing_pattern, document)
        # print(len(manu_res)) # len = 4 
        venture_res = re.findall(venture_pattern, document)
        # print(len(venture_res)) # len = 4 
        inv_res = re.findall(investment_pattern, document)
        # print(len(inv_res)) # len = 4 
        uni_res = re.findall(university_pattern, document)
        # print(len(uni_res)) # len = 4 
        partner_res = re.findall(partnership_pattern, document)
        # print(len(partner_res)) # len = 4 
        green_res = re.findall(greenness_pattern, document)
        # print(len(green_res)) # len = 4 
        customization_res = re.findall(customization_pattern, document)
        # print(len(customization_res)) # len = 4 
        awards_res = re.findall(awards_pattern, document)
        # print(len(awards_res)) # len = 4 
        member_res = re.findall(membership_pattern, document)
        # print(len(member_res)) # len = 4 
        customer_res = re.findall(customer_pattern, document)
        # print(len(customer_res)) # len = 4 

        res_df.set_value (i, 'patent_pattern', len(pp_res))
        res_df.set_value (i, 'rd_pattern', len(rdp_res))
        res_df.set_value (i, 'product_pattern', len(prod_res))
        res_df.set_value (i, 'trial_pattern', len(trial_res))
        res_df.set_value (i, 'demo_pattern', len(demo_res))
        res_df.set_value (i, 'manufacturing_pattern', len(manu_res))
        res_df.set_value (i, 'venture_pattern', len(venture_res))
        res_df.set_value (i, 'investment_pattern', len(inv_res))
        res_df.set_value (i, 'university_pattern', len(uni_res))
        res_df.set_value (i, 'partnership_pattern', len(partner_res))
        res_df.set_value (i, 'greenness_pattern', len(green_res))
        res_df.set_value (i, 'customization_pattern', len(customization_res))
        res_df.set_value (i, 'awards_pattern', len(awards_res))
        res_df.set_value (i, 'membership_pattern', len(member_res))
        res_df.set_value (i, 'customer_pattern', len(customer_res))
    

count_regex ()

Working on doc Chevron USA.txt
Working on doc Roche Diagnostics.txt
Working on doc Smart Planet Technologies.txt
Working on doc SRG Global.txt




Working on doc PolyOne.txt
Working on doc Dana.txt
Working on doc Wenger.txt
Working on doc BAXALTA.txt
Working on doc NGK Spark Plug Co.txt
Working on doc Silicor Materials.txt
Working on doc IMEC.txt
Working on doc Ivoclar Vivadent AG.txt
Working on doc Tersus Environmental.txt
Working on doc Honeywell.txt
Working on doc Solar Junction.txt
Working on doc mVerify.txt
Working on doc Raytheon Company.txt
Working on doc Ricoh Company Limited.txt
Working on doc SUMITOMO WIRING SYSTEMS.txt
Working on doc Nova Technologies.txt
Working on doc Winecom.txt
Working on doc Inphenix.txt
Working on doc WAFERTECH.txt
Working on doc Genzyme.txt
Working on doc OFS Fitel.txt
Working on doc Dresser-Rand Company.txt
Working on doc HARMAN INDUSTRIES.txt
Working on doc Alexion Pharmaceuticals.txt
Working on doc Rolls-Royce PLC.txt
Working on doc OBI.txt
Working on doc Sundrop Fuels.txt
Working on doc NEWFIELD THERAPEUTICS.txt
Working on doc J E WHITE.txt
Working on doc Rapamycin Holdings.txt
Working on do

Working on doc The Goodyear Tire & Rubber Company.txt
Working on doc Carver Scientific.txt
Working on doc Mitsubishi Electric.txt
Working on doc GOLBA.txt
Working on doc Novozymes A|S.txt
Working on doc Kinestral Technologies.txt
Working on doc Basell Polyolefine.txt
Working on doc Finisar.txt
Working on doc Amprius.txt
Working on doc Altivera.txt
Working on doc Cleanvantage.txt
Working on doc Babcock Power Services.txt
Working on doc FUJIFILM.txt
Working on doc Lake Lite.txt
Working on doc Aurrion.txt
Working on doc Adtran.txt
Working on doc KR Design House.txt
Working on doc Litron Laboratories Limited.txt
Working on doc FLIR Systems.txt
Working on doc Eastman Chemical Company.txt
Working on doc Johnson Matthey PLC.txt
Working on doc POET Research.txt
Working on doc GE-Hitachi Nuclear Energy Americas.txt
Working on doc TMC.txt
Working on doc Boehringer Ingelheim Vetmedica.txt
Working on doc SixPoint Materials.txt
Working on doc Quantum Materials.txt
Working on doc Tufts Medical Cente

Working on doc Wizard Labs.txt
Working on doc Weatherford Canada Partnership.txt
Working on doc Amberwave.txt
Working on doc St Jude Medical.txt
Working on doc Applied Membrane Technologies.txt
Working on doc UT-Battelle.txt
Working on doc LG Display Co.txt
Working on doc HIQ SOLAR.txt
Working on doc S&S X-Ray Products.txt
Working on doc Agrivida.txt
Working on doc Kimberly-Clark Worldwide.txt
Working on doc Synaptic Research.txt
Working on doc Arcturus Therapeutics.txt
Working on doc Harris.txt
Working on doc Ormat Technologies.txt
Working on doc Proton Power.txt
Working on doc GlassPoint Solar.txt
Working on doc Ortho-Clinical Diagnostics.txt
Working on doc EMD Technologies.txt
Working on doc Northrop Grumman Systems.txt
Working on doc Advanced Water Technology.txt
Working on doc ECOSYNTHETIX.txt
Working on doc SEaB Energy Holdings.txt
Working on doc McElroy Manufacturing.txt
Working on doc Roche Molecular Systems.txt
Working on doc Danisco US.txt
Working on doc One Earth Designs.txt

Working on doc The Samuel Roberts Noble Foundation.txt
Working on doc Coloplast A|S.txt
Working on doc Neural Signals.txt
Working on doc Ford Global Technologies.txt
Working on doc Forest Concepts.txt
Working on doc Formula Plastics.txt
Working on doc CLEARSIGN COMBUSTION.txt
Working on doc TP Solar.txt
Working on doc IDEX Health & Science.txt
Working on doc Soraa.txt
Working on doc MAHLE.txt
Working on doc Siluria Technologies.txt
Working on doc Roll-N-Lock.txt
Working on doc Pacific Biosciences of California.txt
Working on doc Unistrut Corp.txt
Working on doc SAMSUNG DISPLAY CO.txt
Working on doc Canon Kabushiki Kaisha.txt
Working on doc Accuri Cytometers.txt
Working on doc The Henry M Jackson Foundation for the Advancement of Military Medicine.txt
Working on doc Grandis.txt
Working on doc Glucan Biorenewables.txt
Working on doc Northwest Biotherapeutics.txt
Working on doc Total Marketing Services.txt
Working on doc Big Belly Solar.txt
Working on doc NanoOncology.txt
Working on doc P

Working on doc American Science and Technology.txt
Working on doc Agena Bioscience.txt
Working on doc Gilead Connecticut.txt
Working on doc Mattson Technology.txt
Working on doc Quantapore.txt
Working on doc Alnylam Pharmaceuticals.txt
Working on doc Universal Leaf Tobacco Co.txt
Working on doc Renesas Electronics.txt
Working on doc Oculus VR.txt
Working on doc Sequenom.txt
Working on doc AGC Flat Glass North America.txt
Working on doc Pendar Technologies.txt
Working on doc Xintec.txt
Working on doc AbbVie.txt
Working on doc Daylight Solutions.txt
Working on doc CellPrint IP Holding.txt
Working on doc Gram Power.txt
Working on doc Osram Sylvania.txt
Working on doc Columbia Insurance Company.txt
Working on doc Kinetech Power Company.txt
Working on doc Chemtreat.txt
Working on doc Johns Manville.txt
Working on doc Shell Oil Company.txt
Working on doc Landauer.txt
Working on doc Transilwrap Company.txt
Working on doc Poly-Med.txt
Working on doc Pfizer.txt
Working on doc JFE STEEL.txt
Work

Working on doc BASF Plant Science.txt
Working on doc HOWARD INDUSTRIES.txt
Working on doc Evernote.txt
Working on doc Professional Compounding Centers of America.txt
Working on doc Alliance for Sustainable Energy.txt
Working on doc Gracenote.txt
Working on doc Nordic Technologies.txt
Working on doc Hunt Energy Enterprises.txt


In [14]:
res_df['Name'] = res_df['Name'].map(lambda x: x.replace('.txt', ''))

In [18]:
res_df.to_csv (out_file, index=False)
res_df.head(10)

Unnamed: 0,Name,patent_pattern,rd_pattern,product_pattern,trial_pattern,demo_pattern,manufacturing_pattern,venture_pattern,investment_pattern,university_pattern,partnership_pattern,greenness_pattern,customization_pattern,awards_pattern,membership_pattern,customer_pattern
0,Chevron USA,0,59,30,15,24,171,30,54,15,141,51,11,20,39,11
1,Roche Diagnostics,0,1,7,28,29,4,1,0,0,0,0,4,0,2,2
2,Smart Planet Technologies,1,1,1,0,0,11,0,0,0,0,2,0,0,0,0
3,SRG Global,0,9,4,11,6,32,0,0,0,7,2,5,1,0,8
4,PolyOne,1,23,57,14,20,130,15,15,0,43,28,48,6,17,57
5,Dana,3,2,34,9,5,63,3,6,2,27,5,14,15,0,7
6,Wenger,0,0,0,0,0,0,0,0,1,0,0,5,0,1,5
7,BAXALTA,1,24,48,13,11,12,8,11,2,51,12,1,5,6,0
8,NGK Spark Plug Co,0,0,3,0,0,6,0,0,0,4,0,2,0,0,2
9,Silicor Materials,0,11,2,4,2,74,3,20,15,13,13,2,0,1,4
