In [3]:
# Count people, places, and organizations for comparison with patent data (in R)
# Sanjay K Arora
# October 2019

import pprint
import sys
import pprint
import csv
import pandas as pd
import re
import os
from collections import defaultdict 

In [30]:
from nltk.tag import StanfordNERTagger
import os

os.environ['CLASSPATH'] = "/Users/kg284kt/dev/libs/stanford-ner-2018-10-16/"
os.environ['STANFORD_MODELS'] = "/Users/kg284kt/dev/libs/stanford-ner-2018-10-16/classifiers"

st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP

from nltk import pos_tag
from nltk.chunk import conlltags2tree
from nltk.tree import Tree

In [4]:
# set this home data dir
BASE_DATA_DIR = "/Users/kg284kt/dev/EAGER/data/"

in_path = BASE_DATA_DIR + 'orgs/parsed_page_output/' # where in files are stored
out_file = BASE_DATA_DIR + 'analysis/measures/ner_counts_v2.csv'

files = os.listdir(in_path)
files = [item for item in files if item.endswith('.txt')]

pp = pprint.PrettyPrinter()

In [5]:
#generate pooled text and list of processed documents 
combined_text = []
documents_dict = {}
for i in range(len(files)):
    with open(in_path + files[i], 'r') as my_file:
        text = my_file.readlines() # this is each separate paragraph including headers etc.
        for k in ["\n", "\t", " "]: # remove the end of sentence tags
            text = [item.strip(k) for item in text]
        text = list(filter(lambda x : not x in [' ', '', '\t'],text)) #remove the paragraphs that are just blank space
        documents_dict[files[i]] = text #make a list of lists

In [6]:
pp.pprint (documents_dict['PolyOne.txt'][0:10])

['EN',
 'English',
 '简体中文',
 'pусский',
 '日本語',
 '+1-866-737-2066',
 '+1-866-737-2066',
 'COMPANY',
 '|',
 'INVESTORS']


In [7]:
# pattern regex to remove unwanted words that show up in topic models
p = re.compile(r"(\(\)|''|``|\"|null|ul|li|ol|^\.|^:|^/|\\|--|cooki|'s|corpor|busi|inc\.|ltd|co\.|compan|keyboard|product|technolog)", flags=re.IGNORECASE)

def encode_item(text):
    '''
    Remove characters with encoding problems 
    '''
    clean = ""
    for item in text.split(" "):
        try:
            clean += item.encode('ascii','ignore').decode('utf-8') + " "
        except: 
            pass
    return clean.rstrip()

def remove_dups(text):
    '''
    Deal with endcoding and return stemmed/tokenized text
    '''
    seen = {}
    running_text = ''
    for ac in text:
        if ac not in seen:
            seen[ac] = 1
            running_text = running_text + ' ' + ac
        
    return running_text

In [8]:
def process_doc_list(dict_of_docs):
    '''
    input: lists of documents (which are individually lists of paragraphs)
    output: returns a test/train split of paragraphs and stemmed documents
    '''
    combined_text = []
    dict_of_processed_docs = {}
    print ("Working on " + str(len(dict_of_docs.keys())) + " firm documents")

    for key in dict_of_docs:
        document = dict_of_docs[key]
        # print ("Working on doc " + key)

        joined = remove_dups(document)
        
        dict_of_processed_docs[key] = encode_item (joined)
    
    return dict_of_processed_docs

In [9]:
all_docs_dict = process_doc_list (documents_dict)
pp.pprint (all_docs_dict['PolyOne.txt'])

Working on 1142 firm documents
(' EN English  p  +1-866-737-2066 COMPANY | INVESTORS CAREERS GLOBAL US Poly '
 'One PRODUCTS DISTRIBUTION INKS WILFLEX NON-INKS Wilflex Epic Non-Phthalate '
 'Plastisol Inks Wilflex Oasis Water Based Inks Wilflex One Non-Plastisol Inks '
 'Wilflex Originals Wilflex Equipment Solutions Wilflex Compliance Information '
 'ORIGINALS SOLUTIONS INFORMATION SOFTWARE TIPS DISTRIBUTORS ZODIAC AQUARIUS '
 'AQUARIUS INK Aquarius Discharge Ink Aquarius High Mesh Aquarius High Solid '
 'Ink Aquarius Soft Base Ink Aquarius Special Effect Compliance Screen '
 'Printing Inks Distributors AQUARIUS MESH AQUARIUS EFFECT COMPLIANCE '
 'COMPOSITES FIBER-COMPOSITES Short Fiber-Reinforced Thermoplastic Composites '
 'reSound Natural Fiber Reinforced Thermoplastic Composites OnForce Long Fiber '
 'Reinforced Composites TECHNOLOGIES Pultrusion Technologies Continuous '
 'Filament Winding Panels GlasArmor Ballistic Resistant Panels Markers and '
 'Delineators Pullwinding Technolo

In [10]:
res_df = pd.DataFrame (all_docs_dict.keys(),index=(range(len(all_docs_dict.keys()))), columns=["Name"])
res_df['people'] = 0
res_df['places'] = 0
res_df['organizations'] = 0

display (res_df.head())

Unnamed: 0,Name,people,places,organizations
0,Chevron USA.txt,0,0,0
1,Roche Diagnostics.txt,0,0,0
2,Smart Planet Technologies.txt,0,0,0
3,SRG Global.txt,0,0,0
4,PolyOne.txt,0,0,0


In [16]:
print (res_df.shape[0])
print (res_df.iloc[0]['Name'])

1142
Chevron USA.txt


In [34]:
def stanfordNE2BIO(tagged_sent):
    bio_tagged_sent = []
    prev_tag = "O"
    for token, tag in tagged_sent:
        if tag == "O": #O
            bio_tagged_sent.append((token, tag))
            prev_tag = tag
            continue
        if tag != "O" and prev_tag == "O": # Begin NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag == tag: # Inside NE
            bio_tagged_sent.append((token, "I-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag != tag: # Adjacent NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag

    return bio_tagged_sent


def stanfordNE2tree(ne_tagged_sent):
    bio_tagged_sent = stanfordNE2BIO(ne_tagged_sent)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]

    sent_conlltags = [(token, pos, ne) for token, pos, ne in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)
    return ne_tree

def count_entities ():
    regex_matches = {}
    for i in range(0, res_df.shape[0]):
        key = (res_df.iloc[i]['Name'])
        print ("Working on doc " + key)
        document = all_docs_dict[key]
        res = st.tag(document.split())
        
        ne_tree = stanfordNE2tree(res)
        print(ne_tree)
        
#         person_lst = []
#         people_lst = []
#         org_lst = []
#         for (word, entity_type) in res:
#             if entity_type == 'PERSON':
#                 person_lst.append (word)
#         pp.pprint (person_lst)
#         break

#         res_df.set_value (i, 'people', len(people_cnt))
#         res_df.set_value (i, 'places', len(places_cnt))
#         res_df.set_value (i, 'organizations', len(orgs_cnt))
        
count_entities ()

Working on doc Chevron USA.txt
(S
  skip/NN
  to/TO
  main/JJ
  content/NN
  open/JJ
  project/NN
  portfolio/NN
  mobile/NN
  menu/NN
  back/RB
  to/TO
  main/JJ
  menu/NN
  project/NN
  portfolio/NN
  overview/NN
  gorgon/NN
  wheatstone/NN
  jackst./NN
  malo/NN
  tengiz/NN
  expansion/NN
  big/JJ
  foot/NN
  mafumeira/NN
  sul/VBD
  the/DT
  permian/JJ
  basin/NN
  alder/NN
  (LOCATION Angola/NNP)
  LNG/NNP
  operations/NNS
  operations/NNS
  overview/VBP
  exploration/NN
  &/CC
  production/NN
  refining/NN
  transportation/NN
  supply/NN
  &/CC
  trading/NN
  products/NNS
  &/CC
  services/NNS
  lubricants/VBZ
  power/NN
  chemicals/NNS
  &/CC
  additives/NNS
  technology/NN
  technology/NN
  overview/NN
  technology/NN
  and/CC
  crisis/NN
  emerging/VBG
  technologies/NNS
  innovation/NN
  runs/VBZ
  deep/JJ
  managing/VBG
  our/PRP$
  assets/NNS
  drilling/VBG
  enhanced/VBN
  oil/NN
  recovery/NN
  economics/NN
  of/IN
  innovation/NN
  corporate/JJ
  responsibility/NN
  corp

IndexError: string index out of range

In [14]:
res_df['Name'] = res_df['Name'].map(lambda x: x.replace('.txt', ''))

In [18]:
res_df.to_csv (out_file, index=False)
res_df.head(10)

Unnamed: 0,Name,patent_pattern,rd_pattern,product_pattern,trial_pattern,demo_pattern,manufacturing_pattern,venture_pattern,investment_pattern,university_pattern,partnership_pattern,greenness_pattern,customization_pattern,awards_pattern,membership_pattern,customer_pattern
0,Chevron USA,0,59,30,15,24,171,30,54,15,141,51,11,20,39,11
1,Roche Diagnostics,0,1,7,28,29,4,1,0,0,0,0,4,0,2,2
2,Smart Planet Technologies,1,1,1,0,0,11,0,0,0,0,2,0,0,0,0
3,SRG Global,0,9,4,11,6,32,0,0,0,7,2,5,1,0,8
4,PolyOne,1,23,57,14,20,130,15,15,0,43,28,48,6,17,57
5,Dana,3,2,34,9,5,63,3,6,2,27,5,14,15,0,7
6,Wenger,0,0,0,0,0,0,0,0,1,0,0,5,0,1,5
7,BAXALTA,1,24,48,13,11,12,8,11,2,51,12,1,5,6,0
8,NGK Spark Plug Co,0,0,3,0,0,6,0,0,0,4,0,2,0,0,2
9,Silicor Materials,0,11,2,4,2,74,3,20,15,13,13,2,0,1,4
