# Defining variables

In [5]:
## Configuration: input directory, output directory and the entity type to report on.

# Root directory that is walked (recursively) for .doc, .docx, .txt and .pdf files.
my_directory = "/directory/of/file/collection/"
# Directory the notebook changes into before writing its CSV output files.
output_loc = "/directory/for/output/csvs/"
# Entity label used to select (text, label) pairs from everything spaCy extracted.
ent_type = "PERSON"

### ent_type can be any spaCy named-entity label: "PERSON", "NORP", "ORG", "GPE", etc.
### Full list: https://spacy.io/api/annotation#named-entities

# Imports and setup

In [2]:
import spacy
import PyPDF2
import os
import csv
import random
import nltk
import re
import string
import docx2txt
import docx
import codecs
import subprocess
from collections import Counter

# Load the spaCy model registered under the 'en' shortcut once; every
# extraction cell below calls nlp(text) on the text it reads.
nlp = spacy.load('en')

# Make the input collection the current working directory.
os.chdir(my_directory)

In [6]:
## Run if needed: fetches NLTK's 'punkt' package (reports "already up-to-date" when it is installed).
## NOTE(review): no visible cell uses nltk after this download -- confirm it is still required.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/erhiggs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Walking directory and compiling text files

In [7]:
# Walk my_directory ONCE and sort every supported file into a per-format list.
# Each list holds full paths (root joined with the file name).
onlydoc = []
onlydocx = []
onlytxt = []
onlypdf = []

# (extension, list that collects files with that extension)
# ".docx" does not end with ".doc", so a file name matches at most one entry.
buckets_by_extension = (
    (".doc", onlydoc),
    (".docx", onlydocx),
    (".txt", onlytxt),
    (".pdf", onlypdf),
)

for root, dirs, files in os.walk(my_directory):
    for fname in files:
        # Compare in lower case so "REPORT.PDF" / "Letter.DOCX" are not silently skipped.
        lowered = fname.lower()
        for extension, bucket in buckets_by_extension:
            if lowered.endswith(extension):
                bucket.append(os.path.join(root, fname))
                break

# Grouped by format in the order .doc, .docx, .txt, .pdf.
allfiles = onlydoc + onlydocx + onlytxt + onlypdf

# Single pre-formatted argument: prints the same text under Python 2 and Python 3.
print('files total: %d' % len(allfiles))
print('doc: %d' % len(onlydoc))
print('docx: %d' % len(onlydocx))
print('pdf: %d' % len(onlypdf))
print('txt: %d' % len(onlytxt))

# Extract text and compile entities

In [10]:
# Accumulators for (entity text, entity label) tuples. Each extraction cell
# below appends to its own per-format list and to all_ent.
# NOTE: the extraction cells only append, so re-run this cell before re-running
# any of them; otherwise the same entities are added a second time.
all_ent = []
pdf_ent = []
doc_ent = []
docx_ent = []
txt_ent = []

## the separate per-format lists aren't strictly necessary, but they are kept
## so each extraction cell can report its own count

In [None]:
##ents from pdf
## Read every PDF page by page, run spaCy on the page text and collect a
## (text, label) pair for each entity found.

# (filename, error) for every PDF that raised while being read or parsed.
pdf_failed = []

for filename in onlypdf:
    try:
        # 'with' closes the file handle even if PyPDF2 or spaCy raises.
        with open(filename, 'rb') as pdfFileObj:
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            for i in range(pdfReader.numPages):
                pageObj = pdfReader.getPage(i)
                # Newlines are replaced with spaces before the text is given to spaCy.
                pagetext = pageObj.extractText().replace('\n', ' ')
                pdfdoc = nlp(pagetext)
                for ent in pdfdoc.ents:
                    entpair = (ent.text, ent.label_)
                    pdf_ent.append(entpair)
                    all_ent.append(entpair)
    except Exception as exc:
        # Best effort: one unreadable PDF must not stop the run, but the
        # failure is recorded instead of being silently discarded. Entities
        # from pages read before the error are kept. Catching Exception
        # (not a bare except) lets Ctrl-C / kernel interrupt still work.
        pdf_failed.append((filename, repr(exc)))

# Report the PDF count (pdf_ent), not the .doc count.
print('entities from .pdf: %d' % len(pdf_ent))
print('.pdf files with errors: %d' % len(pdf_failed))


In [23]:
##ents from doc
## Convert each legacy .doc file to text with the external `antiword` command,
## run spaCy on the result and collect a (text, label) pair per entity.

# (filename, error) for every .doc that antiword or spaCy could not handle.
doc_failed = []

for filename in onlydoc:
    failure = None
    try:
        proc = subprocess.Popen(['antiword', filename], stdout=subprocess.PIPE)
        # stderr is not piped, so the second element of the result is always None.
        output, _ = proc.communicate()
        if proc.returncode != 0:
            # Remember the failed conversion, but still process whatever
            # antiword wrote to stdout.
            failure = 'antiword exited with status %d' % proc.returncode
        # bytes -> text; equivalent to unicode(output, "utf-8") on Python 2.
        uniout = output.decode("utf-8")
        doc = nlp(uniout)
        for ent in doc.ents:
            entpair = (ent.text, ent.label_)
            doc_ent.append(entpair)
            all_ent.append(entpair)
    except Exception as exc:
        # Best effort: skip this file but record why (e.g. antiword missing,
        # undecodable output). Exception, not a bare except, so that a
        # keyboard/kernel interrupt still stops the loop.
        failure = repr(exc)
    if failure is not None:
        doc_failed.append((filename, failure))

print('entities from .doc: %d' % len(doc_ent))
print('.doc files with errors: %d' % len(doc_failed))


7575666


In [25]:
##ents from docx
## Extract the text of each .docx file with docx2txt, run spaCy on it and
## collect a (text, label) pair per entity.

# (filename, error) for every .docx that could not be read or parsed.
docx_failed = []

for filename in onlydocx:
    try:
        pagetext = docx2txt.process(filename)
        docxdoc = nlp(pagetext)
        for ent in docxdoc.ents:
            entpair = (ent.text, ent.label_)
            docx_ent.append(entpair)
            all_ent.append(entpair)
    except Exception as exc:
        # Best effort: keep going, but record the failure instead of hiding it.
        # Exception (not a bare except) leaves keyboard/kernel interrupts alone.
        docx_failed.append((filename, repr(exc)))

print('entities from .docx: %d' % len(docx_ent))
print('.docx files with errors: %d' % len(docx_failed))

18718


In [11]:
##ents from txt
## Read each .txt file as UTF-8, run spaCy on its text and collect a
## (text, label) pair per entity.

# (filename, error) for every .txt that could not be read or parsed
# (for example, a file that is not valid UTF-8).
txt_failed = []

for filename in onlytxt:
    try:
        with codecs.open(filename, 'r', encoding='utf-8') as myfile:
            # Newlines are replaced with spaces before the text is given to spaCy.
            pagetext = myfile.read().replace('\n', ' ')
        # The file is already closed here; it does not stay open during the NLP step.
        txtdoc = nlp(pagetext)
        for ent in txtdoc.ents:
            entpair = (ent.text, ent.label_)
            txt_ent.append(entpair)
            all_ent.append(entpair)
    except Exception as exc:
        # Best effort: keep going, but record the failure instead of hiding it.
        # Exception (not a bare except) leaves keyboard/kernel interrupts alone.
        txt_failed.append((filename, repr(exc)))

print('entities from .txt: %d' % len(txt_ent))
print('.txt files with errors: %d' % len(txt_failed))

22473674


In [12]:
# Total number of (text, label) pairs collected across all formats.
# Single pre-formatted argument: prints the same text under Python 2 and Python 3.
print('all entities: %d' % len(all_ent))

all entities: 22473674


# Specify entity type

In [13]:
# Keep only the pairs whose label (second element) is exactly ent_type.
# The label is compared directly: a tuple-membership test would also match
# an entity of another type whose *text* happens to equal ent_type.
entlist = [x for x in all_ent if x[1] == ent_type]
print(len(entlist))

4833073


# Clean output

In [14]:
## Limit the list to entities whose text starts and ends with an alphabetic
## character (isalpha: letters only -- digits do not count). This removes
## entries with leading/trailing whitespace, punctuation or hyphenation.
## The leading `x[0] and` skips an empty entity text, which would otherwise
## raise IndexError on x[0][0].
filter_entlist = [x for x in entlist if x[0] and x[0][0].isalpha() and x[0][-1].isalpha()]

print(len(filter_entlist))

4060347


In [15]:
## Sample (up to 20) of the distinct entities the start/end-alphabetic filter
## discarded -- check this output for data loss. Set order is arbitrary, so the
## sample is not the "first" 20 in any meaningful sense.

candidates = set(entlist)
kept = set(filter_entlist)
discarded = list(candidates - kept)
print(discarded[0:20])

[(u'II l\\_\\  ', u'PERSON'), (u't]\\', u'PERSON'), (u"l'lll\u2018ll\\lllll\xa3", u'PERSON'), (u'                   McGrane', u'PERSON'), (u'the Graham Pa-', u'PERSON'), (u'RETURN  ', u'PERSON'), (u"Ourel netgl'JJUf'", u'PERSON'), (u'Wlll d0', u'PERSON'), (u' Knowland', u'PERSON'), (u'\xa2\u20ac\u20ac4\u20ac\u20ac\u20ac\u20ac\u20ac\u20ac\u20ac\u20ac\u20ac\u20ac\u20ac\u20ac\u20ac\u20ac\u20ac\u20ac\u20ac\u20ac\u20ac\u20ac\u20ac        VVVVVVVVVVVVVVVVVVVVVVV     ', u'PERSON'), (u'Hal C. Hart-', u'PERSON'), (u'\\\\L\u2018L\u2018lsetltlst', u'PERSON'), (u'                                                                                                                                                                                                                                                                                                                                                                                                                       ', u'PERSON'), (u'Izar L. H.  ', u'

In [16]:
## Filter again to keep only names with more than one word (text contains a space).
## filter_entlist already holds exactly the entries of entlist that start and
## end with a letter, so only the space test has to be applied here.

filter_entlist2 = [x for x in filter_entlist if ' ' in x[0]]
print(len(filter_entlist2))

1953791


# Return top entities

In [17]:
# Both CSV files are written with relative names, so switch to the output directory first.
os.chdir(output_loc)

# An entity is written out only if it occurs MORE than this many times.
min_count_exclusive = 5

header = ['Name', 'Frequency']


def _csv_text(text):
    """Return text in the form csv.writer can write on the running Python.

    Python 2's csv module needs byte strings, so unicode text is encoded as
    UTF-8; a native str is passed through unchanged (on Python 3 an encoded
    bytes object would be written as its b'...' repr).
    """
    return text if isinstance(text, str) else text.encode('utf-8')


def _frequency_rows(counted):
    """Turn ((text, label), count) items into [text, count] rows; the label is dropped."""
    return [[_csv_text(pair[0]), count] for pair, count in counted]


def _write_frequency_csv(out_path, rows):
    """Write the header line followed by rows to out_path as CSV."""
    with open(out_path, 'w') as fo:
        csv_writer = csv.writer(fo)
        csv_writer.writerow(header)
        csv_writer.writerows(rows)


# Count identical (text, label) pairs; most_common() yields them by descending count.
namecount = Counter(filter_entlist)
fullnamecount = Counter(filter_entlist2)
commonnames = [x for x in fullnamecount.most_common() if x[1] > min_count_exclusive]
commonall = [x for x in namecount.most_common() if x[1] > min_count_exclusive]

# Multi-word names only.
entities_table = _frequency_rows(commonnames)
_write_frequency_csv("entities_fullnames.csv", entities_table)

# All names, single-word ones included.
entities_table2 = _frequency_rows(commonall)
_write_frequency_csv("names_all.csv", entities_table2)
