In [None]:
# pip install spacy

In [2]:
# import spacy and load English language model
import spacy
from spacy.tokens import DocBin
nlp = spacy.load("en_core_web_sm")

**Reading in bytes_data.txt**

bytes_data.txt is a binary file that contains a list of spacy docs

spacy docs are containers for accessing linguistic annotations (https://spacy.io/api/doc)

In [3]:
# read in the pre-processed file bytes_data.txt
    # this will take a second
with open('/Users/paigelee/Desktop/cambridge5/bytes_data.txt', "rb") as f:
    byte_string = f.read()
    
new_doc_bin = DocBin().from_bytes(byte_string)
# create docs, a list of spacy docs
docs = list(new_doc_bin.get_docs(nlp.vocab))

print('# of docs:', len(docs))
print('example doc:', docs[200])

# of docs 45822
example doc: He probably wished to desert but did not wish to give himself up to the allies.



**Extracting entities**

Helper functions for the make_entity_dict() function

In [90]:
# helper functions
def make_is_person_dict(docs):
    """makes dictionary where keys are proper nouns
    and values are bools of whether or not it is labeled PERSON by Spacy"""
    # intialize entity type dict
    is_person_dict = dict()
    # initialize all proper nouns to False
    for doc in docs:
        for token in doc:
            if token.pos_ == 'PROPN':
                is_person_dict[token.text] = False   
    # set True for entities labeled as PERSON by Spacy
    for doc in docs:
        for token in doc:
            if token.pos_ == 'PROPN' and token.ent_type_ == 'PERSON':
                is_person_dict[token.text] = True
    return is_person_dict

def valid_to_add(previous_token, token, is_person_dict):
    """helps determines whether a token is valid to add as an entity (or prefix to entity)"""
    # token is a proper noun and is labeled as a PERSON type entity at least once
    if token.pos_ == 'PROPN':
        if is_person_dict[token.text]:
            return True
    # token is of all uppercase form (like ARTIST or ZIGZAG)
    if token.text.isupper():
        return True
    # we count I (not a proper noun) as an entity because we want 
        # the connections between the author and the other people
    if token.text == 'I':
        return True
    # if the previous token is a person's title, the current token should be an entity
    if previous_token.text in ['Mr.', 'Mr', 'Mrs.', 'Sir']:
        return True 
    else:
        return False
    
def normalize_name(PERSON, entity_dict):
    """normalizes entity name"""
    # corrects D.G to D.G.
    if PERSON + '.' in entity_dict.keys():
        return PERSON + '.'
    # corrects A to A. (if A. is not already in the dictionary)    
    elif len(PERSON) == 1 and PERSON.isalpha():
        return PERSON + '.'
    # corrects C.S.S . to C.S.S.
    elif PERSON.endswith(' .'):
        return PERSON.replace(' .', '.')
    # corrects . U.S. to U.S.
    elif PERSON.startswith('. '):
        return PERSON.replace('. ', '')
    # else no change to entity name
    else:
        return PERSON
    
def invalid_name(PERSON):
    """gets rid of invalid type names"""
    # invalid name if starts with lowercase letter
    if PERSON[0].islower():
        return True
    # if the 'entity' captured is actually just a punctuation mark or space
    if PERSON in ['.', ' ','-']:
        return True
    # if the 'entity' cpatured is just a trailing "'s"
    if PERSON.lower() == "'s":
        return True 

**Making the entity dictionary**

Creating an entity dictionary where the keys are entities like Kim Philby, CHRUCHILL, D.G. 

and the values are lists of sentences (actually Spacy docs) that include the entity.

In [94]:
def make_entity_dict(docs, MIN_NUM):
    """makes a dictionary with entities as keys and 
    docs as values (that include the given entity)"""
    
    # initialize entity type dict
    is_person_dict = make_is_person_dict(docs)
    # initialize the entity dict (output dict)
    entity_dict = dict()
    
    for doc in docs:
        # sequences_found_in_sentence is a list of proper noun sequences found in a doc
            # [[(text, pos, ent_type),(),()], propn_seq, propn_seq, ...]
        sequences_found_in_sentence = []
        # prop_seq captures each sequence of proper nouns
            # for example, [('Ted', 'PROPN', 'PERSON'),('Johnson', 'PROPN', 'PERSON')]
            # often, len(prop_seq) == 1 because many proper nouns are just one token long
        propn_seq = []
        # set placeholder value for first iteration of previous_token per doc
        previous_token = nlp('')
        # loop over tokens in doc and check if they are valid to add to a proper noun sequence
        for token in doc:
            if valid_to_add(previous_token, token, is_person_dict):
                propn_seq.append((token.text, token.ent_type_))
            else:
                # once you hit a token that is NOT a proper noun, 
                    # add cumulative propn_seq to sequences_found_in_sentence
                if propn_seq != []:
                    sequences_found_in_sentence.append(propn_seq)
                    propn_seq = []
            # set token to use in next valid_to_add()
            previous_token = token

        people = []
        # create people, a list of names based on proper noun sequences in the sentence
        for propn_seq in sequences_found_in_sentence:
            # if the current proper noun sequence includes more than one token
                # construct a compound proper noun name
            if len(propn_seq) > 1:
                names = []
                for name, ent in propn_seq:
                    names.append(name)
                people.append(' '.join(names))
            # if there is only one proper noun in the sequence, add that 
            else:
                people.append(propn_seq[0][0])

        # add normalized name to dictionary
        for PERSON in people:
            if invalid_name(PERSON):
                break
            # normalize the name based on spelling rules
            PERSON = normalize_name(PERSON, entity_dict)
            # add name and doc to dictionary                
            if PERSON not in entity_dict.keys():
                entity_dict[PERSON] = [doc]
            else:
                entity_dict[PERSON].append(doc)
                
    # check if the PERSON occurs MIN_NUM or more times
    not_enough_occurrences = []
    for PERSON in entity_dict.keys():
        # create list of entities with too few occurrences
        if len(entity_dict[PERSON]) < MIN_NUM:
            not_enough_occurrences.append(PERSON)
    # delete entity from dictionary if it doesn't occur enough times
    for PERSON in not_enough_occurrences:
        del entity_dict[PERSON]
    print('entity dict succesfully created.\n')
    print('# of proper nouns found:', len(entity_dict))
    print('\nusage example:\n\tentity_dict["Philby"][0] =', entity_dict['Philby'][0])
    return entity_dict

In [95]:
# will probably take 15-30 seconds to run
entity_dict = make_entity_dict(docs, 5)

entity dict succesfully created.

# of proper nouns found: 1476

usage example:
	entity_dict["Philby"][0] = We have received a letter from Philby showing the action of the Portuguese authorities on the representations made by us about German in espionage in Portugal.



**Creating a list of entities sorted in alphabetical order**

Showing the first 20, but can be accessed by entities_sorted_by_name

In [96]:
lst = []
for ent in entity_dict.keys():
    lst.append((ent, len(entity_dict[ent]), entity_dict[ent]))
entities_sorted_by_name = sorted(lst)
for PERSON, NUM_ENTRIES, LST in entities_sorted_by_name[:20]:
    print(PERSON, NUM_ENTRIES)

0.C. 8
0.S.S. 5
1.B. 5
18B. 24
A Jap B.J. 5
A. 625
A. A. 12
A.A. 30
A.B. 7
A.C.E. 9
A.D.N.I. 11
A.G. 15
A.G.3 6
A.L.O. 11
A.L.O.s 6
A.M. 24
A.W.S. 5
ADNI 11
ALCAZAR 9
ALEXANDER 7


**Creating list of entities sorted in descending count order**

Showing the first 20, but can be accessed by entities_sorted_by_count

In [100]:
lst = []
for ent in entity_dict.keys():
    lst.append((len(entity_dict[ent]), ent, entity_dict[ent]))
entities_sorted_by_count = sorted(lst, reverse = True)
for NUM_ENTRIES, PERSON, LST in entities_sorted_by_count[:20]:
    print(NUM_ENTRIES, PERSON)

7946 I.
1222 Germans
765 D.G.
625 A.
624 Germany
545 C.
510 Dick
411 SIS
377 H.O.
363 S.I.S.
352 London
324 Lisbon
301 Eire
286 France
281 French
280 Air
271 Abwehr
246 Army
239 W.
239 Committee


In [47]:
# check for members of spy ring

In [None]:
cambridge_5 = ['Kim Philby', '']