In [1]:
import spacy
from spacy.tokens import DocBin
nlp = spacy.load("en_core_web_sm")

In [2]:
with open('/Users/paigelee/Desktop/cambridge5/bytes_data.txt', "rb") as f:
    byte_string = f.read()
    
new_doc_bin = DocBin().from_bytes(byte_string)
docs = list(new_doc_bin.get_docs(nlp.vocab))

In [3]:
len(docs), docs[200]

(45822,
 He probably wished to desert but did not wish to give himself up to the allies.)

In [36]:
def valid_to_add(previous_token, token, ent_dict):
    if token.pos_ == 'PROPN':
        if 'PERSON' not in ent_dict[token.text]:
            return False
        else:
            return True
    if token.text.isupper():
        return True
    if token.text == 'I':
        return True
    if previous_token.text in ['Mr.', 'Mr', 'Mrs.', 'Sir']:
        return True 
    
def normalize_initials(PERSON):
    if len(PERSON) == 1 and PERSON.isalpha():
        return PERSON + '.'
    elif PERSON + '.' in proper_nouns.keys():
        return PERSON + '.'
    elif PERSON.endswith(' .'):
        return PERSON.replace(' .', '.')
    elif PERSON.startswith('. '):
        return PERSON.replace('. ', '')
    else:
        return PERSON
    
def invalid_name(PERSON):
    if PERSON[0].islower():
        return True
    if PERSON in ['.', ' ','-']:
        return True
    if PERSON.lower() == "'s":
        return True 
    
def make_entity_dict(docs):
    proper_nouns = dict()
    entity_types = []
    placeholder = nlp('')

    ent_dict = dict()

    for doc in docs:
        for token in doc:
            if token.pos_ == 'PROPN':
                if token.text in ent_dict.keys():
                    ent_dict[token.text].add(token.ent_type_)
                else:
                    ent_dict[token.text] = {token.ent_type_}

    for doc in docs:
        propn_found = []
        propn_seq = []
        previous_token = placeholder
        for token in doc:
            if valid_to_add(previous_token, token, ent_dict):
                propn_seq.append((token.text, token.pos_, token.ent_type_))
            else:
                if propn_seq != []:
                    propn_found.append(propn_seq)
                    propn_seq = []

            previous_token = token

            people = []
            for propn_entry in propn_found:
                if len(propn_entry) > 1:
                    names = []
                    for name, pos, ent in propn_entry:
                        names.append(name)
                    people.append(' '.join(names))

                else:
                    people.append(propn_entry[0][0])

        # add normalized name to dictionary
        for PERSON in people:
            if invalid_name(PERSON):
                break

            PERSON = normalize_initials(PERSON)

            if PERSON not in proper_nouns.keys():
                proper_nouns[PERSON] = [doc]
            else:
                proper_nouns[PERSON].append(doc)
    
    not_enough_occurrences = []
    for PERSON in proper_nouns.keys():
        if len(proper_nouns[PERSON]) < 5:
            not_enough_occurrences.append(PERSON)
    for PERSON in not_enough_occurrences:
        del proper_nouns[PERSON]

    print('# of proper nouns found:', len(proper_nouns))
    return proper_nouns

In [37]:
entity_dict = make_entity_dict(docs)

# of proper nouns found: 1216


In [42]:
lst = []
for ent in entity_dict.keys():
    lst.append((ent, len(entity_dict[ent]), entity_dict[ent]))
sorted_by_name_proper_nouns = sorted(lst)
for PERSON, NUM_ENTRIES, LST in sorted_by_name_proper_nouns:
    print(PERSON, NUM_ENTRIES)

0.C. 8
1.B. 6
18A. 6
18B. 7
A Jap B.J. 6
A. 642
A. A. 12
A.A. 29
A.D.N.I. 11
A.G. 15
A.L.O.s 6
A.M. 26
ANDREWS 5
ARNHEIM 10
AS 7
Abbott 16
Abt 28
Abwehr 275
Adam 66
Adams 7
Ahmed 6
Air 280
Air Attache 12
Air Line 11
Air Marshal 6
Airy 14
Alacrity 10
Alamein 7
Albert 6
Alcazar 6
Alec 5
Alec Kellar 5
Alexander 25
Alexander Maxwell 7
Aliens 17
Aliens Branch 10
Allan 18
Allen 6
Ally 6
America 193
American 55
American Embassy 13
Amt. 19
Anderson 26
Anna WOLKOFF 8
Anthony 28
Antony 14
April 166
Archer 108
Archie 9
Archie Boyle 19
Armies 10
Army 249
Army Group 39
Army Signals 5
Arnold 63
Arnold Foster 15
Arras 11
Art Thurston 5
Arthur 9
Asst 15
Ast 7
Aston 6
Attache 24
Attaches 5
Austrian 18
Axis 29
B. 138
B. Division 47
B. Meeting 7
B.1 11
B.1 Reg 6
B.1A 12
B.3. 13
B.4. 8
B.5 14
B.6 16
B.B.C. 8
B.Branch 17
B.J. 18
B.U.F. 5
BEDAUX 27
BEPPU 7
BERTRAND 6
BING 5
BIRKIGT 5
BLACKETT 5
BODDINGTON 5
BODE 7
BRODERSEN 12
BRYANS 6
BUNSEN 7
Bad Nenndorf 10
Badoglio 20
Ball 5
Balloon 21
Bamford 17
Barbar

In [46]:
lst = []
for ent in entity_dict.keys():
    lst.append((len(entity_dict[ent]), ent, entity_dict[ent]))
sorted_by_count_proper_nouns = sorted(lst, reverse = True)
for NUM_ENTRIES, PERSON, LST in sorted_by_count_proper_nouns[:50]:
    print(NUM_ENTRIES, PERSON)

7970 I.
1224 Germans
765 D.G.
642 A.
625 Germany
554 C.
514 Dick
373 H.O.
370 S.I.S.
353 London
324 Lisbon
301 Eire
286 France
282 French
280 Air
275 Abwehr
262 Committee
249 Army
247 Govt.
235 S.
230 Security
230 Berlin
221 TAR
218 F.O.
207 N.
200 Maxwell
193 Curry
193 America
189 Minister
184 Burt
181 LRC
176 J.I.C.
175 SHAEF
175 East
166 War
166 April
164 Duff
163 Embassy
161 P.M.
156 Stephenson
154 June
154 Felix
151 Paris
147 Hitler
143 M.I.5.
142 Palestine
139 Front
139 England
138 D.S.S.
138 B.


In [47]:
# check for members of spy ring

In [None]:
cambridge_5 = ['Kim Philby', '']