In [23]:
import spacy
from collections import Counter
import networkx as nx
# Shared spaCy pipeline object; get_actor_rels() parses paragraphs through nlp.pipe().
# NOTE(review): 'en' is a shortcut model name -- presumably this only resolves on a
# spaCy installation where that shortcut link exists; confirm against the installed version.
nlp = spacy.load('en')

In [2]:
def read_paragraphs(fname):
    """Return the non-empty paragraphs of a text file.

    The file at ``fname`` is read in full and cut at every blank line
    (two consecutive newline characters). Zero-length pieces are dropped;
    all other pieces are returned unchanged, in file order, as a list of
    strings.
    """
    with open(fname, 'r') as handle:
        chunks = handle.read().split('\n\n')
    # filter(None, ...) discards exactly the empty strings produced by the split.
    return list(filter(None, chunks))

# Read each document's paragraphs from its own text file.
trump_par_texts = read_paragraphs('nss/trump_nss.txt')
obama_par_texts = read_paragraphs('nss/obama_nss.txt')
# Combined corpus: Trump paragraphs first, followed by Obama paragraphs.
par_texts = trump_par_texts + obama_par_texts
# k is the number of Trump paragraphs, i.e. the index in par_texts at which
# the Obama paragraphs start.
k = len(trump_par_texts)
# Cell output: (total, Trump, Obama) paragraph counts.
len(par_texts), len(trump_par_texts), len(obama_par_texts)

(550, 400, 150)

## Extract Subject-Verb-Object Triplets

In [38]:
def noun_verb_pairs(doc):
    """Collect one (subject, root, object) triple per ROOT token in ``doc``.

    For every token whose ``dep_`` is ``'ROOT'``, the triple holds the
    token's first ``'nsubj'`` child, the token itself, and its first
    ``'dobj'`` child. A missing subject or object is represented by ``None``.
    Triples are returned as a list in document order.
    """
    return [
        (child_dep(root, 'nsubj'), root, child_dep(root, 'dobj'))
        for root in doc
        if root.dep_ == 'ROOT'
    ]

def child_dep(tok, dep):
    """Return the first child of ``tok`` whose ``dep_`` equals ``dep``.

    Children are scanned in the order ``tok.children`` yields them; ``None``
    is returned when no child carries the requested dependency label.
    """
    return next((kid for kid in tok.children if kid.dep_ == dep), None)

def get_actor_rels(par_texts):
    """Turn paragraph texts into a flat list of labelled relation pairs.

    Each text is parsed with ``nlp.pipe`` and its (subject, verb, object)
    triples are taken from ``noun_verb_pairs``. Per triple, up to two pairs
    are emitted:

    * ``((subject_text, 'subj'), (verb_text, 'verb'))`` -- only when a subject
      exists and its ``ent_type_`` is a non-empty string;
    * ``((verb_text, 'verb'), (object_text, 'obj'))`` -- whenever an object
      exists.

    Pairs are returned in the order they are encountered.
    """
    relations = []
    for parsed in nlp.pipe(par_texts):
        for subject, root, direct_obj in noun_verb_pairs(parsed):
            verb_node = (root.text, 'verb')
            # Subjects are kept only if they carry an entity type.
            if subject is not None and subject.ent_type_ != '':
                relations.append(((subject.text, 'subj'), verb_node))
            # Objects are kept regardless of entity type.
            if direct_obj is not None:
                relations.append((verb_node, (direct_obj.text, 'obj')))
    return relations
# Build the relation list at module level; without this assignment the name
# `actor_rels` exists only as a local inside get_actor_rels() and the lines
# below raise NameError on a fresh kernel.
actor_rels = get_actor_rels(par_texts)
print(len(actor_rels))
actor_rels[:3]

1066


[(('America', 'subj'), ('is', 'verb')),
 (('America', 'subj'), ('is', 'verb')),
 (('puts', 'verb'), ('America', 'obj'))]

## Create Network From Relations
Here we use the networkx directed graph class (`nx.DiGraph`). We create a node for each subject, verb, and object, then remove nodes (along with their edges) whose count is below a minimum threshold (`min_node_ct`).

In [54]:
def make_actor_network(actor_rels, min_node_ct=5):
    """Build a directed graph from relation pairs and prune low-count nodes.

    Parameters
    ----------
    actor_rels : iterable of ((label, typ), (label, typ))
        Each pair becomes a directed edge from the first label to the second.
        Labels are the node keys, so the same label appearing with different
        ``typ`` values maps to a single node.
    min_node_ct : int, default 5
        Nodes whose ``ct`` attribute is below this value are removed, along
        with their edges.

    Returns
    -------
    nx.DiGraph
        Node attributes: ``typ`` (taken from the first pair that introduced
        the label) and ``ct`` (number of distinct edges the node is an
        endpoint of; a self-loop counts twice). Edge attribute: ``weight``
        (number of times that exact pair occurred in ``actor_rels``).
    """
    # create network according to actor relations
    G = nx.DiGraph()
    for fro, to in actor_rels:
        src, dst = fro[0], to[0]
        if src not in G:
            G.add_node(src, typ=fro[1], ct=0)
        if dst not in G:
            G.add_node(dst, typ=to[1], ct=0)
        if not G.has_edge(src, dst):
            # ct counts distinct edges, so bump it only the first time a pair is seen.
            G.nodes[dst]['ct'] += 1
            G.nodes[src]['ct'] += 1
            G.add_edge(src, dst, weight=0)
        G[src][dst]['weight'] += 1

    # remove nodes that don't meet minimum count threshold
    # (collected first so the graph is not mutated while iterating over it)
    rm_nodes = [node for node, ct in G.nodes(data='ct') if ct < min_node_ct]
    G.remove_nodes_from(rm_nodes)

    return G
# Build the graph at module level; `G` is otherwise only a local inside
# make_actor_network() and this cell raises NameError on a fresh kernel.
G = make_actor_network(actor_rels)
len(G.nodes()), len(G.edges())

(65, 97)

(0, 65)