In [6]:
import lucem_illud #pip install -U git+git://github.com/Computational-Content-Analysis-2018/lucem_illud.git

#All these packages need to be installed from pip
#For NLP
import nltk
import scipy #For divergences/distances
import seaborn as sns #makes our plots look nicer
import sklearn.manifold #For a manifold plot
from nltk.corpus import stopwords #For stopwords

import numpy as np #For arrays
import pandas #Gives us DataFrames
import matplotlib.pyplot as plt #For graphics

#Displays the graphs
import graphviz #You also need to install the command line graphviz

#These are from the standard library
import os.path
import zipfile
import subprocess
import io
import tempfile

#additionally...
import chardet

%matplotlib inline

In [2]:
# Set up the Stanford NLP components (parser, NER, POS tagger, core) used by the
# cells below. Per the log output, components that already exist under
# ../stanford-NLP are skipped rather than downloaded again.
lucem_illud.setupStanfordNLP()


Starting downloads, this will take 5-10 minutes
../stanford-NLP/parser already exists, skipping download
../stanford-NLP/ner already exists, skipping download
../stanford-NLP/postagger already exists, skipping download
../stanford-NLP/core already exists, skipping download
Done setting up the Stanford NLP collection


In [3]:
import lucem_illud.stanford as stanford

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordNERTagger, self).__init__(*args, **kwargs)
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordPOSTagger, self).__init__(*args, **kwargs)


In [4]:
# Location of the speeches CSV. The original notebook hard-coded a path inside
# one user's home directory; set the DOS_SPEECHES_CSV environment variable to
# point at your own copy. When it is unset, the original path is used unchanged.
DOSSpeeches = os.environ.get('DOS_SPEECHES_CSV', '/Users/Enya/Desktop/DOS_Speeches.csv')

# Decode explicitly as Latin-1 (the encoding chosen by the original author).
DOSSpeechesDF = pandas.read_csv(DOSSpeeches, encoding='Latin-1')
print(DOSSpeechesDF)

        date                                              title  \
0    12/1/16  Preventing the Exploitation of Information and...   
1   10/10/16  Keynote Address at the Singapore International...   
2    9/28/16  The Persistent Threat of North Korea and Devel...   
3    9/19/16  Statement Before the Presidential Commission o...   
4     6/3/16                                   TEDx Tysons Talk   
5    5/25/16  International Cybersecurity Strategy: Deterrin...   
6     3/1/16  Remarks by Attorney General Loretta E. Lynch a...   
7    2/26/16  Inter-American Committee Against Terrorism (CI...   
8    2/11/16  The New Face of Terrorism: Countering Violent ...   
9    7/29/15  Remarks for Panel Session "Development of Cybe...   
10    5/4/15  Department of Commerce Cybersecurity Trade Mis...   
11   2/24/15                      New Reward for Cyber Fugitive   
12   1/13/15  The North Korean Threat: Nuclear, Missiles and...   
13    3/4/14  As Prepared Remarks at Georgetown University I..

In [5]:
# Flat list of word tokens for each speech.
DOSSpeechesDF['tokenized-text'] = DOSSpeechesDF['text'].apply(nltk.word_tokenize)

# Each speech split into sentences, with every sentence word-tokenized.
DOSSpeechesDF['sentences'] = DOSSpeechesDF['text'].apply(
    lambda speech: list(map(nltk.word_tokenize, nltk.sent_tokenize(speech)))
)

# Stanford POS tags for every sentence: one list of (token, tag) pairs per sentence.
DOSSpeechesDF['POS-sents'] = DOSSpeechesDF['sentences'].apply(stanford.postTagger.tag_sents)

In [7]:
# Tally how often each token carries the POS tag in countTarget, then show the
# 25 most frequent tokens.
countTarget = 'NN'
targetCounts = {}
for taggedSpeech in DOSSpeechesDF['POS-sents']:
    for taggedSentence in taggedSpeech:
        for token, tag in taggedSentence:
            if tag == countTarget:
                targetCounts[token] = targetCounts.get(token, 0) + 1

# Negating the count sorts descending; ties keep first-seen order.
sortedTargets = sorted(targetCounts.items(), key = lambda pair: -pair[1])
sortedTargets[:25]

[('cyber', 265),
 ('cyberspace', 202),
 ('law', 145),
 ('security', 123),
 ('world', 84),
 ('policy', 83),
 ('information', 82),
 ('internet', 73),
 ('', 71),
 ('behavior', 69),
 ('space', 66),
 ('today', 64),
 ('cooperation', 63),
 ('state', 63),
 ('stability', 63),
 ('capacity', 56),
 ('technology', 52),
 ('Internet', 51),
 ('work', 49),
 ('consensus', 48),
 ('development', 48),
 ('time', 46),
 ('role', 45),
 ('building', 45),
 ('cybersecurity', 45)]

In [8]:
# Tally how often each token carries the POS tag in countTarget, then show the
# 25 most frequent tokens.
countTarget = 'VB'
targetCounts = {}
for taggedSpeech in DOSSpeechesDF['POS-sents']:
    for taggedSentence in taggedSpeech:
        for token, tag in taggedSentence:
            if tag == countTarget:
                targetCounts[token] = targetCounts.get(token, 0) + 1

# Negating the count sorts descending; ties keep first-seen order.
sortedTargets = sorted(targetCounts.items(), key = lambda pair: -pair[1])
sortedTargets[:25]

[('be', 166),
 ('do', 52),
 ('build', 49),
 ('promote', 46),
 ('continue', 45),
 ('have', 39),
 ('work', 34),
 ('take', 34),
 ('make', 34),
 ('ensure', 33),
 ('Thank', 30),
 ('address', 29),
 ('engage', 26),
 ('help', 22),
 ('strengthen', 20),
 ('thank', 19),
 ('protect', 18),
 ('bring', 18),
 ('achieve', 18),
 ('apply', 18),
 ('play', 17),
 ('develop', 17),
 ('reduce', 17),
 ('see', 16),
 ('counter', 15)]

In [9]:
# Collect every distinct token tagged NTarget that is immediately followed by
# Word (the following token is compared case-insensitively).
NTarget = 'JJ'
Word = 'cyberspace'
NResults = {
    prevToken
    for taggedSpeech in DOSSpeechesDF['POS-sents']
    for taggedSentence in taggedSpeech
    # Pair each tagged token with its successor; zip stops at the shorter slice.
    for (prevToken, prevTag), (nextToken, _) in zip(taggedSentence, taggedSentence[1:])
    if nextToken.lower() == Word and prevTag == NTarget
}

print(NResults)

{'mainstream', 'secure', 'peaceful', 'domestic', 'stable', 'military', 'key', 'reliable', 'international', 'accessible'}


In [10]:
# Collect every distinct token tagged NTarget that is immediately followed by
# Word (the following token is compared case-insensitively).
NTarget = 'JJ'
Word = 'cyber'
NResults = {
    prevToken
    for taggedSpeech in DOSSpeechesDF['POS-sents']
    for taggedSentence in taggedSpeech
    # Pair each tagged token with its successor; zip stops at the shorter slice.
    for (prevToken, prevTag), (nextToken, _) in zip(taggedSentence, taggedSentence[1:])
    if nextToken.lower() == Word and prevTag == NTarget
}

print(NResults)

{'global', 'large-scale', 'states/International', 'hostile', 'complicated', 'significant', 'new', 'fundamental', 'cross-cutting', 'first', 'joint', 'bilateral', 'reliable', 'malicious', 'regional', 'big', 'transnational', 'potential', 'future', 'voluntary', 'whole-of-government', 'serious', 'sub-regional', 'sophisticated', 'foreign', 'recent', 'major', 'offensive', 'particular', 'specific', 'coercive', 'specialized', 'practical', 'other', 'international'}


In [11]:
# Collect every distinct token tagged NTarget that is immediately followed by
# Word (the following token is compared case-insensitively).
NTarget = 'NN'
Word = 'security'
NResults = {
    prevToken
    for taggedSpeech in DOSSpeechesDF['POS-sents']
    for taggedSentence in taggedSpeech
    # Pair each tagged token with its successor; zip stops at the shorter slice.
    for (prevToken, prevTag), (nextToken, _) in zip(taggedSentence, taggedSentence[1:])
    if nextToken.lower() == Word and prevTag == NTarget
}

print(NResults)

{'document', 'aviation', 'chain', 'cyber', 'space', 'network', 'computer'}


In [12]:
# Collect every distinct token tagged NTarget that is immediately followed by
# Word (the following token is compared case-insensitively).
NTarget = 'JJ'
Word = 'security'
NResults = {
    prevToken
    for taggedSpeech in DOSSpeechesDF['POS-sents']
    for taggedSentence in taggedSpeech
    # Pair each tagged token with its successor; zip stops at the shorter slice.
    for (prevToken, prevTag), (nextToken, _) in zip(taggedSentence, taggedSentence[1:])
    if nextToken.lower() == Word and prevTag == NTarget
}

print(NResults)

{'economic', 'shared', 'knowledgeable', 'national', 'multinational', 'hemispheric', 'private-sector', 'new', 'trilateral', 'regional', 'cyber', 'international'}
