In [None]:
'''
The code below processes the 453 SEC55 Trump corpus.
The code is adapted from:
https://pynlp.wordpress.com/2014/02/03/7-corpus-processing/ 
http://www.nltk.org/book/ch01.html#sec-computing-with-language-simple-statistics
Natural Language Processing text by Byrd et al
'''

In [None]:
#in this example, all the DSIs are in plain text in a folder called "Donald_Trump"
#this will import the plain text reader from nltk to read all txt files in the folder

#from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

import nltk
from nltk.corpus import PlaintextCorpusReader
# Download the Punkt tokenizer models (presumably what the reader's sentence
# splitting relies on -- confirm against the installed NLTK version).
nltk.download('punkt')

# File-id regex handed to the reader: "<folder>/<file>.txt", where the folder
# name does not start with a dot.
DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'

# Category regex: only needed with CategorizedPlaintextCorpusReader, i.e. when
# there is more than one folder/category besides Donald_Trump.
#CAT_PATTERN =r'([\w_\s]+)/.*'

# The root is the parent folder that holds one sub-folder per corpus. In this
# example it only contains "Donald_Trump", so that is all the reader will find.
# The leading '../' makes the path relative to the parent of the notebook's
# working directory, so no absolute path has to be written out.
corpus = PlaintextCorpusReader(
    root='../Class Corpus/Corpus',
    fileids=DOC_PATTERN,
)

In [None]:
# Echo the reader object to confirm that it was created.
corpus

In [None]:
# Collect the ids (relative paths) of every file the reader matched and show them.
corpus_list = corpus.fileids()
corpus_list

# Alternative: print one id per line.
# for f in corpus.fileids():
#     print (f)

# Index into the list to pick out a single document id.
#corpus.fileids()[0]



In [None]:
# Show only the leading 21 characters of each file id.
[name[:21] for name in corpus.fileids()]

In [None]:
# Pull the raw text of one specific DSI from the corpus by its file id.
corpus.raw('Donald_Trump/AC_Doc1_Trump-Trade-S-Korea.txt')

In [None]:
# Slice a range of characters out of one document's raw text.
# Note: [1:20] returns characters 1 through 19, so the very first character (index 0) is skipped.
corpus.raw('Donald_Trump/PKC_Doc1_US-Canada-NAFTA.txt')[1:20]

In [None]:
# Loop through each document and print its first 10 words.
# The slice starts at 0 so the first token is included, and the loop variable
# is named `fileid` so it does not shadow the builtin `id`.
for fileid in corpus.fileids():
    print(corpus.words(fileid)[:10])

In [None]:
# paras(), sents() and words() are methods of the PlaintextCorpusReader created
# above; called without a file id they cover the whole corpus.
para_list, sents_list, word_list = corpus.paras(), corpus.sents(), corpus.words()

In [None]:
# Corpus-wide totals: (paragraphs, sentences, words).
tuple(len(seq) for seq in (para_list, sents_list, word_list))

In [None]:
# Paragraph, sentence and word views restricted to a single document.
para_list_s, sents_list_s, word_list_s = (
    view('Donald_Trump/AC_Doc1_Trump-Trade-S-Korea.txt')
    for view in (corpus.paras, corpus.sents, corpus.words)
)

In [None]:
# Totals for the single document: (paragraphs, sentences, words).
tuple(map(len, (para_list_s, sents_list_s, word_list_s)))

In [None]:
# Per-document summary statistics.
# Columns printed: total characters, total words, total sentences, vocabulary size,
# average word length, average words per sentence, average occurrences per
# vocabulary item, file id.
for fileid in corpus.fileids():
    doc_words = corpus.words(fileid)  # tokenise once and reuse for both counts
    num_chars = len(corpus.raw(fileid))
    num_words = len(doc_words)
    num_sents = len(corpus.sents(fileid))
    num_vocab = len({w.lower() for w in doc_words})

    # An empty document would otherwise raise ZeroDivisionError; report 0 instead.
    avg_word_len = num_chars // num_words if num_words else 0
    avg_sent_len = num_words // num_sents if num_sents else 0
    avg_vocab_use = num_words // num_vocab if num_vocab else 0

    print(num_chars, num_words, num_sents, num_vocab,
          avg_word_len, avg_sent_len, avg_vocab_use, fileid)

In [None]:
# Words from two documents read together. The original listed the same file id
# twice, which simply double-counted one document; the second id now points at
# a different DSI.
two_ids = corpus.words(fileids=['Donald_Trump/PKC_Doc1_US-Canada-NAFTA.txt',
                               'Donald_Trump/AC_Doc1_Trump-Trade-S-Korea.txt'])
two_ids, len(two_ids)

In [None]:
##
############## working with paragraphs, sentences, words, raw  ##############



In [None]:
#prints every paragraph in the entire corpus
for p in para_list:
    print (p)

# Wait for Enter before continuing. raw_input() only exists in Python 2;
# input() is the Python 3 equivalent.
input()

In [None]:
#prints every sentence in the entire corpus
for s in sents_list:
    print (s)

# Wait for Enter before continuing. raw_input() only exists in Python 2;
# input() is the Python 3 equivalent.
input()

In [None]:
#print every word in the entire corpus
for w in word_list:
    print (w)

# Wait for Enter before continuing. raw_input() only exists in Python 2;
# input() is the Python 3 equivalent.
input()

In [None]:
# Read the whole corpus as raw text (no file id given to raw()).
corpus_raw = corpus.raw()

# Echoing the variable shows the string's repr, escape sequences included.
# NOTE(review): this dumps the entire corpus into the cell output; consider corpus_raw[:500].
corpus_raw

# Printing instead renders the text the way it appears in the .txt files.
#print (corpus_raw)

In [None]:
##
######### operating on every element ############

#only gets length for first 20 words
#[len(w) for w in corpus.words('Donald_Trump/TAC_Doc2_Kavanaugh_Confirmation_Vote.txt')[0:20]]

"""
same as above but forces lower case for all words note that its important 
to use the one below because it eliminates python from double counting words 
due to capitalizaiton, ie: This and this are counted separately
"""
# Length of each of the first 20 tokens; lower-casing does not change a token's length.
[len(token.lower()) for token in corpus.words('Donald_Trump/TAC_Doc2_Kavanaugh_Confirmation_Vote.txt')[:20]]

##references the object defined above for the specified document
#[len(w) for w in word_list_s[0:20]]

In [None]:
#example on the importance of using .lower referenced above
"""
NOTE: not applicable for PlaintextCorpusReader.  Use this to check
if you are using not using NLTK corpus reader
you will see that the length of the corpus is larger for the 
option where words aren't forced to lower case
"""

# Both numbers are token counts: the list comprehension lower-cases every token
# one-for-one without removing any, so the two values are always equal.
# Wrap each side in set() to compare vocabulary sizes instead.
len(corpus.words()), len([w.lower() for w in corpus.words()])

In [None]:
# Same idea with sentences: number of tokens in each of the first 20 sentences.
[len(sent) for sent in corpus.sents('Donald_Trump/TAC_Doc2_Kavanaugh_Confirmation_Vote.txt')[:20]]

## Equivalent, using the per-document object defined earlier:
#[len(s) for s in sents_list_s[0:20]]

In [None]:
##
########## frequency analysis ##############


In [None]:
# Illustration only: length of every word, first across the whole corpus and
# then for one document. FreqDist (shown later) covers this more efficiently.
len_test = list(map(len, word_list))

len_test2 = list(map(len, corpus.words('Donald_Trump/TAC_Doc2_Kavanaugh_Confirmation_Vote.txt')))

len_test[:10], len_test2[:10]

# Note: the first two values match in both lists only by coincidence. Each text
# opens with an apostrophe, and the next words ('president' and 'kavanaugh')
# both happen to be 9 letters long.

In [None]:
from nltk.probability import FreqDist

# Word frequencies over the entire corpus (case preserved).
fdist_a = FreqDist(corpus.words())

# Word frequencies for a single DSI (case preserved).
fdist_b = FreqDist(corpus.words('Donald_Trump/AC_Doc1_Trump-Trade-S-Korea.txt'))

# Lower-cased frequencies over the whole corpus; word_list is defined earlier.
fdist = FreqDist(token.lower() for token in word_list)

# print(fdist) writes a one-line summary; the bare fdist shows the word/count pairs.
print(fdist), fdist

#print(fdist_a), fdist_a
#print(fdist_b), fdist_b

In [None]:
# Top-n most common (word, count) pairs.
# FreqDist subclasses collections.Counter, whose most_common() returns every
# item, in descending order of count, when n is omitted.
fdist.most_common(20)

In [None]:
# Most frequent word together with its own count. The count is looked up for
# whatever max() returns rather than for a hard-coded 'the'.
top_word = fdist.max()
top_word, fdist[top_word]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

#print the 20 most common terms
"""
see that most are stop words (except for Trump) which is probably 
so common that its also not very useful
"""
# Plot the counts of the 20 most common lower-cased tokens (non-cumulative).
fdist_plot = fdist.plot(20, cumulative=False)

In [None]:
# Report the counts for a hand-picked list of words of interest.

# Note: word_list is defined above.
fdist_modal = nltk.FreqDist([w.lower() for w in word_list])
modals = ['false', 'lie', 'accurate', 'falsifying', 'President', 'president']
for m in modals:
    # Read from the distribution built in this cell; the original read the
    # separate `fdist`, which a later cell rebinds to a different distribution.
    # Tokens were lower-cased above, so capitalised entries such as 'President' report 0.
    print (m + ':', fdist_modal[m])

In [None]:
# Alternative to the corpus reader's words(): tokenise the raw corpus text with
# word_tokenize and count those tokens (no lower-casing is applied here).
# NOTE(review): this rebinds `fdist`, so earlier cells that read `fdist` will see
# this distribution if they are re-run after this cell; consider a distinct name.
from nltk.probability import FreqDist
words = nltk.tokenize.word_tokenize(corpus.raw())
fdist = FreqDist(words)

In [None]:
##
###############  filter for words based on length #######################


In [None]:
# Every token in the NAFTA document that is longer than 10 characters.
long_words = [
    token
    for token in corpus.words('Donald_Trump/PKC_Doc1_US-Canada-NAFTA.txt')
    if len(token) > 10
]

long_words

In [None]:
# Select tokens from one document that meet several criteria at once:
# longer than 2 characters and occurring more than 5 times (counts from FreqDist).

txt1 = FreqDist(corpus.words('Donald_Trump/PKC_Doc1_US-Canada-NAFTA.txt'))
sorted(filter(lambda w: len(w) > 2 and txt1[w] > 5,
              corpus.words('Donald_Trump/PKC_Doc1_US-Canada-NAFTA.txt')))

In [None]:
############## find words based on conditions ########

"""
These are common word comparison operators:

s.startswith(t) test if s starts with t
s.endswith(t) test if s ends with t
t in s test if t is a substring of s
s.islower() test if s contains cased characters and all are lowercase
s.isupper() test if s contains cased characters and all are uppercase
s.isalpha() test if s is non-empty and all characters in s are alphabetic
s.isalnum() test if s is non-empty and all characters in s are alphanumeric
s.isdigit() test if s is non-empty and all characters in s are digits
s.istitle() test if s contains cased characters and is titlecased 
                (i.e. all words in s have initial capitals)
"""

In [None]:
# Tokens in the NAFTA document that end in 'ing', in sorted order.
sorted(filter(lambda token: token.endswith('ing'),
              corpus.words('Donald_Trump/PKC_Doc1_US-Canada-NAFTA.txt')))

In [None]:
# Match either suffix: str.endswith accepts a tuple of alternatives.
sorted(token for token in corpus.words()
       if token.endswith(('ible', 'ious')))

In [None]:
# Same filter written as an explicit loop: skip non-matching tokens, print the rest.
for suf in corpus.words():
    if not suf.endswith('ious'):
        continue
    print(suf)

In [None]:
# Tokens containing either of the letter sequences 'cie' or 'cei'.
tricky = sorted(
    token for token in corpus.words()
    if any(seq in token for seq in ('cie', 'cei'))
)
tricky

# # Optional: print them on one line instead
# for word in tricky:
#     print(word, end=" ")

In [None]:
#################################################################
#
#   Start Pre-Processing
#
#################################################################

In [None]:
###
############# Use Dictionary to replace terms ################

In [None]:
# Equivalence-class (EC) dictionary for the corpus DSIs.
# Each key is the canonical label; its list holds the surface terms that should
# be replaced by that label.
# Note: it would probably be more maintainable to read this from a text file
# rather than paste it all here.
# Every list item must be followed by a comma: Python silently concatenates
# adjacent string literals, which previously merged several pairs of terms
# (e.g. 'donald trump' 'mr trump' became 'donald trumpmr trump').

Trump_dictionary = {
    'donaldtrump': ['president trump', 'president donald trump', 'donald trump',
        'mr trump', 'trumps', 'trump organization', 'donald j trump'],
    'trump administration': ['senior trump administration officials', 'the white house',
        'trump administration', 'trump administration officials', 'federal government', 'us official',
        'us officials', 'us government', 'donald trumps top economic advisors', 'white house sources',
        'state department official', 'the official', 'washington', 'washingtons', 'the administration',
        'white house source', 'administration officials', 'donald trumps top economic advisor',
        'white house counsel', 'national security advisor', 'secretary of state',
        'chairman of white house council of economic advisers', 'white house press secretary',
        'press secretary', 'agriculture secretary'],
    'donaldmcgahan': ['donald f mcgahan II', 'don mcgahn', 'don f mcgahn', 'homeland security secretary'],
    'johnbolton': ['mr bolton', 'john bolton', 'bolton'],
    'mikepompeo': ['mike pompeo', 'pompeo'],
    'kirstjennielsen': ['kirstjen nielsen'],
    'kevenhassett': ['hassett'],
    'sarahsanders': ['sarah huckabee sanders', 'sanders'],
    'sonnyperdue': ['sonny perdue','mr perdue', 'mr perdues'],
    'mitchmcconnell': ['mitch mcconnell', 'mcconnell', 'senator mitch mcconnell',
        'majority leader', 'mr mcconnell'],
    'jeffflake': ['senator jeff flake', 'sen jeff flake'],
    'chuckgrassley': ['grassley', 'sen grassley', 'senator grassley'],
    'susancollins': ['sen susan collins', 'r-maine', 'senator collins', 'senator susan collins'],
    'berniesanders': ['sen bernie sanders', 'senator bernie sanders'],
    'chuckschumer': ['democratic majority leader', 'democratic majority leader chuck schumer'],
    'brettkavanaugh': ['kavanaugh', 'supreme court pick', 'supreme court nominee', 'judge brett m kavanaugh',
        'brett m kavanaugh', 'mr kavanaugh', 'president trumps nominee', 'nominee for the supreme court',
        'judge kavanaugh', 'supreme court nominee brett kavanaugh', 'brett', 'judge brett kavanaugh'],
    'brettkavanaughaccuser': ['deborah ramirez', 'christine blasey ford', 'charles ludington',
        'julie swetnick'],
    'brettkavanaughfriend': ['mark judge', 'pj smyth'],
    'christineblaseyford': ['dr ford', 'christine ford', 'ford'],
    'stephanieclifford': ['ex porn star stormy daniels', 'stormy daniels', 'clifford', 'ms daniels'],
    'michaelcohen': ['a lawyer for donald trump', 'cohen'],
    'melaniatrump': ['third wife melania', 'first lady', 'melania'],
    'georgewbush': ['george w bush', 'george bush'],
    'barackobama': ['obama administration', 'former president'],
    'chuckhagel': ['former defense secretary chuck hagel', 'hagel',
        'defense secretary for president barack obama'],
    'hillaryclinton': ['hillary', 'hillary clinton', 'hrc'],
    'johnpodestra': ['hillary clintons campaign chairman', 'hilaryclinton campaign chairman', 'podestra'],
    'julianassange': ['wikileaks founder julian assange', 'assange'],
    'robertmueller': ['mueller', 'special prosecutor'],
    # fixed: missing comma after 'moon jae in'
    'moonjae-in': ['moon jae-in', 'moon jaein', 'moon jae in', 'south korean president', 'moon'],
    'kimjong un': ['jong un'],
    'sebastianpinera': ['sebastian pinera'],
    # fixed: stray leading space in 'bashar al assad', missing comma after 'bashar alassad'
    'basharal-assad': ['assad', 'bashar al-assad', 'bashar al assad', 'bashar alassad', 'al-assads',
        'al assads', 'alassads'],
    'vladimirputin': ['vladimir putin', 'putin'],
    'shinzoabe': ['prime minister of japan', 'abe', 'shinzo'],
    'xijinping': ['president of china', 'xi'],
    'andresmanuallopezobrador': ['andrew manual lopez obrandor', 'mexicos president elect', 'lopez obrador'],
    'justintrudeau': ['canadian prime minister', 'justin trudeau', 'mr trudeau'],
    'enriquepenanieto': ['enrique pena nieto'],
    'hassanrouhani': ['hassan rouhani', 'rouhani'],
    'senate judiciary': ['senate judiciary committee', 'judiciary committee'],
    'senate':['senator', 'united states senate', 'senate floor', 'senate majority leader','sen'],
    'congress': ['lawmakers', 'congressional'],
    'house of representatives': ['the house'],
    'republican party': ['gop', 'conservatives', 'us republicans', 'republicans',
        'republican colleagues', 'republican'],
    'democratic party': ['democrat', 'us democrats', 'democrats'],
    'united states': ['us', 'u.s.', 'united states of america', 'american', 'americans',
        'americas'],
    'african americans': ['black workers'],
    'china': ['chinese', 'chinese government'],
    'syria': ['syrians'],
    'iran': ['iranians', 'iranian', 'tehran', 'basra', 'ayatollah'],
    'russia': ['russians', 'moscow', 'russian government', 'russian intelligence'],
    'european union': ['eu'],
    'canada': ['canadian'],
    'e3': ['european signatories', 'germany, france, and the united kingdom'],
    'opec': ['organization of the petroleum exporting countries', 'opec nations',
        '15-member Organization of the Petroleum Exporting Countries', 'opec source',
        'opec nations', 'mohammad sanusi barkindo, opecs sercurity general',
        'hossein kazempour ardebili irans govenor to opec'],
    'world trade organization': ['wto', 'world trade organization wto'],
    'council on foreign relations': ['lorand laskai'],
    'united nations': ['united nations general assembly'],
    'fbi': ['federal bureau of investigation', 'the bureau'],
    'department of health and human services': ['dhhs', 'hhs', 'health and human services',
        'hhss office of health reform'],
    'department of homeland security': ['dhs'],
    'deparment of justice': ['doj', 'justice department'],
    'bureau of labor and statistics': ['bls'],
    'bureau of economic analysis': ['bea'],
    'centers for medicare and medicaid services': ['cms'],
    'congressional budget office': ['cbo'],
    'islamic state': ['isis'],
    'us military': ['us forces', 'pentagon', 'us troops'],
    # fixed: missing comma after 'abcs four corners'
    'us media': ['white house pool', 'reporters', 'nbc news', 'washington post', 'the post',
        'defense one', 'cnn', 'cnns anderson cooper', 'cnns tal kopan', 'cnns john defterios',
        'media outlets', 'abc', 'abcs four corners', 'bloomberg news', 'american media',
        'wall street journal', 'abs good morning america', 'access hollywood', 'dailymail.com',
        'cnbc', 'businessinsider.com', 'gallup', 'gallup panel', 'reuters'],
    'russian media': ['rt', 'sputnik'],
    'chinese media': ['chinese-language newspaper'],
    'undocumented immigrants': ['undocumented migrants', 'immigrants'],
    'immigrant separation': ['separation of parents and children', 'taken from parents',
        'separations', 'split families', 'reunite children', 'reunite families',
        'reunite families already separated', 'reunite kids', 'separated families',
        'family separation', 'away from their kids', 'separate families'],
    'immigrant detention': ['prosecuting undocumented migrants', 'children being held in jail',
        'locking up of entire families', 'detain families', 'detail those families', 'prosecute adults',
        'families will be detained', 'detained families', 'indefinite detention'],
    'united states-korea free trade agreement': ['korus'],
    'north american free trade agreement': ['nafta', 'nafta deal'],
    'trade war': ['us tarrif', 'us tarrifs', 'trade standoff'],
    'sanctions': ['sanctions', 'economic sanctions', 'us sanctions'],
    'economy': ['market cycle'],
    'capital expenditure': ['capex', 'capital investment'],
    'economic growth': ['recovery', 'cyclical upswing'],
    # fixed: missing comma after 'market disruption'
    'economic suppression': ['financial crisis', 'downturn', 'recession', 'market disruption',
        'weaker growth'],
    'gross domestic product': ['gdp', 'gdp rate', 'gdp growth'],
    # fixed: missing comma after 'unemployment level'
    'unemployment': ['unemployment rate', 'manufacturing jobs', 'jobless numbers',
        'unemployment level', 'jobless rate', 'job creation', 'new jobs'],
    'stock market': ['s&p 500', 's&p 500 index', 'sp 500', 'sp 500 index', 's p 500', 's p 500 index'],
    'affordable care act': ['aca', 'aca plan', 'aca plans', 'obamacare', 'aca regulations',
        'acas regulations', 'aca health plans'],
    # fixed: missing comma after 'junk insurance'
    'short term insurance': ['short term plans', 'short term health plans', 'short-term plans',
        'short term medical plans', 'junk insurance', 'short term policies', 'skimpy plans',
        'skimpy health plans', 'short term limited duration plans', 'bare bones plans'],
    'financial institution': ['credit suisse', 'july beige book', 'federal reserve bank',
        'business roundtable', 'beige book'],
    'chief financial officer': ['cfo', 'cfos', 'chief financial officers', 'global cfo council'],
    'twitter': ['tweet', 'tweeted', 'tweeting'],
    # fixed: missing comma after 'false'
    'inaccuracy': ['erred', 'wrong', 'inconsistency', 'false tweet', 'error', 'misquoted',
        'false', 'not even close to accurate', 'falsifying'],
    'data': ['historical data', 'benchmark', 'deletes and reposts', 'government data',
        'statistics', 'economic data', 'survey']
}

In [None]:
# Echo the dictionary to confirm that it loaded.
Trump_dictionary

In [None]:
# Inspect what items() yields, for use in the replacement function defined below:
# k is the canonical label (dictionary key), v is the list of terms mapped to it.
for k, v in Trump_dictionary.items():
    #print (k)
    print(v)

In [None]:
# Use the dictionary to collapse equivalent terms onto one canonical label.
# Each dictionary key is the canonical label; its value is the list of surface
# terms that should be rewritten to that label.

def equivalence_class(dictionary, text):
    """Replace every equivalence-class term found in ``text`` with its label.

    All terms are compiled into one alternation and substituted in a single
    pass, so text produced by one replacement is never rewritten by another
    (e.g. 'us forces' -> 'us military' is not then turned into
    'united states military' by the 'us' term).

    Args:
        dictionary: mapping of canonical label -> iterable of term strings.
        text: the string to normalise. Matching is case-sensitive, so pass
            lower-cased text when the dictionary terms are lower case.

    Returns:
        A new string with every whole-term match replaced by its label.

    Raises:
        TypeError: if ``text`` is not a string (e.g. a corpus reader object).
    """
    import re  # local import: the notebook has no top-level `import re`

    if not isinstance(text, str):
        raise TypeError(
            "text must be a str (e.g. corpus.raw()), got %s" % type(text).__name__)

    # Map each term to its label. Keys are visited in reverse-sorted order and
    # the first label seen wins when a term is listed under more than one key.
    term_to_key = {}
    for key in sorted(dictionary, reverse=True):
        for term in dictionary[key]:
            if term:  # an empty term would match everywhere
                term_to_key.setdefault(term, key)

    if not term_to_key:
        return text

    # Longest terms first, so 'senator grassley' is tried before 'senator'.
    alternatives = sorted(term_to_key, key=len, reverse=True)

    # re.escape keeps '.', '&' and similar characters literal. The lookarounds
    # play the role of \b but also work for terms that start or end with
    # punctuation, such as 'u.s.'.
    pattern = re.compile(
        r"(?<!\w)(?:%s)(?!\w)" % "|".join(re.escape(term) for term in alternatives))

    # A function replacement inserts the label verbatim (no backslash processing).
    return pattern.sub(lambda match: term_to_key[match.group(0)], text)

# The function works on a string, so pass the raw corpus text rather than the
# reader object. The text is lower-cased because the dictionary terms are
# written in lower case. Only a preview is shown instead of the full corpus.
corpus_ec_text = equivalence_class(Trump_dictionary, corpus.raw().lower())
corpus_ec_text[:500]

In [None]:
##
############ Test Stemmer Results  ###################

In [None]:
##
########## Test Lemmatization Results #################
#compare to stemmed results
#consider modeling both and see if one produces better clusters

In [None]:
##
##################  Remove stopwords  ######################

In [None]:
###
############# Tag Words before Bi/Tri grams ##################

In [None]:
# Tagging/tokenising helpers for the steps that follow.
from nltk import pos_tag, sent_tokenize, wordpunct_tokenize
import string

# Download the averaged-perceptron tagger model (presumably the model pos_tag loads -- confirm).
nltk.download('averaged_perceptron_tagger')


In [None]:
nltk.download('stopwords')

# Load the English stopword list once and keep it as a set. The original
# re-read the list and scanned it linearly for every token in the corpus.
english_stopwords = set(nltk.corpus.stopwords.words('english'))

# Frequency distribution of the lower-cased tokens that are not stopwords.
all_words = nltk.FreqDist(
    token
    for token in (w.lower() for w in corpus.words())
    if token not in english_stopwords)

In [None]:
# Show the stopword-filtered frequency distribution.
all_words

In [None]:

# Plot the 20 most common tokens after stopword removal (non-cumulative).
# NOTE(review): this rebinds `fdist_plot`, which the earlier plotting cell also assigns.
fdist_plot = all_words.plot(20, cumulative=False)

In [None]:
##
###########  bigrams  ###############


In [None]:
# Tokens of the document the bigrams are built from.
bigrams = corpus.words('Donald_Trump/PKC_Doc1_US-Canada-NAFTA.txt')

# nltk.bigrams() is lazy: on its own it only gives back a generator object
# (e.g. <generator object bigrams at 0x...>) that is ready to be consumed.
# Materialise it into a list, then keep only the first 30 pairs.

[pair for pair in nltk.bigrams(bigrams)][:30]

In [None]:
##
###########  tri grams  ###############



In [None]:
##
###########  n-grams  ###############

In [None]:
##
###########  tri grams  ###############

In [None]:
#################################################################
#
#   Start Modeling  (see Paul's code)
#
#################################################################