In [None]:
#Bush press briefing analysis script, using compiled Jan-March 2001 press briefings (see Extract_text.. file)

In [6]:
import nltk

#Note: did not end up needing textblob, pattern library uses same sentiment classifier

from textblob import *
from textblob.sentiments import NaiveBayesAnalyzer

import re
import csv

#pattern library.. simple NLP tools based on product reviews
from pattern.en import parse, Sentence, parse
from pattern.en import modality
from pattern.en import mood
from pattern.en import sentiment
from pattern.en import positive

#for grade-level score, sentence complexity
from textstat.textstat import textstat

In [2]:
#NLTK sentence parsing
#Note: nltk's sentence tokenizer seems to split on ". ".. it does not split on other interruptions like "--"

#read the compiled briefing text; the with-block closes the file handle as soon as the text is in memory
with open('EDAVtest_compiled.txt') as f:
    raw = f.read()

#list of sentence strings, consumed by the cells below
sents = nltk.sent_tokenize(raw)

In [3]:
#test nlp metrics.. sentence, if_positive_sent, mood (aka sentence type), modality (aka certainty), polarity, subjectivity
#Note: TextBlob sentiment analysis, in this package, is based on pattern library, see link
#http://www.clips.ua.ac.be/pages/pattern-en#sentiment

for i in sents[:100]:
    if "FLEISCHER:" in i:
        print i, positive(i, threshold = 0.1), mood(i), modality(i), sentiment(i)[0], sentiment(i)[1]

FLEISCHER: Thanks for spending your early afternoon with me. True indicative 0.75 0.15 0.25
MR. FLEISCHER: I think what is most important is to protect the
economy. True indicative 0.65 0.45 0.75
I thought he was saying there was room --
MR. FLEISCHER: He is talking about in the soft economy --
Q  He was saying there was room in the surplus for cutting
taxes. False indicative 0.678571428571 -0.25 0.625
MR. FLEISCHER: But he also talked about an economic softness. True indicative 0.75 0.2 0.2
MR. FLEISCHER: I think he referred to the fact that in a time of
economic softness, tax cuts could be helpful. True conditional 0.21875 0.2 0.2
Q  But isn't the opposite true, didn't he make the opposite
--
MR. FLEISCHER: I saw that, actually, in an AP wire story. False indicative 0.75 0.0875 0.1875
MR. FLEISCHER: He also said -- and again, there are a series of
reasons -- let me repeat the three reasons President Bush believes we
should cut taxes --
Q  But can you skip to this issue --
MR. FLEISCH

In [9]:
#Tag nltk-produced "sentences" by speaker and write one CSV row of metrics per sentence.
#There were some cases where there were interjections: where a single "sentence" was really a single
#statement made up of multiple sentences split by "--", I chose to tag it as being by the spokesperson.
#A subsequent script would parse by those "--" markers first, in conjunction with "Q " or "[SPOKESPERSON NAME]:",
#then break those statements up into individual sentences.

#metrics (CSV columns, in order):

    #text-based/known:
        #administration - aka "Trump", "Obama", or "Bush"
        #sentence/string
        #person: "Spokesperson" or "Press_Corps" ("Unknown" for text that precedes the first speaker tag)

    #from pattern library: http://www.clips.ua.ac.be/pages/pattern-en#sentiment
        #positive(string, threshold) - True/False flag for positive sentiment
        #mood(string) - returns either INDICATIVE, IMPERATIVE, CONDITIONAL or SUBJUNCTIVE
        #modality(string) - degree of certainty as a value between -1.0 and +1.0, where values > +0.5 represent facts.
        #   For example, "I wish it would stop raining" scores -0.35, whereas "It will stop raining" scores +0.75.
        #   Accuracy is about 68% for Wikipedia texts.
        #sentiment: polarity & subjectivity, in that order

    #from textstat:
        #flesch-kincaid grade level, based on flesch-kincaid grade formula

#create simple variable in anticipation of merging multiple tables in R, for visualization purposes
admin = "Bush"

#markers that identify a press corps question; every one of them is checked against the sentence
PRESS_TAGS = ("Q ", "Q--", "Q  ")


def _metrics_row(admin, text, person):
    """Build one CSV row: admin, sentence, speaker, then the pattern/textstat metrics for `text`."""
    scores = sentiment(text)  #scores[0] = polarity, scores[1] = subjectivity
    return (
        admin,
        text,
        person,
        positive(text, threshold = 0.1),
        mood(text),
        modality(text),
        scores[0],
        scores[1],
        textstat.flesch_kincaid_grade(text),
    )


#speaker carried forward onto untagged sentences; initialised so that text appearing before the
#first speaker tag does not raise a NameError (or silently reuse a value left over from an earlier run)
person = "Unknown"

#'wb' is the mode the Python 2 csv module expects for output files
with open('EDAVtest_tagged.csv', 'wb') as out_file:
    writer = csv.writer(out_file)

    for string in sents:

        #cleaning.. in some cases a Q marking a press corp question had no space btw it and the question.
        #"LGBTQ" also screwed up the subsequent logic used to identify press corp questions based on "Q " tags
        #Hence, change LGBTQ to LGBT
        string = re.sub(r'(Q+)([A-Z])', r'\1 \2', string)
        string = string.replace('MR.', '').replace('\r', '').replace('\n', ' ').replace('LGBTQ', 'LGBT')

        #tag speaker; if no tag is found, `person` keeps the value from the previous sentence
        if "FLEISCHER:" in string:
            person = "Spokesperson"
            string = string.replace('FLEISCHER:', '')
        elif any(tag in string for tag in PRESS_TAGS):
            #the previous test, ("Q " or "Q--" or "Q  ") in string, only ever checked "Q "
            person = "Press_Corps"
            string = string.replace('Q ', '').replace('Q--', '').replace('Q  ', '')

        string = string.lstrip()

        #nothing left to score once tags and leading whitespace are removed
        if not string:
            continue

        #print person, [string]
        writer.writerow(_metrics_row(admin, string, person))
            
     

In [7]:
#Original TextBlob sentiment pass over the full text.. written before sentences were broken out by speaker

speaker = ' '.join(sents)
test = TextBlob(speaker)

with open('EDAVtest_sentiment.csv', "wb") as out_file:
    writer = csv.writer(out_file)
    #one row per TextBlob sentence: sentence, polarity, subjectivity
    for blob_sentence in test.sentences:
        scores = blob_sentence.sentiment
        writer.writerow((blob_sentence, scores.polarity, scores.subjectivity))
    


In [8]:
#verify textblob and pattern library produce same sentiment results.. 
#had initially used textblob but pattern library came in handy for addt'l features

for i in sents[:100]:
    if "FLEISCHER:" in i:
        blob = TextBlob(i)
        if (blob.sentiment.polarity == sentiment(i)[0]) & (blob.sentiment.subjectivity == sentiment(i)[1]):
            print "TRUE"

TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE


In [None]:
#textblob demo, from website
#(the ">>>" interpreter prompts and the pasted output line were removed so the cell runs as-is)

from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

blob = TextBlob("I love this library", analyzer=NaiveBayesAnalyzer())

#output shown on the website for this example:
#Sentiment(classification='pos', p_pos=0.7996209910191279, p_neg=0.2003790089808724)
blob.sentiment