# ES_Querying

In [10]:
import json, time, os
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch

### Query

In [11]:
# Connect to the Elasticsearch node listening on localhost:9200.
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
# Open the 'corpus_one_doc' index, which every search in this notebook targets.
es.indices.open(index='corpus_one_doc')

{'acknowledged': True, 'shards_acknowledged': True}

In [37]:
## Define the search query: a `match` query for `search_term` on the `content` field.
## `search_term` is reused further down as the entity passed to get_entity_sentences.
search_term = 'bannon'
search_object = {'query': {'match': {'content': search_term}}}

## Apply the query to the indexed ES corpus.
## `size` raises the number of returned hits (the default is 10).
sch_obj = es.search(index="corpus_one_doc", body=search_object, size=10000)

In [38]:
## Grab the list of matching documents from the search response.
## Each element is a dict; the article fields live under its '_source' key.
hits = sch_obj['hits']['hits']

In [39]:
## Inspect the first returned hit to see which fields a document carries.
hits[0]

{'_index': 'corpus_one_doc',
 '_type': '_doc',
 '_id': '144290',
 '_score': 7.608578,
 '_source': {'id': 66321,
  'title': 'War breaks out between the Steve Bannon and Jared Kushner factions in the White House',
  'publication': 'Business Insider',
  'author': 'Pamela Engel',
  'date': 1491523200000,
  'content': '’  ’ ”   As White House Chief Strategist Steve Bannon  tensions between him and senior adviser Jared   Kushner, President Donald Trump’s    burst into the   open. ” ’   Several reports in recent days have detailed the brewing   conflict.  Thursday that Bannon   had called Kushner a ”cuck” behind his back. ’ ’   Bannon also told his associates, ”I love a gunfight,” according   to  which said that ”the hatred between the two   wings” in the White House was ”intense and irreconcilable.” ’ ”   The stories of  in the White House started rolling   out in earnest this week after the White House announced Trump   was  and that Bannon would no longer be on   it. In January, Trump  tha

In [40]:
## Pull the article text, publication and date out of every hit

sources = [hit['_source'] for hit in hits]

documents = [source['content'] for source in sources]
publications = [source['publication'] for source in sources]
dates = [source['date'] for source in sources]

In [41]:
## Collect the extracted fields into one DataFrame (one row per hit)

data = pd.DataFrame({
    'documents': documents,
    'publications': publications,
    'dates': dates,
})

In [42]:
## Sanity check: row count, dtypes and non-null counts of the assembled frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1333 entries, 0 to 1332
Data columns (total 3 columns):
documents       1333 non-null object
publications    1333 non-null object
dates           1333 non-null int64
dtypes: int64(1), object(2)
memory usage: 31.3+ KB


## Sentiment Analysis

In [None]:
def get_sentence_sentiments(sentences, method = 'ML'):

    """
    Description: Calculate average sentiments for a list of sentences
    Parameters:
        sentences: Sentences (list)
        method: Method to calculate sentiments, 'ML' or 'lexical' (str)
    Returns: Average score tuple for list of sentences (tuple).
        NOTE: neither method is implemented yet, so the placeholder
        value 0 is returned for every input.
    """

    if method == 'ML':
        # TODO: score `sentences` with the ML approach and average the results
        pass
    elif method == 'lexical':
        # TODO: score `sentences` with the lexical approach and average the results
        pass

    # Placeholder result until one of the branches above is implemented.
    return 0
         

In [56]:
def get_entity_sentences(doc, entity, nlp_model=None):

    """
    Description: Fetch sentences with entities
    Parameters:
        doc: Document (str)
        entity: Entity to look for; matched as a case-insensitive substring (str)
        nlp_model: Callable that takes the document and returns an object with a
            `.sents` iterable whose items expose `.text`. Defaults to the
            notebook-level `nlp` pipeline.
    Returns: List of sentences containing specified entity (list)
    """

    # Fall back to the notebook-level pipeline only when none is passed in.
    pipeline = nlp if nlp_model is None else nlp_model

    # Lower-case both sides so 'Bannon' and 'bannon' select the same sentences.
    needle = entity.lower()

    sentences = [sent.text.strip() for sent in pipeline(doc).sents]

    return [sentence for sentence in sentences if needle in sentence.lower()]

## Exploration

### Vader
Article-level sentiment analysis using a lexical model (VADER)

In [7]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

In [None]:
## Initialize Vader analyser
analyser = SentimentIntensityAnalyzer()

## Calculate sentiment scores for each article text.
## The texts are read from data['documents'] so the resulting lists line up
## row-for-row with `data`, where they are stored as sent_* columns.
article_scores = [analyser.polarity_scores(text) for text in data['documents']]

## One list per VADER score component
positive = [article_score['pos'] for article_score in article_scores]
neutral = [article_score['neu'] for article_score in article_scores]
negative = [article_score['neg'] for article_score in article_scores]
compound = [article_score['compound'] for article_score in article_scores]

In [None]:
## Attach each list of VADER scores to `data` as its own column
sentiment_columns = {
    'sent_comp': compound,
    'sent_pos': positive,
    'sent_neg': negative,
    'sent_neu': neutral,
}

for column_name, column_values in sentiment_columns.items():
    data[column_name] = column_values

In [None]:
## Number of retrieved articles per publication
data['publications'].value_counts()

In [None]:
def _positive_total(publication):
    """Sum the `sent_pos` column over the rows of `data` for one publication."""
    return data.loc[data['publications'] == publication, 'sent_pos'].sum()


## NOTE(review): these are sums, so they grow with the number of articles
## each publication contributes, not only with how positive the articles are.
breitbart_score = _positive_total('Breitbart')
national_score = _positive_total('National Review')
fox_score = _positive_total('Fox News')

cnn_score = _positive_total('CNN')
npr_score = _positive_total('NPR')
nyt_score = _positive_total('New York Times')

right_wing_report = [
    "Positive right-wing scores: \n",
    "Breitbart score: ", breitbart_score, "\n",
    "Fox News Score: ", fox_score, "\n",
    "National Review Score: ", national_score,
]
print(*right_wing_report)

left_wing_report = [
    "\nPositive left-wing scores: \n",
    "CNN score: ", cnn_score, "\n",
    "NPR News Score: ", npr_score, "\n",
    "NYT: ", nyt_score,
]
print(*left_wing_report)

### ML Approach

Pseudocode

    - Get sentences with relevant entity
    - Use spaCy's dependency parser to figure out which phrases/sentences refer to our entity
    - Run sentiment analysis on these phrases, calculate list of sentiments for the entity per document
    - Average the sentiment scores for that document


In [76]:
import spacy
import en_core_web_sm

# Load the en_core_web_sm spaCy pipeline. `nlp` is the notebook-level pipeline
# used by get_entity_sentences and by the dependency-tree cells.
nlp = en_core_web_sm.load()

In [120]:
## Pick one retrieved article (row label 15) as the working example
document = data.loc[15, 'documents']
document

'WASHINGTON  —   For the first 10 weeks of President Trump’s administration, no adviser loomed larger in the public imagination than Stephen K. Bannon, the raw and rumpled former chairman of Breitbart News who considers himself a “virulently  ” revolutionary out to destroy the “administrative state. ” But behind the scenes, White House officials said, the ideologist who enjoyed the president’s confidence became increasingly embattled as other advisers, including Mr. Trump’s daughter and    complained about setbacks on health care and immigration. Lately, Mr. Bannon has been conspicuously absent from some meetings. And now he has lost his seat at the national security table. In a move that was widely seen as a sign of changing fortunes, Mr. Trump removed Mr. Bannon, his chief strategist, from the National Security Council’s   “principals committee” on Wednesday. The shift was orchestrated by Lt. Gen. H. R. McMaster, Mr. Trump’s national security adviser, who insisted on purging a politi

In [121]:
## Sentences of the example article that mention the search term
entity_sentences = get_entity_sentences(doc=document, entity=search_term)
entity_sentences

['For the first 10 weeks of President Trump’s administration, no adviser loomed larger in the public imagination than Stephen K. Bannon, the raw and rumpled former chairman of Breitbart News who considers himself a “virulently  ” revolutionary out to destroy the “administrative state.',
 'Lately, Mr. Bannon has been conspicuously absent from some meetings.',
 'In a move that was widely seen as a sign of changing fortunes, Mr. Trump removed Mr. Bannon, his chief strategist, from the National Security Council’s   “principals committee” on Wednesday.',
 'Mr. Bannon resisted the move, even threatening at one point to quit if it went forward, according to a White House official who, like others, insisted on anonymity to discuss internal deliberations.',
 'Mr. Bannon’s camp denied that he had threatened to resign and spent the day spreading the word that the shift was a natural evolution, not a signal of any diminution of his outsize influence.',
 'His allies said privately that Mr. Bannon h

In [122]:
## Example sentence for the single-sentence sentiment comparisons below.
## It is hard-coded; the text matches the first entry of the `entity_sentences` output shown above.
example = 'For the first 10 weeks of President Trump’s administration, no adviser loomed larger in the public imagination than Stephen K. Bannon, the raw and rumpled former chairman of Breitbart News who considers himself a “virulently  ” revolutionary out to destroy the “administrative state.'
example

'For the first 10 weeks of President Trump’s administration, no adviser loomed larger in the public imagination than Stephen K. Bannon, the raw and rumpled former chairman of Breitbart News who considers himself a “virulently  ” revolutionary out to destroy the “administrative state.'

In [123]:
## TextBlob example: polarity and subjectivity of the single example sentence

from textblob import TextBlob

TextBlob(example).sentiment

Sentiment(polarity=-0.030128205128205132, subjectivity=0.22692307692307692)

In [124]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## VADER scores for the single example sentence
analyser = SentimentIntensityAnalyzer()

example_score = analyser.polarity_scores(example)

## Read the components straight from the result dict. Binding them to
## `positive` / `neutral` / `negative` would overwrite the per-article score
## lists of the same names that feed the sent_* columns of `data`.
print('Positive: ', example_score['pos'],
      "\nNegative: ", example_score['neg'],
      "\nNeutral: ", example_score['neu'])

Positive:  0.0 
Negative:  0.17 
Neutral:  0.83


## Dependency Tree Visualizations

In [59]:
from spacy import displacy

## Render the dependency parse of one entity sentence inline in the notebook.
## The index 12 is an arbitrary pick from `entity_sentences`.
displacy.render(nlp(entity_sentences[12]), style='dep',jupyter=True)

In [85]:
from nltk import Tree


def to_nltk_tree(node):
    """
    Description: Convert a parsed token and its descendants into an nltk Tree
    Parameters:
        node: Token exposing `n_lefts`, `n_rights`, `children` and `orth_`
    Returns: Tree labelled with the token text, or the bare token text (str)
        when the token has no children
    """
    # Guard clause: a token without left or right children is a leaf.
    if node.n_lefts + node.n_rights <= 0:
        return node.orth_

    subtrees = [to_nltk_tree(child) for child in node.children]
    return Tree(node.orth_, subtrees)
    
doc = nlp(entity_sentences[6])
print(doc)

## Print one ASCII dependency tree per sentence. A plain loop is used because
## pretty_print() only prints; collecting its return values in a list just
## adds a `[None]` cell output.
for sent in doc.sents:
    tree = to_nltk_tree(sent.root)
    if isinstance(tree, Tree):
        tree.pretty_print()
    else:
        # Root without children: to_nltk_tree returns the bare token text.
        print(tree)


A White House official portrayed the change as a natural progression rather than a demotion for Bannon.
                              portrayed                                        
  ________________________________|__________________                           
 |         |             |                           as                        
 |         |             |                           |                          
 |         |             |                      progression                    
 |         |             |         __________________|_____________             
 |      official         |        |        |         |          demotion       
 |    _____|_______      |        |        |         |        _____|_______     
 |   |           House change     |        |        than     |            for  
 |   |             |     |        |        |         |       |             |    
 .   A           White  the       a     natural    rather    a           Bannon



[None]

In [None]:
## Dump dependency information for every token of every entity sentence.
## Iterates over `entity_sentences`, the sentence list built in this notebook.
for sentence in entity_sentences:

    parsed_sentence = nlp(sentence)

    for token in parsed_sentence:

        print("Token: ", token, "\n")
        print("\n\tToken.dep_: ", token.dep_, "\n\tToken.head: ", token.head, "\n\tToken.head.dep_: ",  token.head.dep_, "\n")

In [None]:
import sys

## Environment check: list every entry of the interpreter's module search path
for search_path_entry in sys.path:
    print(search_path_entry)

### Useful function calls

In [None]:
## Fetch document by id
## NOTE(review): this targets index 'corpus1', while every other cell uses
## 'corpus_one_doc' — confirm that 'corpus1' exists and that id 1 is valid there.
es.get(index = 'corpus1', id=1)

In [None]:
## Vader snippet test: score one hand-picked sentence with the `analyser` created earlier

snippet = ' It amounts, instead, to a deliberate and cynical failure to implement the 2016 referendum result.'
analyser.polarity_scores(snippet)

In [None]:
## Convert each entry of `scores` to an array, stack them, and average along axis 0.
## NOTE(review): `scores` is not defined anywhere in the visible notebook — it must
## be created before this cell runs. Presumably every entry has the same length,
## so the mean is taken per score component; TODO confirm.
scores = np.array([np.array(xi) for xi in scores])
mean_scores = np.mean(scores, axis=0)

In [None]:
# Install the en_core_web_sm 2.1.0 model archive from the spaCy models GitHub releases.
# It has to be installed before the `import en_core_web_sm` cell can run.
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz