# ES_Querying

In [1]:
import json, time, os
import pandas as pd
from elasticsearch import Elasticsearch

### Query

In [2]:
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

In [6]:
## Define search query 
search_object = {'query': {'match': {'content': 'brexit'}}}

## Applying query term to indexed ES corpus
sch_obj = es.search(index="corpus1", body=search_object, size=100) #Increase returns with size parameter, default 10

In [8]:
## Grabbing the list containing queries 
hits = sch_obj['hits']['hits']
#hits

In [9]:
## Fetching hit's texts, sources, publications

texts = [text['_source']['content'] for text in hits]
publications = [ text['_source']['publication'] for text in hits]
dates = [text['_source']['date'] for text in hits]

In [10]:
## Organize into df

data = pd.DataFrame()

data['texts'] = texts
data['publications'] = publications
data['dates'] = dates

In [11]:
data.head()

Unnamed: 0,texts,publications,dates
0,Remain campaign supporters are rushing to blam...,Breitbart,1466035200000
1,The EU may be agreed on its response to Britai...,Guardian,1476835200000
2,Theresa May’s insistence on starting Brexit n...,Reuters,1497139200000
3,"Ulster’s Democratic Unionist Party, which is e...",Breitbart,1497571200000
4,Actor Matt Damon has weighed into the UK refer...,Breitbart,1464912000000


## Sentiment Analysis

### Vader
Article level sentiment analysis using lexical model

In [12]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

In [13]:
## Initialize Vader analyser
analyser = SentimentIntensityAnalyzer()

positive, negative, neutral, compound = ([] for i in range(4))

## Calculate sentiment scores for each text
for number, i in enumerate(texts):
    
    score = analyser.polarity_scores(i)
    
    positive.append(score['pos'])
    neutral.append(score['neu'])
    negative.append(score['neg'])
    compound.append(score['compound'])

In [14]:
data['sent_comp'] = compound
data['sent_pos'] = positive
data['sent_neg'] = negative
data['sent_neu'] = neutral

In [15]:
data.publications.value_counts()

Breitbart           24
National Review     14
Reuters             13
Guardian            12
Washington Post      8
Vox                  7
NPR                  6
New York Times       4
Business Insider     3
New York Post        3
CNN                  3
Atlantic             2
Fox News             1
Name: publications, dtype: int64

In [17]:
breitbart_score = data[data.publications == 'Breitbart'].sent_pos.sum()
national_score = data[data.publications == 'National Review'].sent_pos.sum()
fox_score = data[data.publications == 'Fox News'].sent_pos.sum()

cnn_score = data[data.publications == 'CNN'].sent_pos.sum()
npr_score = data[data.publications == 'NPR'].sent_pos.sum()
nyt_score = data[data.publications == 'New York Times'].sent_pos.sum()

print("Positive right-wing scores: \n",
      "Breitbart score: ", breitbart_score, "\n",
      "Fox News Score: ", fox_score, "\n",
      "National Review Score: ", national_score)

print("\nPositive left-wing scores: \n",
      "CNN score: ", cnn_score, "\n",
      "NPR News Score: ", npr_score, "\n",
      "NYT: ", nyt_score)

Positive right-wing scores: 
 Breitbart score:  1.9969999999999999 
 Fox News Score:  0.143 
 National Review Score:  1.657

Positive left-wing scores: 
 CNN score:  0.329 
 NPR News Score:  0.46799999999999997 
 NYT:  0.388


### ML Approach

Pseudocode

    - Get sentences with relevant entity
    - Use spacy dependecy parser to figure our phrases that refer to our entity
    - Run sentiment analysis on these phrases, calculate list of sentiments for the entity per document
    - Average the sentiment scores for that document


In [20]:
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()

In [26]:
## Get sentences with relevant entity

for text in data.texts:
    
    sentences = [sent.string.strip() for sent in nlp(text).sents]
    
    #print(sentences)
    
    sentences_brexit = [s for s in sentences if "brexit" in s.lower()]
    

In [34]:
!pip install nltk



You are using pip version 18.0, however version 19.2.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [37]:
from nltk import Tree


def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
    else:
        return node.orth_
    
doc = nlp(sentences_brexit[10])
print(doc)

[to_nltk_tree(sent.root).pretty_print() for sent in doc.sents]


The SNP Won’t Let the Issue of Scottish Independence GoAfter the U. K. Election, Expect Brexit to Continue ApaceBrexit
                             Let                                                          
  ____________________________|______________________________                              
 |   |   |       Issue                                       |                            
 |   |   |    _____|__________                               |                             
 |   |   |   |                of                             |                            
 |   |   |   |                |                              |                             
 |   |   |   |             GoAfter                           |                            
 |   |   |   |      __________|______________                |                             
 |   |   |   |     |          |           Election         Expect                         
 |   |   |   |     |          |         _____|______     _

[None]

In [33]:
for s in sentences_brexit:
    
    s = nlp(s) 
    
    for token in s:
        
        print("Token: ", token, "\n")
        print("\n\tToken.dep_: ", token.dep_, "\n\tToken.head: ", token.head, "\n\tToken.head.dep_: ",  token.head.dep_, "\n")

Token:  As 


	Token.dep_:  mark 
	Token.head:  vote 
	Token.head.dep_:  advcl 

Token:  Britons 


	Token.dep_:  nsubj 
	Token.head:  vote 
	Token.head.dep_:  advcl 

Token:  vote 


	Token.dep_:  advcl 
	Token.head:  argue 
	Token.head.dep_:  ROOT 

Token:  tomorrow 


	Token.dep_:  npadvmod 
	Token.head:  vote 
	Token.head.dep_:  advcl 

Token:  to 


	Token.dep_:  prep 
	Token.head:  vote 
	Token.head.dep_:  advcl 

Token:  Brexit 


	Token.dep_:  pobj 
	Token.head:  to 
	Token.head.dep_:  prep 

Token:  from 


	Token.dep_:  prep 
	Token.head:  vote 
	Token.head.dep_:  advcl 

Token:  or 


	Token.dep_:  cc 
	Token.head:  from 
	Token.head.dep_:  prep 

Token:  Bremain 


	Token.dep_:  conj 
	Token.head:  from 
	Token.head.dep_:  prep 

Token:  in 


	Token.dep_:  prep 
	Token.head:  vote 
	Token.head.dep_:  advcl 

Token:  the 


	Token.dep_:  det 
	Token.head:  Union 
	Token.head.dep_:  pobj 

Token:  European 


	Token.dep_:  compound 
	Token.head:  Union 
	Token.head.dep_:  po

	Token.head.dep_:  ROOT 

Token:  . 


	Token.dep_:  punct 
	Token.head:  contend 
	Token.head.dep_:  ROOT 

Token:  Brexit 


	Token.dep_:  nsubj 
	Token.head:  make 
	Token.head.dep_:  ROOT 

Token:  will 


	Token.dep_:  aux 
	Token.head:  make 
	Token.head.dep_:  ROOT 

Token:  make 


	Token.dep_:  ROOT 
	Token.head:  make 
	Token.head.dep_:  ROOT 

Token:  everyone 


	Token.dep_:  nsubj 
	Token.head:  think 
	Token.head.dep_:  ccomp 

Token:  think 


	Token.dep_:  ccomp 
	Token.head:  make 
	Token.head.dep_:  ROOT 

Token:  again 


	Token.dep_:  advmod 
	Token.head:  think 
	Token.head.dep_:  ccomp 

Token:  , 


	Token.dep_:  punct 
	Token.head:  force 
	Token.head.dep_:  conj 

Token:  force 


	Token.dep_:  conj 
	Token.head:  make 
	Token.head.dep_:  ROOT 

Token:  us 


	Token.dep_:  dobj 
	Token.head:  force 
	Token.head.dep_:  conj 

Token:  to 


	Token.dep_:  aux 
	Token.head:  ask 
	Token.head.dep_:  xcomp 

Token:  ask 


	Token.dep_:  xcomp 
	Token.head:  force 
	T

	Token.head:  appear 
	Token.head.dep_:  advcl 

Token:  back 


	Token.dep_:  advmod 
	Token.head:  firmed 
	Token.head.dep_:  xcomp 

Token:  up 


	Token.dep_:  prt 
	Token.head:  firmed 
	Token.head.dep_:  xcomp 

Token:  to 


	Token.dep_:  prep 
	Token.head:  up 
	Token.head.dep_:  prt 

Token:  a 


	Token.dep_:  det 
	Token.head:  tie 
	Token.head.dep_:  pobj 

Token:  virtual 


	Token.dep_:  amod 
	Token.head:  tie 
	Token.head.dep_:  pobj 

Token:  tie 


	Token.dep_:  pobj 
	Token.head:  to 
	Token.head.dep_:  prep 

Token:  . 


	Token.dep_:  punct 
	Token.head:  showed 
	Token.head.dep_:  ROOT 

Token:  This 


	Token.dep_:  nsubj 
	Token.head:  yield 
	Token.head.dep_:  ROOT 

Token:  could 


	Token.dep_:  aux 
	Token.head:  yield 
	Token.head.dep_:  ROOT 

Token:  yield 


	Token.dep_:  ROOT 
	Token.head:  yield 
	Token.head.dep_:  ROOT 

Token:  a 


	Token.dep_:  det 
	Token.head:  victory 
	Token.head.dep_:  dobj 

Token:  Brexit 


	Token.dep_:  compound 
	Token.he

In [None]:
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz

In [None]:
import sys
for p in sys.path:
    print(p)

### Useful function calls

In [None]:
## Fetch document by id
es.get(index = 'corpus1', id=1)

In [None]:
## Vader snippet test

snippet = ' It amounts, instead, to a deliberate and cynical failure to implement the 2016 referendum result.'
analyser.polarity_scores(snippet)