# ES_Querying

In [1]:
import json, time, os
import pandas as pd
from elasticsearch import Elasticsearch

### Query

In [2]:
## Connect to the Elasticsearch node on localhost, port 9200
es_node = {'host': 'localhost', 'port': 9200}
es = Elasticsearch([es_node])

In [15]:
## Match query: documents whose `content` field matches the term 'trump'
search_object = {
    'query': {
        'match': {'content': 'trump'},
    },
}

## Run the query against the "corpus1" index.
## `size` raises the number of hits returned (the default is 10).
sch_obj = es.search(index="corpus1", body=search_object, size=1000)
# sch_obj  # uncomment to inspect the raw response

In [16]:
## The response nests the list of matching documents under ['hits']['hits']
hits_section = sch_obj['hits']
hits = hits_section['hits']
# hits  # uncomment to inspect the individual hits

In [17]:
## Pull the article text, publication name and date out of every hit.
## Each hit keeps its indexed fields under '_source'.
sources = [hit['_source'] for hit in hits]

texts = [src['content'] for src in sources]
publications = [src['publication'] for src in sources]
dates = [src['date'] for src in sources]

## Sentiment Analysis

### Vader
Article-level sentiment analysis using VADER, a lexicon-based model.

In [18]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

In [19]:
## Initialize Vader analyser
analyser = SentimentIntensityAnalyzer()

## Score every article once; each result is indexed by 'pos', 'neu', 'neg' and 'compound'
article_scores = [analyser.polarity_scores(article) for article in texts]

## Split the per-article results into one list per score, in the same order as `texts`
positive = [scores['pos'] for scores in article_scores]
neutral = [scores['neu'] for scores in article_scores]
negative = [scores['neg'] for scores in article_scores]
compound = [scores['compound'] for scores in article_scores]

In [20]:
## Assemble one row per article: text, publication, date and the four Vader scores

data = pd.DataFrame({
    'texts': texts,
    'publications': publications,
    'dates': dates,
    'sent_comp': compound,
    'sent_pos': positive,
    'sent_neg': negative,
    'sent_neu': neutral,
})

In [21]:
## Number of retrieved articles per publication
data['publications'].value_counts()

Washington Post        216
Breitbart              165
CNN                    113
Business Insider        73
Vox                     66
National Review         59
Atlantic                57
New York Times          52
NPR                     48
Fox News                44
Buzzfeed News           30
Reuters                 27
Talking Points Memo     22
New York Post           16
Guardian                12
Name: publications, dtype: int64

In [22]:
## NOTE(review): the printed headings say "Compound", but the totals below are
## sums of the `sent_pos` column, not `sent_comp`. They are also sums rather than
## means, so a publication with more articles gets a larger total — confirm intent.

def _positive_total(publication):
    """Return the sum of `sent_pos` over all rows of `data` for `publication`."""
    from_publication = data['publications'] == publication
    return data.loc[from_publication, 'sent_pos'].sum()


def _print_scores(heading, labelled_scores):
    """Print `heading` followed by each (label, score) pair, pairs separated by newlines."""
    pieces = [heading]
    for position, (label, score) in enumerate(labelled_scores):
        if position:
            # A newline goes between pairs only; none after the last pair
            pieces.append("\n")
        pieces.extend((label, score))
    print(*pieces)


breitbart_score = _positive_total('Breitbart')
national_score = _positive_total('National Review')
fox_score = _positive_total('Fox News')

cnn_score = _positive_total('CNN')
npr_score = _positive_total('NPR')
nyt_score = _positive_total('New York Times')

_print_scores("Compound right-wing scores: \n", [
    ("Breitbart score: ", breitbart_score),
    ("Fox News Score: ", fox_score),
    ("National Review Score: ", national_score),
])

_print_scores("\nCompound left-wing scores: \n", [
    ("CNN score: ", cnn_score),
    ("NPR News Score: ", npr_score),
    ("NYT: ", nyt_score),
])

Compound right-wing scores: 
 Breitbart score:  16.817999999999998 
 Fox News Score:  3.6390000000000002 
 National Review Score:  6.5249999999999995

Compound left-wing scores: 
 CNN score:  10.788 
 NPR News Score:  4.496 
 NYT:  4.912


### ML Approach

Pseudocode

    - Get sentences with relevant entity
    - Use spaCy's dependency parser to figure out which phrases refer to our entity
    - Run sentiment analysis on these phrases, calculate list of sentiments for the entity per document
    - Average the sentiment scores for that document


### Useful function calls

In [None]:
## Look up a single document in the "corpus1" index by its id
es.get(index='corpus1', id=1)

In [None]:
## Vader snippet test
## Quick check: run the Vader analyser created earlier in this notebook on one
## hand-picked sentence and display the scores it returns.

snippet = ' It amounts, instead, to a deliberate and cynical failure to implement the 2016 referendum result.'
analyser.polarity_scores(snippet)