# Working with Elasticsearch

### For guidance, see [Elastic Stack and Product Documentation](https://www.elastic.co/guide/index.html)

### Import necessary modules

In [1]:
# Python Elasticsearch Client docs at http://elasticsearch-py.readthedocs.io
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import pathlib
import time
# twittertools is my local twittertools.py module
import twittertools

### Instantiate Elasticsearch object

In [2]:
# Create the client object that the following cells use for all requests.
# Default connection to localhost:9200
es = Elasticsearch()

### Print current indices for reference

In [3]:
# Fetch the cluster's index listing and show it before anything is changed
index_listing = es.cat.indices()
print(index_listing)

yellow open twitter HxCSFDvTT7W_4tFOnkr-IQ 5 1 12820 0  8.7mb  8.7mb
yellow open .kibana Yuy-vovlQyeUPVim2PcdFw 1 1     2 0 10.9kb 10.9kb



### Get tweets from a few Twitter user timelines; index them in Elasticsearch

In [4]:
# Demo-only reset: drop the /twitter index so the run starts from scratch.
# Statuses 400 and 404 are passed to `ignore`, so e.g. a missing index
# does not raise
result = es.indices.delete(
    index='twitter',
    ignore=[400, 404],
)

In [5]:
# Build the authenticated TwitterTools object from the credentials file
# kept in the .twitter folder of the user's home directory
filepath = pathlib.Path.home() / '.twitter' / 'credentials.json'
twt = twittertools.TwitterTools(filepath)

In [6]:
# Pull each account's timeline and index its tweets one document at a time,
# printing a per-account count followed by the grand total
screen_names = ['pourmecoffee', 'washingtonpost', 'brainpicker', 'wilw']
total_tweets = 0
all_indexed = 0
for screen_name in screen_names:
    tweets = twt.get_user_timeline(screen_name)
    total_tweets += len(tweets)
    print(f"{len(tweets)} tweets retrieved from @{screen_name}'s timeline;", end=' ')
    # total_indexed is the count for this one timeline only
    total_indexed = 0
    for tweet in tweets:
        doc = twittertools.unpack_tweet(tweet)
        result = es.index(
            index='twitter',
            doc_type='tweet',
            body=doc,
        )
        # A document counts as indexed when the response's
        # _shards.successful value is truthy
        total_indexed += 1 if result['_shards']['successful'] else 0
    print(f'{total_indexed} indexed', flush=True)
    all_indexed += total_indexed
print('Total tweets indexed:', all_indexed)

3203 tweets retrieved from @pourmecoffee's timeline; 3203 indexed
3206 tweets retrieved from @washingtonpost's timeline; 3206 indexed
3217 tweets retrieved from @brainpicker's timeline; 3217 indexed
3194 tweets retrieved from @wilw's timeline; 3194 indexed
Total tweets indexed: 12820


### Confirm number of tweets indexed

In [7]:
# Newly indexed documents only become searchable after an index refresh,
# so request one explicitly instead of relying solely on the periodic
# background refresh
es.indices.refresh(index='twitter')

# Safety net: poll the document count until it matches the number of tweets
# indexed earlier (all_indexed), giving up once the accumulated sleep time
# would exceed max_wait
max_wait = 5.0  # seconds
sleep_wait = 0.50  # seconds between polls
sleep_count = 0
search = Search(using=es, index='twitter', doc_type='tweet')
doc_count = search.count()
while doc_count != all_indexed:
    sleep_count += 1
    if sleep_count * sleep_wait > max_wait:
        # Timed out; report whatever count was last seen
        break
    time.sleep(sleep_wait)
    doc_count = search.count()
print(f'\nFound {doc_count} indexed tweets')


Found 12820 indexed tweets


### Perform simple match query on tweet texts

In [8]:
# Match query against the tweet text. The [:10] slice pins the page size to
# ten so the "Top ten" label below does not depend on the server's default
# result size. No sort is specified, so hits arrive in the default
# relevance-score order
results = search.query("match", text="NASA")[:10].execute()
print(f'Got {results["hits"]["total"]} matches')
print('Top ten by relevance score:')
for hit in results['hits']['hits']:
    # Each raw hit keeps the indexed tweet document under '_source'
    tweet = hit['_source']
    print('-', tweet['screen_name'], tweet['created'], tweet['text'])

Got 36 matches
Top ten by relevance score:
- pourmecoffee 2017-06-05T22:31:52Z @NASA Like my tweets.
- pourmecoffee 2017-06-29T17:40:25Z @NASA @CassiniSaturn pierogi mmm
- pourmecoffee 2017-07-12T18:24:04Z @NASA Finally some non-Manhatten-sized icebergs.
- pourmecoffee 2017-09-10T01:43:52Z NASA engineers these releases every ten years to keep the people docile. I've said too much already.
- pourmecoffee 2017-08-18T11:51:27Z @NASA @NASA_TDRS You're firing on Earth do you take me for a fool?
- washingtonpost 2017-11-29T08:05:43Z A NASA astronaut films his spacewalk — and a breathtaking view of Earth https://t.co/ZaCEVRbTME
- washingtonpost 2017-11-15T16:12:53Z These close-up images from NASA show one of the largest icebergs to ever split off from Antarctica https://t.co/Rxa7ScbGII
- pourmecoffee 2017-07-23T01:22:29Z This is it. Just rely on your training and your unit now. "NASA Watches a Sunspot Turn Toward Earth"… https://t.co/RsNmQrLo35
- washingtonpost 2017-11-19T15:04:44Z Perspectiv