# Working with Elasticsearch and Elasticsearch Python clients

### For Elastic Stack guidance, see [Elastic Stack and Product Documentation](https://www.elastic.co/guide/index.html)

### Python Elasticsearch Client docs: http://elasticsearch-py.readthedocs.io

#### Note: To run this notebook, ensure an Elasticsearch instance is running on localhost:9200 and that Twitter credentials are available in `~/.twitter/credentials.json`.

### Import necessary modules

In [1]:
import elasticsearch
import elasticsearch_dsl
from IPython.core.interactiveshell import InteractiveShell
import pandas as pd
import pathlib
from pprint import pprint
import time
import twittertools  # my local twittertools.py module

In [2]:
# Display the value of every expression statement in a cell, not only the
# last one. Later cells rely on this: they place a bare expression inside a
# `for` loop and expect each value to be rendered as output.
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
def cat_DataFrame(command):
    """
    Run a _cat command and tabulate its response.

    Parameters
    ----------
    command : callable
        Called as ``command(format='json', v=True)``; in this notebook
        it is one of the client's ``es.cat.*`` methods.

    Returns
    -------
    pandas.DataFrame
        A DataFrame constructed directly from whatever `command` returns.
    """
    # Request verbose JSON so the response can be handed straight to pandas
    cat_rows = command(format='json', v=True)
    table = pd.DataFrame(cat_rows)
    return table

### Instantiate Elasticsearch object

In [4]:
# Create the client with no arguments, i.e. the default connection to
# localhost:9200. `es` is the client object used by the cells below.
es = elasticsearch.Elasticsearch()

### Print system information for reference

In [5]:
# Pair each report heading with the _cat command that produces it
reports = list(zip(
    ('Health:', 'Nodes:', 'Shards:', 'Indices:'),
    (es.cat.health, es.cat.nodes, es.cat.shards, es.cat.indices),
))
for name, report in reports:
    print(name)
    # Bare expression on purpose: it is rendered as cell output because
    # ast_node_interactivity is set to 'all' earlier in the notebook
    cat_DataFrame(report)

Health:


Unnamed: 0,active_shards_percent,cluster,epoch,init,max_task_wait_time,node.data,node.total,pending_tasks,pri,relo,shards,status,timestamp,unassign
0,50.0%,elasticsearch,1512670835,0,-,1,1,0,6,0,6,yellow,11:20:35,6


Nodes:


Unnamed: 0,cpu,heap.percent,ip,load_15m,load_1m,load_5m,master,name,node.role,ram.percent
0,19,33,127.0.0.1,0.44,0.43,0.34,*,1Y86nkN,mdi,29


Shards:


Unnamed: 0,docs,index,ip,node,prirep,shard,state,store
0,2504.0,twitter,127.0.0.1,1Y86nkN,p,2,STARTED,1.6mb
1,,twitter,,,r,2,UNASSIGNED,
2,2600.0,twitter,127.0.0.1,1Y86nkN,p,3,STARTED,1.7mb
3,,twitter,,,r,3,UNASSIGNED,
4,2599.0,twitter,127.0.0.1,1Y86nkN,p,4,STARTED,1.7mb
5,,twitter,,,r,4,UNASSIGNED,
6,2549.0,twitter,127.0.0.1,1Y86nkN,p,1,STARTED,1.7mb
7,,twitter,,,r,1,UNASSIGNED,
8,2623.0,twitter,127.0.0.1,1Y86nkN,p,0,STARTED,1.7mb
9,,twitter,,,r,0,UNASSIGNED,


Indices:


Unnamed: 0,docs.count,docs.deleted,health,index,pri,pri.store.size,rep,status,store.size,uuid
0,2,0,yellow,.kibana,1,10.9kb,1,open,10.9kb,Yuy-vovlQyeUPVim2PcdFw
1,12875,0,yellow,twitter,5,8.6mb,1,open,8.6mb,GaS-KKppRM6IP2bqy6QQ5w


### Get tweets from a few Twitter user timelines; index them in Elasticsearch

In [6]:
# For this demonstration only, delete any existing /twitter index, so the
# tweets indexed below are the only documents in it.
# NOTE(review): ignore=[400, 404] presumably tells the client not to raise on
# those HTTP statuses (e.g. when the index does not exist yet) -- confirm
# against the elasticsearch-py documentation.
result = es.indices.delete(index='twitter', ignore=[400, 404])

In [7]:
# Build an authenticated TwitterTools object from the credentials file
# kept in the .twitter folder of the current user's home directory
filepath = pathlib.Path.home() / '.twitter' / 'credentials.json'
twt = twittertools.TwitterTools(filepath)

In [8]:
# Running totals across all timelines
screen_names = ['pourmecoffee', 'washingtonpost', 'brainpicker', 'wilw']
total_tweets = 0
all_indexed = 0
for screen_name in screen_names:
    tweets = twt.get_user_timeline(screen_name)
    total_tweets += len(tweets)
    print(f"{len(tweets)} tweets retrieved from @{screen_name}'s timeline;", end=' ')

    # Index this timeline one tweet at a time, counting only the requests
    # whose response reports a truthy `_shards.successful` value
    total_indexed = 0
    for tweet in tweets:
        response = es.index(index='twitter', doc_type='tweet',
                            body=twittertools.unpack_tweet(tweet))
        if not response['_shards']['successful']:
            continue
        total_indexed += 1

    print(f'{total_indexed} indexed', flush=True)
    all_indexed += total_indexed
print('Total tweets indexed:', all_indexed)

3206 tweets retrieved from @pourmecoffee's timeline; 3206 indexed
3234 tweets retrieved from @washingtonpost's timeline; 3234 indexed
3235 tweets retrieved from @brainpicker's timeline; 3235 indexed
3203 tweets retrieved from @wilw's timeline; 3203 indexed
Total tweets indexed: 12878


### Confirm number of tweets indexed

In [9]:
# Expect latency (about 1 sec.) between indexing and getting
# complete search results. Poll the document count, pausing briefly
# between attempts, until it matches the number of tweets indexed above
# or the accumulated wait exceeds `max_wait`.

sleep_wait = 0.50  # seconds to pause between polls
max_wait = 5.0     # seconds; give up once the accumulated wait exceeds this
sleep_count = 0
search = elasticsearch_dsl.Search(using=es, index='twitter', doc_type='tweet')
doc_count = search.count()
while doc_count != all_indexed:
    sleep_count += 1
    if sleep_count*sleep_wait > max_wait:
        break
    time.sleep(sleep_wait)
    doc_count = search.count()
print(f'\nFound {doc_count} indexed tweets')
if doc_count != all_indexed:
    # The loop above gave up waiting: say so explicitly rather than letting
    # a mismatched count pass as a confirmation.
    print(f'WARNING: expected {all_indexed} indexed tweets; '
          f'counts still differ after waiting {max_wait} seconds')


Found 12878 indexed tweets


### Inspect all index settings and mappings

In [10]:
# '_all' addresses every index, so one call returns the settings of each
all_indices = elasticsearch_dsl.Index('_all', using=es)
settings_by_index = all_indices.get_settings()
for index_name in settings_by_index:
    print(f'"{index_name}" index settings:')
    pprint(settings_by_index[index_name])

"twitter" index settings:
{'settings': {'index': {'creation_date': '1512670850737',
                        'number_of_replicas': '1',
                        'number_of_shards': '5',
                        'provided_name': 'twitter',
                        'uuid': '2YNQm4K9RmuJE0XJ7wJoNA',
                        'version': {'created': '6000099'}}}}
".kibana" index settings:
{'settings': {'index': {'creation_date': '1511908972927',
                        'number_of_replicas': '1',
                        'number_of_shards': '1',
                        'provided_name': '.kibana',
                        'uuid': 'Yuy-vovlQyeUPVim2PcdFw',
                        'version': {'created': '6000099'}}}}


In [11]:
# Show the mapping of every index covered by `all_indices`
for index_name, index_mapping in all_indices.get_mapping().items():
    print(f'"{index_name}" index mapping:')
    # Bare expression on purpose: it is rendered as cell output because
    # ast_node_interactivity is set to 'all' earlier in the notebook
    index_mapping

"twitter" index mapping:


{'mappings': {'tweet': {'properties': {'created': {'type': 'date'},
    'expanded_urls': {'fields': {'keyword': {'ignore_above': 256,
       'type': 'keyword'}},
     'type': 'text'},
    'hashtags': {'fields': {'keyword': {'ignore_above': 256,
       'type': 'keyword'}},
     'type': 'text'},
    'media_types': {'fields': {'keyword': {'ignore_above': 256,
       'type': 'keyword'}},
     'type': 'text'},
    'media_urls': {'fields': {'keyword': {'ignore_above': 256,
       'type': 'keyword'}},
     'type': 'text'},
    'mentions': {'fields': {'keyword': {'ignore_above': 256,
       'type': 'keyword'}},
     'type': 'text'},
    'retweet_count': {'type': 'long'},
    'screen_name': {'fields': {'keyword': {'ignore_above': 256,
       'type': 'keyword'}},
     'type': 'text'},
    'text': {'fields': {'keyword': {'ignore_above': 256, 'type': 'keyword'}},
     'type': 'text'},
    'tweet_id': {'fields': {'keyword': {'ignore_above': 256,
       'type': 'keyword'}},
     'type': 'text'},
   

".kibana" index mapping:


{'mappings': {'doc': {'dynamic': 'strict',
   'properties': {'config': {'dynamic': 'true',
     'properties': {'buildNum': {'type': 'keyword'},
      'defaultIndex': {'fields': {'keyword': {'ignore_above': 256,
         'type': 'keyword'}},
       'type': 'text'}}},
    'dashboard': {'properties': {'description': {'type': 'text'},
      'hits': {'type': 'integer'},
      'kibanaSavedObjectMeta': {'properties': {'searchSourceJSON': {'type': 'text'}}},
      'optionsJSON': {'type': 'text'},
      'panelsJSON': {'type': 'text'},
      'refreshInterval': {'properties': {'display': {'type': 'keyword'},
        'pause': {'type': 'boolean'},
        'section': {'type': 'integer'},
        'value': {'type': 'integer'}}},
      'timeFrom': {'type': 'keyword'},
      'timeRestore': {'type': 'boolean'},
      'timeTo': {'type': 'keyword'},
      'title': {'type': 'text'},
      'uiStateJSON': {'type': 'text'},
      'version': {'type': 'integer'}}},
    'index-pattern': {'properties': {'fieldForm

### Perform a simple match query on tweet texts

In [12]:
# Match query on the tweet text field, run with the search's default paging
results = search.query("match", text="NASA").execute()
print(f'Got {results["hits"]["total"]} matches')
print('Top ten by relevance score:')
for hit in results['hits']['hits']:
    # One line per hit: who tweeted, when, and the tweet text
    print('-', *(hit['_source'][field]
                 for field in ('screen_name', 'created', 'text')))

Got 36 matches
Top ten by relevance score:
- pourmecoffee 2017-06-29T17:40:25Z @NASA @CassiniSaturn pierogi mmm
- pourmecoffee 2017-06-05T22:31:52Z @NASA Like my tweets.
- pourmecoffee 2017-07-12T18:24:04Z @NASA Finally some non-Manhatten-sized icebergs.
- pourmecoffee 2017-08-18T11:51:27Z @NASA @NASA_TDRS You're firing on Earth do you take me for a fool?
- pourmecoffee 2017-09-07T22:54:20Z Rare shot from NASA of Carl Sagan just minutes after he landed on Earth. https://t.co/XlXNWKubTn
- pourmecoffee 2017-06-27T21:43:45Z The broadcast is: "prepare to die." "NASA Celebrates International Asteroid Day with Special Broadcast" https://t.co/5MU83MlESQ
- washingtonpost 2017-11-19T15:04:44Z Perspective: Please stop annoying this NASA scientist with your ridiculous Planet X doomsday theories https://t.co/soTGVzQqqm
- washingtonpost 2017-11-28T18:58:00Z A NASA astronaut films his spacewalk — and a breathtaking view of Earth https://t.co/3WrPgggQVF
- washingtonpost 2017-11-29T08:05:43Z A NASA as