In [1]:
%%bash
sudo apt-get update
sudo apt-get install default-jre

Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [696 B]
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:10 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:11 http://security.ubuntu.com/ubuntu bionic-security/restricted amd64 Packages [685 kB]
Get:12 http://security.ubuntu.c

In [2]:
import os
import time
from subprocess import Popen, PIPE, STDOUT

In [3]:
%%bash
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.0.0-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.0.0-linux-x86_64.tar.gz
chown -R daemon:daemon elasticsearch-7.0.0

In [4]:
time.sleep(30)

In [5]:
es_server = Popen(['elasticsearch-7.0.0/bin/elasticsearch'], 
                  stdout=PIPE, stderr=STDOUT,
                  preexec_fn=lambda: os.setuid(1)
                 )

In [6]:
time.sleep(30)

In [7]:
%%bash
curl -X GET "localhost:9200/"

{
  "name" : "3ae48a305a99",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "cZJpAYwNQZ6Hy-Ux7AFtvw",
  "version" : {
    "number" : "7.0.0",
    "build_flavor" : "default",
    "build_type" : "tar",
    "build_hash" : "b7e28a7",
    "build_date" : "2019-04-05T22:55:32.697037Z",
    "build_snapshot" : false,
    "lucene_version" : "8.0.0",
    "minimum_wire_compatibility_version" : "6.7.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   504  100   504    0     0   6631      0 --:--:-- --:--:-- --:--:--  6631


In [8]:
%%bash
pip install elasticsearch -q

In [9]:
import pandas as pd
import numpy as np
import pprint
from elasticsearch import Elasticsearch
from elasticsearch import helpers

In [10]:
es = Elasticsearch()

### Create an index

In [11]:
es.indices.create(index="test")

{'acknowledged': True, 'index': 'test', 'shards_acknowledged': True}

In [12]:
es.indices.exists(index="test")

True

### Delete an index

In [13]:
es.indices.delete(index="test")

{'acknowledged': True}

In [14]:
es.indices.exists(index="test")

False

### Insert and get data

In [15]:
doc_1 = {"city": "Paris", "country": "France"}
doc_2 = {"city": "Rome", "country": "Italy"}
doc_3 = {"city": "Berlin", "country": "Germany"}
doc_4 = {"city": "Ottawa", "country": "Canada"}
doc_5 = {"city": "Madrid", "country": "Spain"}

In [16]:
for number in range(1,6):
    record_data = locals().get("doc_" + str(number))
    es.index(index="cities", doc_type="geography", id=number, body=record_data)

  This is separate from the ipykernel package so we can avoid doing imports until


In [17]:
record_data_2 = es.get(index="cities", doc_type="geography", id=2)
pprint.pprint(record_data_2)

{'_id': '2',
 '_index': 'cities',
 '_primary_term': 1,
 '_seq_no': 1,
 '_source': {'city': 'Rome', 'country': 'Italy'},
 '_type': 'geography',
 '_version': 1,
 'found': True}




In [18]:
doc_6_7 = [{"city": "Tokyo", "country": "Japan"},{"city": "Moscow", "country": "Russia"}]

In [19]:
es_data = []

In [20]:
for id, source in zip (range(6,8),doc_6_7):
    es_data.append({
    "_index": "cities",
    "_type": "geography",
    "_id": id,
    "_source": source
  })

In [21]:
es_data

[{'_id': 6,
  '_index': 'cities',
  '_source': {'city': 'Tokyo', 'country': 'Japan'},
  '_type': 'geography'},
 {'_id': 7,
  '_index': 'cities',
  '_source': {'city': 'Moscow', 'country': 'Russia'},
  '_type': 'geography'}]

In [22]:
helpers.bulk(es, es_data)



(2, [])

In [23]:
record_data_7 = es.get(index="cities", doc_type="geography", id=7)
pprint.pprint(record_data_7)

{'_id': '7',
 '_index': 'cities',
 '_primary_term': 1,
 '_seq_no': 6,
 '_source': {'city': 'Moscow', 'country': 'Russia'},
 '_type': 'geography',
 '_version': 1,
 'found': True}




### Search query and matching documents

In [24]:
doc_8 = {"text":"This is the first test text!"}
doc_9 = {"text":"This is the second test text!"}
doc_10 = {"text":"Hi, Elasticsearch! This is the third test text!"}

In [25]:
es.index(index="test", doc_type="messages", id=1, body=doc_8)
es.index(index="test", doc_type="messages", id=2, body=doc_9)
es.index(index="test", doc_type="messages", id=3, body=doc_10)

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until


{'_id': '3',
 '_index': 'test',
 '_primary_term': 1,
 '_seq_no': 2,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_type': 'messages',
 '_version': 1,
 'result': 'created'}

In [26]:
es.get(index="test", doc_type="messages",id=3)



{'_id': '3',
 '_index': 'test',
 '_primary_term': 1,
 '_seq_no': 2,
 '_source': {'text': 'Hi, Elasticsearch! This is the third test text!'},
 '_type': 'messages',
 '_version': 1,
 'found': True}

In [27]:
body = {
    "from":1,
    "size":3,
    "query": {
        "match": {
            "text":"This"
        }
    }
}

In [28]:
res = es.search(index="test", body=body)
pprint.pprint(res)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 57}


  """Entry point for launching an IPython kernel.


### Combining queries

In [29]:
body = {
    "from":0,
    "size":3,
    "query": {
        "bool": {
            "must": {
                "match": {
                    "text":"test text!"
                }
            },
            "should": {
                "match": {
                    "text": "Hi, Elasticsearch!"
                }
            }
        }
    }
}

In [30]:
res = es.search(index="test", body=body)
pprint.pprint(res)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 16}


  """Entry point for launching an IPython kernel.


In [31]:
body = {
    "from":0,
    "size":3,
    "query": {
        "bool": {
            "must": {
                "match": {
                    "text":"test text!"
                }
            },
            "must_not": {
                "match": {
                    "text": "Hi, Elasticsearch!"
                }
            }
        }
    }
}

In [32]:
res = es.search(index="test", body=body)
pprint.pprint(res)

  """Entry point for launching an IPython kernel.


{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 9}


In [33]:
body = {
    "from":0,
    "size":3,
    "query": {
        "bool": {
            "must": {
                "match": {
                    "text":"test text!"
                }
            },
            "filter": {
                "match": {
                    "text": "Hi, Elasticsearch!"
                }
            }
        }
    }
}

In [34]:
res = es.search(index="test", body=body)
pprint.pprint(res)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 2}


  """Entry point for launching an IPython kernel.


In [35]:
body = {
    "from":0,
    "size":3,
    "query": {
        "match": {
           "text": {"query":"Hi This",
                      "operator":"and"}
        }
    }
}

In [36]:
res = es.search(index="test", body=body)
pprint.pprint(res)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 1}


  """Entry point for launching an IPython kernel.


### Regular Expressions Queries
### Error!

In [37]:
body = {
    "from":0,
    "size":3,
    "query": {
        "regexp": {
            "text":{"value":"This is the (first|second) test text!",
                    "flags":"ALL"}
        }
    }
}

In [38]:
res = es.search(index="test", body=body)
pprint.pprint(res)

  """Entry point for launching an IPython kernel.


{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 34}
