## Deep Dive: Elasticsearch

July, 2020

In [90]:
from elasticsearch import Elasticsearch

In [91]:
# Create a client for the Elasticsearch node at localhost:9200.
es = Elasticsearch([{'host':'localhost','port':9200}])
# Bare expression so the notebook displays the client's repr.
es

<Elasticsearch([{'host': 'localhost', 'port': 9200}])>

### Info of the cluster

In [92]:
# Fetch basic node/cluster metadata (node name, cluster name, version details).
es.info()

{'name': '221475e93fee',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'SC2Rm6JKQPmqYm_M6EQ3Mg',
 'version': {'number': '7.7.1',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': 'ad56dce891c901a492bb1ee393f12dfff473a423',
  'build_date': '2020-05-28T16:30:01.040088Z',
  'build_snapshot': False,
  'lucene_version': '8.5.1',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

* Indices in the cluster

In [95]:
# List the indices in the cluster; print() renders the cat API's text rows line by line.
print(es.cat.indices())

yellow open bill_meta mg70Dj99TbaTPDWiOfEb0A 1 1  671583 0  2.2gb  2.2gb
yellow open bill_text WynB9PhVQySS6KkLz7fc4Q 1 1 1102418 0 12.1gb 12.1gb



* Health of the cluster

In [47]:
# Print the cat API's one-line cluster health summary.
print(es.cat.health())

1593612108 14:01:48 docker-cluster yellow 1 1 2 2 0 0 2 0 - 50.0%



* Docs in the cluster

In [48]:
# Count documents across all indices (no `index` argument is given).
es.count()

{'count': 1774001,
 '_shards': {'total': 2, 'successful': 2, 'skipped': 0, 'failed': 0}}

In [49]:
# Count documents in the `bill_text` index only.
es.count(index="bill_text")

{'count': 1102418,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}

In [50]:
# Count documents in the `bill_meta` index only.
es.count(index="bill_meta")

{'count': 671583,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}

### Add docs in index

In [94]:
# Sample sponsor record; it is indexed into the `demo` index in the next step.
sponsor = {
    "people_id": 12515,
    "person_hash": "z27rb385",
    "party_id": 2,
    "party": "R",
    "role_id": 2,
    "role": "Sen",
    "name": "Gerald Dial",
    "first_name": "Gerald",
    "middle_name": "",
    "last_name": "Dial",
    "suffix": "",
    "nickname": "",
    "district": "SD-013",
    "ftm_eid": 6381436,
    "votesmart_id": 5598,
    "opensecrets_id": "",
    "ballotpedia": "Gerald_Dial",
    "sponsor_type_id": 1,
    "sponsor_order": 1,
    "committee_sponsor": 0,
    "committee_id": "0",
}

In [96]:
# Index the `sponsor` dict into the `demo` index under document id 1.
es.index(index="demo", id=1, body=sponsor)

{'_index': 'demo',
 '_type': '_doc',
 '_id': '1',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

In [97]:
# List the indices again; `demo` should now be present.
print(es.cat.indices())

yellow open bill_meta mg70Dj99TbaTPDWiOfEb0A 1 1  671583 0  2.2gb  2.2gb
yellow open demo      -R1gna0xQkKDQBzhBopAmg 1 1       1 0   12kb   12kb
yellow open bill_text WynB9PhVQySS6KkLz7fc4Q 1 1 1102418 0 12.1gb 12.1gb



### Deleting

* Specific document

In [98]:
# Delete the document with id '1' from the `demo` index.
es.delete(index='demo', id='1')

{'_index': 'demo',
 '_type': '_doc',
 '_id': '1',
 '_version': 2,
 'result': 'deleted',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 1,
 '_primary_term': 1}

In [99]:
# List the indices again to inspect `demo` after the single-document delete.
print(es.cat.indices())

yellow open bill_meta mg70Dj99TbaTPDWiOfEb0A 1 1  671583 0  2.2gb  2.2gb
yellow open demo      -R1gna0xQkKDQBzhBopAmg 1 1       0 2 19.5kb 19.5kb
yellow open bill_text WynB9PhVQySS6KkLz7fc4Q 1 1 1102418 0 12.1gb 12.1gb



* All documents in an index

In [100]:
# Delete every document in `demo`: the `match_all` query selects all documents in the index.
es.delete_by_query(index=['demo'], body={"query": {"match_all": {}}})

{'took': 1,
 'timed_out': False,
 'total': 0,
 'deleted': 0,
 'batches': 0,
 'version_conflicts': 0,
 'noops': 0,
 'retries': {'bulk': 0, 'search': 0},
 'throttled_millis': 0,
 'requests_per_second': -1.0,
 'throttled_until_millis': 0,
 'failures': []}

In [101]:
# List the indices again to inspect `demo` after the delete-by-query call.
print(es.cat.indices())

yellow open bill_meta mg70Dj99TbaTPDWiOfEb0A 1 1  671583 0  2.2gb  2.2gb
yellow open demo      -R1gna0xQkKDQBzhBopAmg 1 1       0 2 19.5kb 19.5kb
yellow open bill_text WynB9PhVQySS6KkLz7fc4Q 1 1 1102418 0 12.1gb 12.1gb



* Deleting the index

In [102]:
# Remove the `demo` index itself (not just its documents).
es.indices.delete(index=["demo"])

{'acknowledged': True}

In [103]:
# List the indices again; `demo` should no longer appear.
print(es.cat.indices())

yellow open bill_meta mg70Dj99TbaTPDWiOfEb0A 1 1  671583 0  2.2gb  2.2gb
yellow open bill_text WynB9PhVQySS6KkLz7fc4Q 1 1 1102418 0 12.1gb 12.1gb



### Searching

In [104]:
def print_matches(res, index, max_results=10):
    """Print a short summary of an Elasticsearch search response.

    Prints the total hit count and the maximum score, followed by one line
    per returned hit with its score, bill id and a text field.

    Parameters
    ----------
    res : dict
        Search response; must contain ``res['hits']`` with the keys
        ``'total'``, ``'max_score'`` and ``'hits'``.
    index : str
        ``"text"`` prints each hit's ``_source['doc']``; any other value
        prints ``_source['title']``.
    max_results : int or None, optional
        Maximum number of hits to print (default 10, as before). ``None``
        prints every hit contained in the response.

    Raises
    ------
    KeyError
        If a hit lacks ``bill_id`` or the selected text field.
    """
    hits_section = res['hits']

    total = hits_section['total']
    # The total is normally an object ({'value': n, ...}); accept a bare
    # integer as well so responses using the integer form do not crash.
    matches = total['value'] if isinstance(total, dict) else total
    max_score = hits_section['max_score']
    print("Total hits: {}\nMax score: {}\n".format(matches, max_score))

    text_field = 'doc' if index == "text" else 'title'

    # Iterate over the hits actually present in the response. The reported
    # total can exceed the number of returned documents (e.g. a query with a
    # small `size`, or `size: 0`), so indexing by the total raised IndexError.
    for hit in hits_section['hits'][:max_results]:
        source = hit['_source']
        print("Score: {}, Bill_ID: {}, Related_text: {}".format(
            hit['_score'], source['bill_id'], source[text_field]))

In [105]:
def search_es(es_object, index_name, search_query):
    """Run `search_query` against `index_name` via `es_object.search` and return its response."""
    response = es_object.search(body=search_query, index=index_name)
    return response

#### Keywords

Example JSON document from the `bill_text` index:

In [55]:
aux = {
    "doc_id": 904957,
    "date": "2013-07-12",
    "type": "Introduced",
    "type_id": 1,
    "mime": "application/pdf",
    "mime_id": 2,
    "url": "https://legiscan.com/AL/text/SB4/id/904957",
    "state_link": "http://alisondb.legislature.state.al.us/ALISON/SearchableInstruments/2014RS/PrintFiles/SB4-int.pdf",
    "text_size": 29778,
    "bill_id": 560565,
    "doc": "1 SB4 2 144381-2 3 By Senator Dial 4 RFD: Finance and Taxation General Fund  5 First Read: 14-JAN-14  6 PFD: 06/07/2013   Page 0 1 144381-2:n:06/07/2013:LLR/tan LRS2012-4050R1 2   3   4   5   6   7   8 SYNOPSIS:          This bill would provide that all 9 settlements or orders entering judgment in a class 10 action lawsuit filed in Alabama and governed by 11 Alabama law that results in the creation of a 12 common fund for the benefit of a class shall 13 establish a plan for determining the distribution 14 of the residual remaining funds to the Department 15 of Child Abuse and Neglect Prevention and the 16 timing of such distribution. 17   18 A BILL 19 TO BE ENTITLED 20 AN ACT 21   22 Relating to the distribution of excess funds created 23 pursuant to a class action lawsuit filed in Alabama; to 24 provide that all settlements or orders entering judgment in a 25 class action lawsuit filed in Alabama and governed by Alabama 26 law that result in the creation of a common fund for the 27 benefit of a class shall establish a plan for determining the Page 1 1 distribution of the residual remaining funds to the Department 2 of Child Abuse and Neglect Prevention and the timing of such 3 distribution. 4 BE IT ENACTED BY THE LEGISLATURE OF ALABAMA: 5 Section 1. In any class action governed exclusively 6 by Alabama law, any order of a circuit court entering a full 7 and final judgment pursuant to Ala. R. Civ. p. 54 as to all 8 claims and issues, whether by settlement or other 9 adjudication, that results in the creation of a common fund 10 for the benefit of the class, shall establish a specific plan 11 for the distribution of any residual funds, under which any 12 funds remaining after payment of all benefits to or for the 13 benefit of class members shall be paid to the Department of 14 Child Abuse and Neglect Prevention. 15 Section 2. 
For the purpose of this act, residual 16 funds from a class action common fund are those funds that 17 remain undistributed due to circumstances in which the members 18 of the class cannot be located, funds for which the direct 19 distribution to individual class members is not economically 20 feasible, funds which remain after all class members are given 21 a full opportunity to make a claim, or funds payable to or 22 otherwise for the benefit of class members which for any 23 reason remain undistributed after the time period established 24 by the circuit court presiding over the action. All residual 25 funds shall be distributed to the Department of Child Abuse 26 and Neglect Prevention consistent with a specific plan as 27 provided in Section 1. Page 2 1 Section 3. Nothing in this act is intended to be nor 2 shall be construed so as to limit the rights of parties to a 3 class action to contract in settlement for the reversion of 4 residual funds to the paying party or to one or more persons 5 or entities designated by the circuit court or a class member 6 as a beneficiary or assignee of the rights of a class member. 7 Section 4. Notwithstanding the obligations imposed 8 by Section 1, in the event the circuit court presiding over a 9 class action finds that, except with regard to the obligations 10 imposed by Section 1, parties to a class action have reached a 11 settlement on behalf of or for the benefit of class member, 12 the circuit court may intervene and attempt to assist the 13 parties' efforts in reaching a resolution or settlement of the 14 class action. In the event the circuit court undertakes to 15 perform duties under this section, the circuit court, in its 16 discretion, and upon determination by the circuit court that 17 the obligations imposed by Section 1 are a substantial 18 impediment to settlement, may suspend the requirements of 19 Section 1. 20 Section 5. This act shall apply to all cases filed 21 on or after January 1, 2015. 22 Section 6. 
This act shall become effective 23 immediately following its passage and approval by the 24 Governor, or its otherwise becoming law. Page 3"
}

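By default Elasticsearch stops counting at 10,000 hits, so a `Total hits` value of 10000 below is a lower bound rather than an exact count (set `track_total_hits` to `true` in the query to get the exact number).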

In [106]:
# Full-text `match` query for "health" on the `doc` field of `bill_text`.
search_query = {
    'query': {
        'match': {'doc': 'health'},
    },
}
res = search_es(es, "bill_text", search_query)
print_matches(res, "text")

Total hits: 10000
Max score: 2.946524

Score: 2.946524, Bill_ID: 240130, Related_text: REFERENCE TITLE: insurance; mental health coverage;
  parity
State of Arizona
House of Representatives
Fiftieth Legislature
First Regular Session
2011
HB 2128
Introduced by
Representative
  Patterson
AN ACT
Amending section
20‑2322, Arizona Revised Statutes; relating to accountable health plans.
(TEXT OF BILL BEGINS ON NEXT PAGE)
Be it enacted by the Legislature of the State of Arizona:
Section 1.  Section 20-2322, Arizona Revised
Statutes, is amended to read:
START_STATUTE20-2322.  Mental health services and benefits;
definitions
A.  Beginning on January 1, 1998, any health benefits
plan that is offered by an accountable health plan and that provides services
or health benefits that include mental health services or mental health
benefits shall comply with this section.
B.  If the health benefits plan does not include an
aggregate lifetime limit on substantially all health services or healt

In [23]:
# `match` query on bill titles in `bill_meta`.
search_query = {
    'query': {
        'match': {'title': 'reproductive rights'},
    },
}
res = search_es(es, "bill_meta", search_query)
print_matches(res, "meta")

Total hits: 6109
Max score: 19.12119

Score: 19.12119, Bill_ID: 386628, Related_text: REPRODUCTIVE RIGHTS AWARENESS
Score: 18.350048, Bill_ID: 683068, Related_text: Women's reproductive rights; support
Score: 18.350048, Bill_ID: 403703, Related_text: Reproductive Rights Awareness Week
Score: 17.452572, Bill_ID: 1132662, Related_text: Reproductive Rights are Human Rights Act of 2018
Score: 12.65897, Bill_ID: 972588, Related_text: Reproductive Issues
Score: 12.126989, Bill_ID: 737480, Related_text: Reproductive FACT Act.
Score: 12.126989, Bill_ID: 789713, Related_text: Women's reproductive health.
Score: 12.126989, Bill_ID: 834665, Related_text: Reproductive Health Amendments
Score: 12.126989, Bill_ID: 961575, Related_text: Discrimination: reproductive health.
Score: 12.126989, Bill_ID: 1075988, Related_text: Reproductive Education Amendments


In [32]:
# `match` query on the `sponsors.name` field of `bill_meta`.
search_query = {
    'query': {
        'match': {'sponsors.name': 'david'},
    },
}
res = search_es(es, 'bill_meta', search_query)
print_matches(res, "meta")

Total hits: 10000
Max score: 3.825032

Score: 3.825032, Bill_ID: 503270, Related_text: To adequately compensate career prosecutors
Score: 3.825032, Bill_ID: 450876, Related_text: Expands the products which may be purchased for public use by including renewable energy resources; increases the cost premium percentage for renewable energy generated in New York.
Score: 3.825032, Bill_ID: 580934, Related_text: Includes "electronic cigarettes" within the definition of smoking.
Score: 3.825032, Bill_ID: 751075, Related_text: To adequately compensate career prosecutors
Score: 3.825032, Bill_ID: 666234, Related_text: Cities and towns; increasing timing and number of notices prior to hearing on urban renewal plan; effective date.
Score: 3.825032, Bill_ID: 693992, Related_text: Cities and towns; authorizing collection of unpaid solid waste accounts. Effective date.
Score: 3.825032, Bill_ID: 625048, Related_text: Imposes criminal penalties for discharges of untreated sewage from containment device

#### Fuzzy

In [33]:
# Same sponsor-name search, with `fuzziness: AUTO` added to the match clause.
search_query = {
    'query': {
        'match': {
            'sponsors.name': {'query': 'dave', 'fuzziness': 'AUTO'},
        },
    },
}
res = search_es(es, 'bill_meta', search_query)
print_matches(res, "meta")

Total hits: 10000
Max score: 10.588934

Score: 10.588934, Bill_ID: 1050036, Related_text: VETERANS MEMORIAL OVERPASS
Score: 10.588934, Bill_ID: 1007811, Related_text: VETERANS MEMORIAL OVERPASS
Score: 10.588934, Bill_ID: 1052013, Related_text: TIF-CITY OF MARION
Score: 10.588934, Bill_ID: 1213438, Related_text: State general tax modified.
Score: 9.632697, Bill_ID: 1000229, Related_text: Dial, Charles ''Ron"; condolences
Score: 9.632697, Bill_ID: 1091590, Related_text: LANDOWNER GRANT PROGRAM
Score: 9.632697, Bill_ID: 1107925, Related_text: Compensation to businesses provided for loss of business opportunity from sale and closure of biomass energy plant, account created, and money transferred.
Score: 9.632697, Bill_ID: 1105498, Related_text: Deputy registrar reimbursement funding provided, and money appropriated.
Score: 9.632697, Bill_ID: 1105507, Related_text: Motor vehicle services governed, and powers of deputy registrars relating to handling transactions broadened.
Score: 9.632697, 

#### Phrase search

In [35]:
# `match_phrase` query for "reproductive rights" on the `doc` field of `bill_text`.
search_query = {
    'query': {
        'match_phrase': {'doc': 'reproductive rights'},
    },
}
res = search_es(es, 'bill_text', search_query)
print_matches(res, "text")

Total hits: 56
Max score: 16.469149

Score: 16.469149, Bill_ID: 495050, Related_text: As Introduced 130th General AssemblyRegular Session2013-2014S. C. R. No. 3
Senator Tavares 
Cosponsors: 
Senators Cafaro, Brown, Turner, Gentile, Skindell, Smith 
A CONCURRENT RESOLUTION To recognize the week of January 20-26, 2013, as 1
Reproductive Rights Awareness Week to encourage 2
public awareness, conversation, and support for 3
reproductive rights and justice.4
 BE IT RESOLVED BY THE SENATE OF THE STATE OF OHIO(THE HOUSE OF REPRESENTATIVES CONCURRING):        WHEREAS, Women comprise more than half of the population of 5
the United States of America and are solely responsible for 6
childbearing; and7        WHEREAS, Women who plan their pregnancies are more likely to 8
seek prenatal care, improving their own health and the health of 9
their children; and10        WHEREAS, According to the Center for Women Policy Studies' 11
2011 National Strategic Action Convening for State Legislators on 12
Re

### Highlighting

In [36]:
# `match` query on `doc` plus a `highlight` section requesting plain-type
# highlighting for the same field.
search_query = {
    'query': {'match': {'doc': 'reproductive rights'}},
    'highlight': {'fields': {'doc': {'type': 'plain'}}},
}

In [37]:
# Run the highlight query against `bill_text`; the response is kept in `res` for inspection.
res = search_es(es, "bill_text", search_query)

In [39]:
# Show the highlight fragments attached to the first hit of the response.
res['hits']['hits'][0]['highlight']

{'doc': [" threat to women's <em>reproductive</em> <em>rights</em>; and\nWhereas,\r\nThe protections affirmed by the Supreme Court in",
  " judges\r\nwho are likely to impose limits on women's <em>rights</em> and their access to\r\n<em>reproductive</em> health",
  ' the <em>rights</em> of women to access the full panoply of\r\n<em>reproductive</em> services, including abortion.\n4',
  ' <em>reproductive</em> <em>rights</em> and access to\r\nabortion, and it respectfully urges the Congress and President of',
  ' <em>reproductive</em>\r\n<em>rights</em> and health of women in the State and nation.']}

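The default `<em>…</em>` highlight tags can easily be changed to something else, e.g. `<mark>…</mark>` (via the `pre_tags`/`post_tags` highlight options), so that the fragments above render like this: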

threat to women's <mark>reproductive</mark> <mark>rights</mark>; and\nWhereas,\r\nThe protections affirmed by the Supreme Court in",
  " judges\r\nwho are likely to impose limits on women's <mark>rights</mark> and their access to\r\n<mark>reproductive</mark> health",
  ' the <mark>rights</mark> of women to access the full panoply of\r\n<mark>reproductive</mark> services, including abortion.\n4',
  ' <mark>reproductive</mark> <mark>rights</mark> and access to\r\nabortion, and it respectfully urges the Congress and President of',
  ' <mark>reproductive</mark>\r\n<mark>rights</mark> and health of women in the State and nation.'

### Aggregations

In [57]:
from elasticsearch_dsl import Search, A

In [87]:
def print_aggs(res):
    """Print one "State: ..., Docs: ..." line per bucket of the `group_by_state` aggregation in `res`."""
    buckets = res['aggregations']['group_by_state']['buckets']
    for bucket in buckets:
        # Each bucket carries the grouped value under 'key' and its size under 'doc_count'.
        state, count = bucket['key'], bucket['doc_count']
        print("State: {}, Docs: {}".format(state, count))

In [88]:
# Terms aggregation named `group_by_state` over the `state.keyword` field;
# `size` is 0 because only the aggregation buckets are read afterwards.
search_query = {
    "size": 0,
    "aggs": {
        "group_by_state": {
            "terms": {"field": "state.keyword"},
        },
    },
}

In [89]:
# Run the aggregation query against `bill_meta` and print one line per returned bucket.
res = search_es(es, 'bill_meta', search_query)
print_aggs(res)

State: NY, Docs: 56579
State: IL, Docs: 40657
State: TX, Docs: 35245
State: NJ, Docs: 30451
State: HI, Docs: 28448
State: MN, Docs: 26646
State: US, Docs: 24778
State: OK, Docs: 24372
State: MA, Docs: 21972
State: PA, Docs: 20305
