In [1]:
import re
import numpy as np
import pandas as pd
from datetime import datetime
from elasticsearch import Elasticsearch
import json
import os

# Shared client used by the query cells below. No arguments are passed, so the
# library's default connection settings apply.
es = Elasticsearch()


# Demo Elasticsearch querying
Some simple Elasticsearch queries for demo purposes.

In [33]:
""" Query combination """
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import MultiMatch, Match
from elasticsearch_dsl import Q

def exportJson(j, outfile):
    """Serialize ``j`` as JSON (two-space indent) into the file ``outfile``.

    The target is opened in write mode, so any existing content is replaced.
    """
    with open(outfile, mode="w") as sink:
        json.dump(j, sink, indent=2)

# Combine three match clauses with `&`: the tweet text must match "piet" and
# the `activist` and `zp` fields must both match "true".
m1 = (
    Q("match", text='piet')
    & Q("match", activist="true")
    & Q("match", zp="true")
)

# Build the search step by step and keep only the first 20 hits.
# Results could be filtered further, e.g. dropping negative sentiment with
# .exclude("match", sentiment="negative").
s = Search().using(es)
s = s.query(m1)
s = s[0:20]

response = s.execute()
exportJson(response.to_dict(), "res.json")

print()
print('Query combination results:')
for hit in response:
    print(hit.meta.score, hit.text, hit.user.location, hit.user.isNews)


Query combination results:
4.0077915 @albertheijn Liever zwarte Piet  False
4.000101 RT @Co0ontje: Piet! Piet! https://t.co/fJAWVnhzPY Nederlands Neanderthalië False
3.9649076 @LavieJanRoos Jan, Piet, Joris en Corneel  False
3.9649076 @Isdatnouwelzo @nrc Ze willen ook geen Piet met een migratieachtergrond.  False
3.8890626 Piet. Ouwe viezerik. #FirstDates  False
3.8537278 ZWARTE PIET WIE KENT HEM NIET ! https://t.co/970VQdsrvL Amsterdam, Nederland False
3.8293254 RT @trouw: Gesloopte Piet, wie ziet hem niet? Een nieuw middel in de strijd van activisten tegen Zwarte Piet is opgedoken: het fijnknijpen… Weert, Nederland False
3.8166769 Zwarte Piet is morgen weer vertrokken. Het racisme blijft. Zwarte Piet en institutioneel racisme onlosmakelijk met… https://t.co/ciJna1AFdP Rotterdam False
3.7520223 @frgroenendijk Als Zwarte Piet lacht dan lacht ie ook echt. The Netherlands False
3.723523 RT @Marssieboy: Zwarte Piet krenkt tegenwoordig. Vroeger gaf hij snoep en pepernoten, nu geeft hij no

In [57]:
""" This is the typical ES quering. It would be easier with es-dsl."""
print()
# Raw request body: match tweets whose text mentions "piet", then run a
# significant_text aggregation on `text` nested inside a sampler aggregation.
body = {
    "query" : {
        "match" : {"text" : "piet"}
    },
    "aggregations" : {
        "my_sample" : {
            "sampler" : {
                "shard_size" : 100000
            },
            "aggregations": {
                "keywords" : {
                    "significant_text" : { "field" : "text" }
                }
            }
        }
    }
}

res = es.search(index="zpiet-with_status_count-tweets-index", body=body, size=80)
# Reuse the notebook's JSON export helper instead of repeating the open/dump pair.
exportJson(res, "res.json")

# `hits.total` is a plain integer in some server responses and an object with
# a `value` field in others; accept both so the cell does not depend on the
# server version.
total = res['hits']['total']
total_hits = total['value'] if isinstance(total, dict) else total
buckets = res['aggregations']['my_sample']["keywords"]["buckets"]

print("Got %d Hits:" % total_hits)
print("Got %d Buckets:" % len(buckets))
for bucket in buckets:
    print(bucket)


Got 279 Hits:
Got 10 Buckets:
{'key': 'piet', 'doc_count': 279, 'score': 3.301075268817204, 'bg_count': 279}
{'key': 'zwarte', 'doc_count': 248, 'score': 2.5464304884594733, 'bg_count': 276}
{'key': 'kick', 'doc_count': 79, 'score': 0.9046422741133754, 'bg_count': 81}
{'key': 'out', 'doc_count': 79, 'score': 0.8487459731262144, 'bg_count': 85}
{'key': 'anti', 'doc_count': 37, 'score': 0.2980888042579733, 'bg_count': 49}
{'key': 'rt', 'doc_count': 171, 'score': 0.2376259006930674, 'bg_count': 530}
{'key': 'bus', 'doc_count': 16, 'score': 0.1747997615047869, 'bg_count': 17}
{'key': 'politie', 'doc_count': 28, 'score': 0.15143690343135366, 'bg_count': 48}
{'key': 'demonstranten', 'doc_count': 15, 'score': 0.1502723878313032, 'bg_count': 17}
{'key': 'demonstreren', 'doc_count': 13, 'score': 0.13949864834360154, 'bg_count': 14}


In [45]:
""" Query combination and Aggregations with DSL"""
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import MultiMatch, Match
from elasticsearch_dsl import Q
from elasticsearch_dsl import A

def exportJson(j, outfile):
    """Dump ``j`` to the path ``outfile`` as JSON with a two-space indent.

    The file is opened in write mode, so previous content is overwritten.
    """
    # NOTE(review): an identical helper is defined in an earlier cell of this
    # notebook; this definition rebinds the same name.
    with open(outfile, mode="w") as sink:
        json.dump(j, sink, indent=2)

# Phrase query on the tweet text combined with two field clauses; `&` joins
# them so that every clause must match.
m1 = (
    Q("multi_match", fields=['text'], query='zwarte piet', type="phrase")
    & Q("match", activist="false")
    & Q("match", zp="true")
)
# significant_text aggregation over the `text` field of the matching tweets.
a = A('significant_text', field='text')

# Request up to 3000 hits and attach the aggregation under the name "my_sample".
s = Search().using(es).query(m1)[0:3000]
s.aggs.bucket("my_sample", a)
print(s.to_dict())  # show the request body that will be sent

# Results could be filtered further, e.g. dropping negative sentiment with
# .exclude("match", sentiment="negative").
res = s.execute()
exportJson(res.to_dict(), "res.json")

# `hits.total` is a plain integer in some server responses and an object with
# a `value` field in others; formatting the object with %d raises TypeError,
# so normalise it to an integer first.
total = res['hits']['total']
total_hits = total if isinstance(total, int) else total['value']
buckets = res['aggregations']['my_sample']["buckets"]

print("Got %d Hits:" % total_hits)
print("Got %d Buckets:" % len(buckets))
for bucket in buckets:
    print(bucket)

{'query': {'bool': {'must': [{'multi_match': {'fields': ['text'], 'query': 'zwarte piet', 'type': 'phrase'}}, {'match': {'activist': 'false'}}, {'match': {'zp': 'true'}}]}}, 'aggs': {'my_sample': {'significant_text': {'field': 'text'}}}, 'from': 0, 'size': 3000}
Got 116 Hits:
Got 10 Buckets:
{'key': 'zwarte', 'doc_count': 116, 'score': 28.325757575757...}
{'key': 'piet', 'doc_count': 116, 'score': 27.32439024390244...}
{'key': 'demonstreren', 'doc_count': 8, 'score': 5.454458977...}
{'key': 'kick', 'doc_count': 18, 'score': 5.437294887039239,...}
{'key': 'out', 'doc_count': 18, 'score': 5.023038049940548, ...}
{'key': 't.co', 'doc_count': 47, 'score': 4.747380250345469,...}
{'key': 'https', 'doc_count': 47, 'score': 4.572489428847294...}
{'key': 'tijdens', 'doc_count': 5, 'score': 4.27207193816884...}
{'key': 'anti', 'doc_count': 14, 'score': 4.0050390069893576...}
{'key': '18', 'doc_count': 5, 'score': 3.5528760404280626, '...}
