# Zwarte Piet tweets dataset
Read and index with ElasticSearch the local json dataset. Various trimming and cleaning is performed to minimize the index size. After indexing, we perform some simple queries using the elasticsearch-dsl library. The script assumes that the local json datasets are stored in a folder named "Data".

In [1]:
import re
import numpy as np
import pandas as pd
from datetime import datetime
from elasticsearch import Elasticsearch
import json
import os

es = Elasticsearch()


First we define which fields from the original dataset json files would be included in the ES database. The original set contains more than 1000 fields per item/tweet; most of which are not interesting to us. Each tweet comes with a user field as well, containing the info of the user the posted the tweet. The user field shoule also be truncated.

In [2]:
include_keys = ["created_at","id","text","user","retweet_count",
                "favorite_count","lang","activist","zp"]
include_user_keys  = ["id","location","description","followers_count",
                      "friends_count","listed_count","favourites_count","verified","lang","statuses_count","sentiment"]

We now proceed with indexing the "filtered" dataset. First, we read the local json files.  This might take time to complete especially if all datasets are used. As such, for demo purposes we set NUM (num of json files) set to 1 and TWEETSPERDATA is set to 100. For the final version, all the data should be used.

In [15]:
""" Filtered """
TWEETSPERDATA = 500
NUM = 3

def readJsonFolder(folder):
    """ 
        Function to read jsons from a folder. Each json 
        contains multiple items. All items are concatenated
        into a single output json file.
    """
    print('Reading folder %s'%folder)
    LOC = folder
    data = []
    for file in os.listdir(LOC)[:NUM]:
        print(file)
        with open(LOC+"/"+file) as json_file: 
            try:
                data += json.load(json_file)
            except:
                print("%s skipped"%file)
    return data

act_zp = readJsonFolder('Data/with_status_count/activist/zp')
act_nzp = readJsonFolder('Data/with_status_count/activist/non_zp')
ver_zp = readJsonFolder('Data/with_status_count/verified/zp')
ver_nzp = readJsonFolder('Data/with_status_count/verified/non_zp')
print("Done!")

Reading folder Data/with_status_count/activist/zp
zp_filter_wstatusactivist_piet90.json
zp_filter_wstatusactivist_piet80.json
zp_filter_wstatusactivist_piet40.json
Reading folder Data/with_status_count/activist/non_zp
zp_filter_wstatusactivist_no_piet30.json
zp_filter_wstatusactivist_no_piet10.json
zp_filter_wstatusactivist_no_piet110.json
Reading folder Data/with_status_count/verified/zp
zp_filter_wstatusverified_piet40.json
zp_filter_wstatusverified_piet60.json
zp_filter_wstatusverified_piet20.json
Reading folder Data/with_status_count/verified/non_zp
zp_filter_wstatusverified_no_piet10.json
zp_filter_wstatusverified_no_piet30.json
zp_filter_wstatusverified_no_piet70.json
Done!


The tweets need to be enriched with expert sentiment annotations. We read the annotations from the csv files.

In [4]:
import pandas as pd

annotations_act = pd.read_csv("Data/zpiet-activist-sentiment.csv", sep = ";",  header='infer')
annotations_act.head()

annotations_ver = pd.read_csv("Data/zpiet-sentiment-verified.csv", sep = ";",  header='infer')
annotations_ver.head()


Unnamed: 0,All verified accounts,Catarina,Thirza,Combined,Unnamed: 4
0,SaskiaBelleman,Neutral,neutral,Neutral,
1,umarebru,Pro,irrelevant,Pro,
2,OmropFryslan,Neutral,irrelevant,Neutral,
3,AlexanderNL,Con,irrelevant,Con,
4,YarnoRitzen,Con,con,Con,


The indexing part in ES is tricky. Throwing everything in ES without taking care of certain fields' particular properties will not allow us to answer certain research questions (e.g. wordclouds). So here's what we do:
- The **"text"** field is the most important. When queried, we want it to be tokenized and treated as a collection of words (instead of the typical __keyword__ type). We also want to remove any Dutch stopwords. To achieve these, we define our own **analyzer** when creating the index. And we assign the **text** to be of text type. Its "fielddata" is set to true so that it's indexable and aggregable.
- We incorporate the folder names e.g. activist, non_zp as additional fields in the json files. If a json file comes from the activist folder then its activist field is set to true. If it comes from the verified folder, the its activist field is set to false.

In [16]:
# Deletion of the index if it already exists
es.indices.delete(index="zpiet-with_status_count-tweets-index", ignore=[400, 404])

# Create index with settings; for more check https://github.com/elastic/elasticsearch-py/blob/master/example/load.py
index_body = {
  "settings": {
    "analysis": {
      "analyzer": {
        "my_dutch_analyzer": {
          "type": "standard",         
          "stopwords": "_dutch_"
        }
      }
    },
      "index.mapping.total_fields.limit":2000
  },
 "mappings":{
    "tweet":{
        "properties": {
          "text": {"type": "text","analyzer": "my_dutch_analyzer","fielddata":"true"} # fielddata is needed for term search, wordclouds
         }  
    }
}
}

print("Indexing...")
es.indices.create(index="zpiet-with_status_count-tweets-index",body=index_body)
i=0
for dataset,act,zp in [(act_zp,True,True), (act_nzp,True,False), (ver_zp,False,True),(ver_nzp, False,False)]:
    print("Activist: %d, ZP: %d, size: %d"%(act,zp,len(dataset)))
    for doc in dataset[:min(len(dataset),TWEETSPERDATA)]:
        doc.update({"activist":act})
        doc.update({"zp":zp})
        
        # Delete unwanted keys
        to_delete = []
        for key in doc.keys():
            if key not in include_keys: to_delete.append(key)
        for key in to_delete: doc.pop(key, None)
        
        # Enrich with sentiment for the user
        name = doc["user"]["screen_name"]
        #print(doc["user"])
        #break
        combined_sentiment = "unknown"
        sent = annotations_act.loc[annotations_act['random sample of non-verified accounts'] == name]["Combined"].values
        if len(sent)>0: combined_sentiment = sent[0]
        sent = annotations_ver.loc[annotations_ver['All verified accounts'] == name]["Combined"].values
        if len(sent)>0: combined_sentiment = sent[0]
        
        doc["user"].update({"sentiment":combined_sentiment})
            
        # Delete unwanted keys in user key
        to_delete = []
        for key in doc["user"].keys():
            if key not in include_user_keys: to_delete.append(key)
        for key in to_delete: doc["user"].pop(key, None)
        
        # Index
        try:
            res = es.index(index="zpiet-with_status_count-tweets-index", doc_type='tweet', id=i, body=doc)
        except:
            print("Document not indexed probably due to sentiment value: ",combined_sentiment)
        i+=1
print("Done!")

Indexing...
Activist: 1, ZP: 1, size: 3111
Activist: 1, ZP: 0, size: 30392
Activist: 0, ZP: 1, size: 391
nan
nan
nan
nan
nan
nan
nan
nan
Activist: 0, ZP: 0, size: 33218
nan
Done!


# ElasticSearch quering
Some simple elastic search quering for demo purposes.

In [17]:
""" Query combination """
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import MultiMatch, Match
from elasticsearch_dsl import Q

def exportJson(j, outfile):
    with open(outfile, "w") as data_file:
        json.dump(j, data_file, indent=2)

m1 = Q("match", text='piet') & Q("match", activist="true") & Q("match", zp="true")

s = Search().using(es).query(m1)[0:20]   # Filter out stuff e.g. negative sentiment .exclude("match", sentiment="negative")
response = s.execute()
exportJson(response.to_dict(),"res.json")
print()
print('Query combination results:')
for hit in response: print(hit.meta.score, hit.text, hit.user.location)


Query combination results:
3.8175476 Anti-zwarte piet. Niemand is tegen een piet zonder de racistische blackface schmink https://t.co/kWIL73tQ4Q Amsterdam, NL
3.594859 RT @rzuaslan: Niet anti-piet demonstrant @Politie_Rdam, maar anti-zwarte piet demonstrant. https://t.co/OKmNF3AtLR Rotterdam
3.5371408 RT @Co0ontje: Piet! Piet! https://t.co/fJAWVnhzPY Nederlands Neanderthalië
3.4777672 @OekelSjef @Burgemeester17 Sinterklaas op een roetveeg piet! Racisme van zebra's! Nederland
3.4526117 @Ruijgrok020 @rickdus Als een Zwarte Piet die wel werkt! Nederland
3.398516 Foto’s! Zwarte Piet is nog altijd ZWART in Leeuwarden https://t.co/6J7DPS90nd Nederland
3.398516 RT @nol_123: zo zijn er nog anti piet figuren over de vangrail geknikkerd #dTV 
3.398516 RT @EWdeVlieger: Zwarte Piet is cynisme https://t.co/yZxpZkGcHi 
3.371166 @annefleurdd Je bedoelt ‘JIJ VINDT dat zwarte piet racisme is. 
3.3660238 @jndkgrf Het ultieme bewijs dat de Anti-Piet zooi kan oprotten !!
en een beetje snel graag.
OPROTTE

In [57]:
""" This is the typical ES quering. It would be easier with es-dsl."""
print()
body = {
    "query" : {
        "match" : {"text" : "piet"}
    },
    "aggregations" : {
        "my_sample" : {
            "sampler" : {
                "shard_size" : 100000
            },
            "aggregations": {
                "keywords" : {
                    "significant_text" : { "field" : "text" }
                }
            }
        }
    }
}

res = es.search(index="zpiet-with_status_count-tweets-index", body = body, size=80)
with open("res.json", "w") as data_file:
    json.dump(res, data_file, indent=2)

print("Got %d Hits:" % res['hits']['total']['value'])
print("Got %d Buckets:" % len(res['aggregations']['my_sample']["keywords"]["buckets"]))
for i,hit in enumerate(res['aggregations']['my_sample']["keywords"]["buckets"]):
    #print("%(timestamp)s %(author)s: %(text)s" % hit["_source"])
    print(hit)


Got 279 Hits:
Got 10 Buckets:
{'key': 'piet', 'doc_count': 279, 'score': 3.301075268817204, 'bg_count': 279}
{'key': 'zwarte', 'doc_count': 248, 'score': 2.5464304884594733, 'bg_count': 276}
{'key': 'kick', 'doc_count': 79, 'score': 0.9046422741133754, 'bg_count': 81}
{'key': 'out', 'doc_count': 79, 'score': 0.8487459731262144, 'bg_count': 85}
{'key': 'anti', 'doc_count': 37, 'score': 0.2980888042579733, 'bg_count': 49}
{'key': 'rt', 'doc_count': 171, 'score': 0.2376259006930674, 'bg_count': 530}
{'key': 'bus', 'doc_count': 16, 'score': 0.1747997615047869, 'bg_count': 17}
{'key': 'politie', 'doc_count': 28, 'score': 0.15143690343135366, 'bg_count': 48}
{'key': 'demonstranten', 'doc_count': 15, 'score': 0.1502723878313032, 'bg_count': 17}
{'key': 'demonstreren', 'doc_count': 13, 'score': 0.13949864834360154, 'bg_count': 14}


In [45]:
""" Query combination and Aggregations with DSL"""
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import MultiMatch, Match
from elasticsearch_dsl import Q
from elasticsearch_dsl import A

def exportJson(j, outfile):
    with open(outfile, "w") as data_file:
        json.dump(j, data_file, indent=2)

m1 = Q("multi_match",fields = ['text'] , query='zwarte piet', type="phrase" ) & Q("match", activist="false") & Q("match", zp="true")
a = A('significant_text', field='text')

s = Search() #.aggs.bucket("keywords", a).using(es).query(m1)[0:20]
s = s.using(es).query(m1)[0:3000] 
s.aggs.bucket("my_sample", a)
print(s.to_dict())
  # Filter out stuff e.g. negative sentiment .exclude("match", sentiment="negative")
res = s.execute()
exportJson(res.to_dict(),"res.json")

print("Got %d Hits:" % res['hits']['total'])
print("Got %d Buckets:" % len(res['aggregations']['my_sample']["buckets"]))
for i,hit in enumerate(res['aggregations']['my_sample']["buckets"]):
    #print("%(timestamp)s %(author)s: %(text)s" % hit["_source"])
    print(hit)

{'query': {'bool': {'must': [{'multi_match': {'fields': ['text'], 'query': 'zwarte piet', 'type': 'phrase'}}, {'match': {'activist': 'false'}}, {'match': {'zp': 'true'}}]}}, 'aggs': {'my_sample': {'significant_text': {'field': 'text'}}}, 'from': 0, 'size': 3000}
Got 116 Hits:
Got 10 Buckets:
{'key': 'zwarte', 'doc_count': 116, 'score': 28.325757575757...}
{'key': 'piet', 'doc_count': 116, 'score': 27.32439024390244...}
{'key': 'demonstreren', 'doc_count': 8, 'score': 5.454458977...}
{'key': 'kick', 'doc_count': 18, 'score': 5.437294887039239,...}
{'key': 'out', 'doc_count': 18, 'score': 5.023038049940548, ...}
{'key': 't.co', 'doc_count': 47, 'score': 4.747380250345469,...}
{'key': 'https', 'doc_count': 47, 'score': 4.572489428847294...}
{'key': 'tijdens', 'doc_count': 5, 'score': 4.27207193816884...}
{'key': 'anti', 'doc_count': 14, 'score': 4.0050390069893576...}
{'key': '18', 'doc_count': 5, 'score': 3.5528760404280626, '...}
