## Boolean Model and Elasticsearch

Run this notebook in Google Colaboratory.

### Import libraries

In [None]:
import json
import os
import time
import urllib.request

### Download Elasticsearch


In [None]:
%%bash

# Fetch the Elasticsearch OSS 7.10.2 tarball and its SHA-512 checksum file.
base_url="https://artifacts.elastic.co/downloads/elasticsearch"
archive="elasticsearch-oss-7.10.2-linux-x86_64.tar.gz"

for file in "${archive}" "${archive}.sha512"; do
  wget --quiet "${base_url}/${file}"
done

In [None]:
%%bash

# Unpack the gzip-compressed tarball into the current directory.
tar --extract --gzip --file=elasticsearch-oss-7.10.2-linux-x86_64.tar.gz

In [None]:
%%bash

# Hand the extracted tree to the `daemon` user, which is the account the
# server is launched under later in this notebook.
sudo chown --recursive daemon:daemon elasticsearch-7.10.2

In [None]:
%%bash

# Verify the downloaded tarball against its published SHA-512 checksum.
# `-c` makes shasum read the expected digest and file name from the .sha512
# file and compare them with the tarball on disk; without `-c` the command
# only prints the hash of the checksum file itself and verifies nothing.
shasum -a 512 -c elasticsearch-oss-7.10.2-linux-x86_64.tar.gz.sha512

d5d1a4189312b751759a826b093d4b3c16d5a2f9723af16d19269271ea7c297a8b7ddbb439cde85bfa7f988201363cc824bd2622d813092283e9894dc9e1c82a  elasticsearch-oss-7.10.2-linux-x86_64.tar.gz.sha512


### Menjalankan elasticsearch

Jalankan elasticsearch di background dengan menambahkan opsi `--bg` pada cell magic `%%bash`.

In [None]:
%%bash --bg

# Launch the Elasticsearch server as the unprivileged `daemon` user.
# The cell runs in the background because of the `--bg` option above.
sudo --set-home --user=daemon elasticsearch-7.10.2/bin/elasticsearch

Starting job # 0 in a separate thread.


In [None]:
# Wait until Elasticsearch actually answers on its HTTP port before using it.
# Startup time varies between runtimes, so poll instead of sleeping a fixed
# 20 seconds (which may be too short, or needlessly long).

ES_URL = "http://localhost:9200"
ES_STARTUP_TIMEOUT_S = 120

deadline = time.monotonic() + ES_STARTUP_TIMEOUT_S
while True:
  try:
    # Any successful HTTP response means the node is accepting requests.
    with urllib.request.urlopen(ES_URL, timeout=5):
      break
  except OSError:
    # Connection refused or an HTTP error while the node is still booting
    # (urllib.error.URLError is a subclass of OSError).
    if time.monotonic() >= deadline:
      raise TimeoutError(
          f"Elasticsearch did not respond at {ES_URL} "
          f"within {ES_STARTUP_TIMEOUT_S} seconds"
      )
    time.sleep(1)

In [None]:
# Cek bahwa elasticsearchnya sudah jalan

!ps -ef | grep elasticsearch

root         128     126  0 04:36 ?        00:00:00 sudo -H -u daemon elasticsearch-7.10.2/bin/elasticsearch
daemon       129     128 33 04:36 ?        00:00:20 /content/elasticsearch-7.10.2/jdk/bin/java -Xshare:auto -Des.networkaddress.cache.ttl=60 -Des.networkaddress.cache.negative.ttl=10 -XX:+AlwaysPreTouch -Xss1m -Djava.awt.headless=true -Dfile.encoding=UTF-8 -Djna.nosys=true -XX:-OmitStackTraceInFastThrow -XX:+ShowCodeDetailsInExceptionMessages -Dio.netty.noUnsafe=true -Dio.netty.noKeySetOptimization=true -Dio.netty.recycler.maxCapacityPerThread=0 -Dio.netty.allocator.numDirectArenas=0 -Dlog4j.shutdownHookEnabled=false -Dlog4j2.disable.jmx=true -Djava.locale.providers=SPI,COMPAT -Xms1g -Xmx1g -XX:+UseG1GC -XX:G1ReservePercent=25 -XX:InitiatingHeapOccupancyPercent=30 -Djava.io.tmpdir=/tmp/elasticsearch-6494169591416233246 -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=data -XX:ErrorFile=logs/hs_err_pid%p.log -Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filec

In [None]:
%%bash

# Smoke test: request the root endpoint of the local node.
curl --request GET "localhost:9200"

{
  "name" : "95629034e8bb",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "Na_L5P81RN-lUitfK05pFA",
  "version" : {
    "number" : "7.10.2",
    "build_flavor" : "oss",
    "build_type" : "tar",
    "build_hash" : "747e1cc71def077253878a59143c1f785afa92b9",
    "build_date" : "2021-01-13T00:42:12.435326Z",
    "build_snapshot" : false,
    "lucene_version" : "8.7.0",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   534  100   534    0     0   6592      0 --:--:-- --:--:-- --:--:--  6512100   534  100   534    0     0   6592      0 --:--:-- --:--:-- --:--:--  6512


### Membuat index `myindex` di elasticsearch

In [None]:
%%bash

# Create the index `myindex`.
curl --request PUT "localhost:9200/myindex"

{"acknowledged":true,"shards_acknowledged":true,"index":"myindex"}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100    66  100    66    0     0     78      0 --:--:-- --:--:-- --:--:--    78


In [None]:
%%bash

# Confirm that the index really exists by fetching its metadata.
curl --request GET "localhost:9200/myindex?pretty"

{
  "myindex" : {
    "aliases" : { },
    "mappings" : { },
    "settings" : {
      "index" : {
        "creation_date" : "1637555894125",
        "number_of_shards" : "1",
        "number_of_replicas" : "1",
        "uuid" : "-Bm0-OqEQ-mbMZtjB4bnAg",
        "version" : {
          "created" : "7100299"
        },
        "provided_name" : "myindex"
      }
    }
  }
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   375  100   375    0     0  37500      0 --:--:-- --:--:-- --:--:-- 37500


### Membuat dokumen baru

In [None]:
%%bash

# Add a new document with id 1 to `myindex`.
curl --request PUT 'localhost:9200/myindex/article/1?pretty' \
  --header "Content-Type: application/json" \
  --data '{"text": "Saya suka makan nasi goreng."}'

{
  "_index" : "myindex",
  "_type" : "article",
  "_id" : "1",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 2,
    "successful" : 1,
    "failed" : 0
  },
  "_seq_no" : 0,
  "_primary_term" : 1
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   264  100   224  100    40   1287    229 --:--:-- --:--:-- --:--:--  1517


In [None]:
%%bash

# Add a second document (id 2).
curl --request PUT "localhost:9200/myindex/article/2?pretty" \
  --header "Content-Type: application/json" \
  --data '{"text": "Saya tidak suka makan nasi padang"}'

{
  "_index" : "myindex",
  "_type" : "article",
  "_id" : "2",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 2,
    "successful" : 1,
    "failed" : 0
  },
  "_seq_no" : 1,
  "_primary_term" : 1
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   269  100   224  100    45  11789   2368 --:--:-- --:--:-- --:--:-- 14157


Skor relevansi yang dihasilkan dihitung menggunakan formula BM25.

Parameterisasi BM25 bisa diubah-ubah --> https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules-similarity.html

### Search dalam elasticsearch menggunakan `query`

Dokumentasi lengkap tentang variasi query = https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html


In [None]:
%%bash

# Match query: search for documents whose text contains "nasi" or "goreng".
curl --request GET 'localhost:9200/myindex/article/_search?pretty' \
  --header 'Content-Type: application/json' \
  --data '{"query": {"match": {"text": "nasi goreng"}}}'

{
  "took" : 1146,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 2,
      "relation" : "eq"
    },
    "max_score" : 0.9092851,
    "hits" : [
      {
        "_index" : "myindex",
        "_type" : "article",
        "_id" : "1",
        "_score" : 0.9092851,
        "_source" : {
          "text" : "Saya suka makan nasi goreng."
        }
      },
      {
        "_index" : "myindex",
        "_type" : "article",
        "_id" : "2",
        "_score" : 0.17578414,
        "_source" : {
          "text" : "Saya tidak suka makan nasi padang"
        }
      }
    ]
  }
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   741  100   696  100    45    597     38  0:00:01  0:00:01 --:--:--   636100   741  100   696  100    45    597     38  0:00:01  0:00:01 --:--:--   636


### Fuzzy Query

In [None]:
%%bash

# Fuzzy match query: the query terms are deliberately misspelled
# ("nasii", "gorenk") and "fuzziness": "AUTO" lets them still match.
curl --request GET 'localhost:9200/myindex/article/_search?pretty' \
  --header 'Content-Type: application/json' \
  --data '
{
  "query": {
    "match": {
      "text": {
        "query": "nasii gorenk",
        "fuzziness": "AUTO"
      }
    }
  }
}
'

{
  "took" : 38,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 2,
      "relation" : "eq"
    },
    "max_score" : 0.7419573,
    "hits" : [
      {
        "_index" : "myindex",
        "_type" : "article",
        "_id" : "1",
        "_score" : 0.7419573,
        "_source" : {
          "text" : "Saya suka makan nasi goreng."
        }
      },
      {
        "_index" : "myindex",
        "_type" : "article",
        "_id" : "2",
        "_score" : 0.13183811,
        "_source" : {
          "text" : "Saya tidak suka makan nasi padang"
        }
      }
    ]
  }
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   822  100   694  100   128  14765   2723 --:--:-- --:--:-- --:--:-- 17489


### Prefix Query

In [None]:
%%bash

# Prefix query: match terms in `text` that start with "go".
curl --request GET 'localhost:9200/myindex/article/_search?pretty' \
  --header 'Content-Type: application/json' \
  --data '
{
  "query": {
    "prefix": {
      "text": {
        "value": "go"
      }
    }
  }
}
'

{
  "took" : 13,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "myindex",
        "_type" : "article",
        "_id" : "1",
        "_score" : 1.0,
        "_source" : {
          "text" : "Saya suka makan nasi goreng."
        }
      }
    ]
  }
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   557  100   467  100    90  17296   3333 --:--:-- --:--:-- --:--:-- 20629


### Wildcard dan Regexp Query

In [None]:
%%bash

# Wildcard query: match terms in `text` against the pattern "go*ng".
curl --request GET 'localhost:9200/myindex/article/_search?pretty' \
  --header 'Content-Type: application/json' \
  --data '
{
  "query": {
    "wildcard": {
      "text": {
        "value": "go*ng"
      }
    }
  }
}
'

{
  "took" : 4,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "myindex",
        "_type" : "article",
        "_id" : "1",
        "_score" : 1.0,
        "_source" : {
          "text" : "Saya suka makan nasi goreng."
        }
      }
    ]
  }
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   561  100   466  100    95  42363   8636 --:--:-- --:--:-- --:--:-- 51000


In [None]:
%%bash

# Regexp query (not a wildcard query): match terms in `text` against the
# regular expression "go.*ng".
curl --request GET 'localhost:9200/myindex/article/_search?pretty' \
  --header 'Content-Type: application/json' \
  --data '
{
  "query": {
    "regexp": {
      "text": {
        "value": "go.*ng"
      }
    }
  }
}
'

{
  "took" : 8,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "myindex",
        "_type" : "article",
        "_id" : "1",
        "_score" : 1.0,
        "_source" : {
          "text" : "Saya suka makan nasi goreng."
        }
      }
    ]
  }
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   560  100   466  100    94  16642   3357 --:--:-- --:--:-- --:--:-- 20000


### Boolean Query

In [None]:
%%bash

# Boolean AND: both "nasi" and "goreng" must be present (`must` clauses).
curl --request GET 'localhost:9200/myindex/article/_search?pretty' \
  --header 'Content-Type: application/json' \
  --data '
{
  "query": {
    "bool": {
      "must": [
        {"term": {"text": "nasi"}},
        {"term": {"text": "goreng"}}
      ]
    }
  }
}
'

{
  "took" : 4,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 0.9092851,
    "hits" : [
      {
        "_index" : "myindex",
        "_type" : "article",
        "_id" : "1",
        "_score" : 0.9092851,
        "_source" : {
          "text" : "Saya suka makan nasi goreng."
        }
      }
    ]
  }
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   617  100   478  100   139  34142   9928 --:--:-- --:--:-- --:--:-- 44071


In [None]:
%%bash

# Boolean OR: "nasi" or "goreng" (`should` clauses only).
curl --request GET 'localhost:9200/myindex/article/_search?pretty' \
  --header 'Content-Type: application/json' \
  --data '
{
  "query": {
    "bool": {
      "should": [
        {"term": {"text": "nasi"}},
        {"term": {"text": "goreng"}}
      ]
    }
  }
}
'

{
  "took" : 3,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 2,
      "relation" : "eq"
    },
    "max_score" : 0.9092851,
    "hits" : [
      {
        "_index" : "myindex",
        "_type" : "article",
        "_id" : "1",
        "_score" : 0.9092851,
        "_source" : {
          "text" : "Saya suka makan nasi goreng."
        }
      },
      {
        "_index" : "myindex",
        "_type" : "article",
        "_id" : "2",
        "_score" : 0.17578414,
        "_source" : {
          "text" : "Saya tidak suka makan nasi padang"
        }
      }
    ]
  }
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   834  100   693  100   141  77000  15666 --:--:-- --:--:-- --:--:-- 92666


In [None]:
%%bash

# Boolean NOT: exclude documents containing "tidak" (`must_not` clause).
curl --request GET 'localhost:9200/myindex/article/_search?pretty' \
  --header 'Content-Type: application/json' \
  --data '
{
  "query": {
    "bool": {
      "must_not": [
        {"term": {"text": "tidak"}}
      ]
    }
  }
}
'

{
  "took" : 6,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 0.0,
    "hits" : [
      {
        "_index" : "myindex",
        "_type" : "article",
        "_id" : "1",
        "_score" : 0.0,
        "_source" : {
          "text" : "Saya suka makan nasi goreng."
        }
      }
    ]
  }
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   572  100   466  100   106  42363   9636 --:--:-- --:--:-- --:--:-- 52000


In [None]:
%%bash

# nasi AND goreng AND NOT tidak, with "suka" as an optional `should` clause.
# Because `must` clauses are present, the `should` clause does not act as an
# OR branch: it does not widen the result set, it only raises the score of
# matching documents that also contain "suka".
curl --request GET 'localhost:9200/myindex/article/_search?pretty' \
  --header 'Content-Type: application/json' \
  --data '
{
  "query": {
    "bool": {
      "must": [
        {"term": {"text": "nasi"}},
        {"term": {"text": "goreng"}}
      ],
      "must_not": [
        {"term": {"text": "tidak"}}
      ],
      "should": [
        {"term": {"text": "suka"}}
      ]
    }
  }
}
'

{
  "took" : 4,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 1.0986491,
    "hits" : [
      {
        "_index" : "myindex",
        "_type" : "article",
        "_id" : "1",
        "_score" : 1.0986491,
        "_source" : {
          "text" : "Saya suka makan nasi goreng."
        }
      }
    ]
  }
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   744  100   478  100   266  39833  22166 --:--:-- --:--:-- --:--:-- 62000


### Elasticsearch di Python

In [None]:
!pip3 install elasticsearch

Collecting elasticsearch
  Downloading elasticsearch-7.15.2-py2.py3-none-any.whl (379 kB)
[?25l[K     |▉                               | 10 kB 22.3 MB/s eta 0:00:01[K     |█▊                              | 20 kB 27.0 MB/s eta 0:00:01[K     |██▋                             | 30 kB 20.0 MB/s eta 0:00:01[K     |███▌                            | 40 kB 16.3 MB/s eta 0:00:01[K     |████▎                           | 51 kB 5.4 MB/s eta 0:00:01[K     |█████▏                          | 61 kB 5.8 MB/s eta 0:00:01[K     |██████                          | 71 kB 5.3 MB/s eta 0:00:01[K     |███████                         | 81 kB 5.9 MB/s eta 0:00:01[K     |███████▉                        | 92 kB 5.9 MB/s eta 0:00:01[K     |████████▋                       | 102 kB 5.2 MB/s eta 0:00:01[K     |█████████▌                      | 112 kB 5.2 MB/s eta 0:00:01[K     |██████████▍                     | 122 kB 5.2 MB/s eta 0:00:01[K     |███████████▎                    | 133 kB 5.2 MB

In [None]:
!pip3 install --upgrade inelastic

Collecting inelastic
  Downloading inelastic-0.2.4.tar.gz (9.4 kB)
Collecting elasticsearch==7.0.2
  Downloading elasticsearch-7.0.2-py2.py3-none-any.whl (83 kB)
[K     |████████████████████████████████| 83 kB 1.4 MB/s 
[?25hCollecting elasticsearch6==6.4.2
  Downloading elasticsearch6-6.4.2-py2.py3-none-any.whl (74 kB)
[K     |████████████████████████████████| 74 kB 3.2 MB/s 
[?25hCollecting tqdm==4.32.2
  Downloading tqdm-4.32.2-py2.py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 6.7 MB/s 
Building wheels for collected packages: inelastic
  Building wheel for inelastic (setup.py) ... [?25l[?25hdone
  Created wheel for inelastic: filename=inelastic-0.2.4-py3-none-any.whl size=8820 sha256=77db3d146c73364ad3c3b2117ce89a18add04d8bae97866fff641db32fd810f1
  Stored in directory: /root/.cache/pip/wheels/1f/d6/a3/b23eecda1f08c92a872c79b1f197503687ab94939fbb2821dd
Successfully built inelastic
Installing collected packages: tqdm, elasticsearch6, elasticsearch, i

### Inelastic

In [None]:
%%bash

# Dump the inverted index Elasticsearch built for the `text` field of
# `myindex` (term, frequency, document count, matching document ids) and
# align the comma-separated output into columns.
inelastic -i myindex -f text | column -s , -t

term    freq  doc_count  d0  d1
goreng  1     1          1
makan   2     2          1   2
nasi    2     2          1   2
padang  1     1          2
saya    2     2          1   2
suka    2     2          1   2
tidak   1     1          2


### Penggunaan elasticsearch di python

In [None]:
from elasticsearch import Elasticsearch

In [None]:
# Create the client (no arguments, so the client's default host is used)
# and index a third document into `myindex`.
es = Elasticsearch()

document = '{"text": "nasi rames di seberang rasanya enak."}'
result = es.index(
    index="myindex",
    doc_type="article",
    id=3,
    body=document,
)
print(result["result"])

created


In [None]:
# Match query on the `text` field, sent through the Python client.
query = """ 
  {
    "query": {
      "match": {
        "text": "nasi goreng"
      }
    }
  }
"""

result = es.search(body=query, index="myindex")

# Pretty-print the raw response.
formatted = json.dumps(result, indent=4)
print(formatted)

{
    "took": 715,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 3,
            "relation": "eq"
        },
        "max_score": 1.1707046,
        "hits": [
            {
                "_index": "myindex",
                "_type": "article",
                "_id": "1",
                "_score": 1.1707046,
                "_source": {
                    "text": "Saya suka makan nasi goreng."
                }
            },
            {
                "_index": "myindex",
                "_type": "article",
                "_id": "2",
                "_score": 0.13039356,
                "_source": {
                    "text": "Saya tidak suka makan nasi padang"
                }
            },
            {
                "_index": "myindex",
                "_type": "article",
                "_id": "3",
                "_score": 0.1303

### Latihan

- download dataset news headlines berbahasa Inggris: https://www.kaggle.com/therohk/india-headlines-news-dataset
- baca data menggunakan pandas
- masukkan 100 dokumen dari kolom `headline_text` ke dalam suatu index baru di dalam elasticsearch
- cek tf-idf table
- coba 5 jenis query untuk di-pose pada elasticsearch


In [None]:
# Download datasets

!gdown https://drive.google.com/uc?id=1fuX5lKbfOwFOqQfOlnga7GSvtM2lcjd0

Downloading...
From: https://drive.google.com/uc?id=1fuX5lKbfOwFOqQfOlnga7GSvtM2lcjd0
To: /content/india-news-headlines-1000.csv
  0% 0.00/57.5k [00:00<?, ?B/s]100% 57.5k/57.5k [00:00<00:00, 26.6MB/s]


In [None]:
import pandas as pd
from tqdm import tqdm

In [None]:
# Load the downloaded headlines CSV and preview the first rows

english_df = pd.read_csv("india-news-headlines-1000.csv")
english_df.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic


In [None]:
%%bash

# Create a new index named en_headlines.
# Note: the %%bash cell magic has to be the very first line of the cell;
# placing a comment above it stops IPython from treating it as a cell magic.

curl -X PUT "localhost:9200/en_headlines"

{"acknowledged":true,"shards_acknowledged":true,"index":"en_headlines"}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100    71  100    71    0     0    270      0 --:--:-- --:--:-- --:--:--   269


In [None]:
# Iteratively index each headline into en_headlines, using the DataFrame
# index as the document id.
# The body is serialized with json.dumps instead of string concatenation so
# that headlines containing double quotes or backslashes still produce valid
# JSON. `total` lets tqdm show a real progress bar for the row iterator.

for index, row in tqdm(english_df.iterrows(), total=len(english_df)):
  headline_text = json.dumps({"text": row['headline_text']})
  es.index(index="en_headlines", doc_type="headline", id=index, body=headline_text)

1000it [00:16, 58.87it/s]


In [None]:
%%bash

# Dump the inverted index Elasticsearch built for the `text` field of
# en_headlines and align the comma-separated output into columns.
inelastic -i en_headlines -f text | column -s , -t

term             freq  doc_count  d0   d1   d2   d3   d4   d5   d6   d7   d8   d9   d10  d11  d12  d13  d14  d15  d16  d17  d18  d19  d20  d21  d22  d23  d24  d25  d26  d27  d28  d29  d30  d31  d32  d33  d34  d35  d36  d37  d38  d39  d40  d41  d42  d43  d44  d45  d46  d47  d48  d49  d50  d51  d52  d53  d54  d55  d56  d57  d58  d59  d60  d61  d62  d63  d64  d65  d66  d67  d68  d69  d70  d71  d72  d73  d74  d75  d76  d77  d78  d79  d80  d81  d82  d83  d84  d85  d86  d87  d88  d89  d90  d91  d92  d93  d94  d95  d96  d97  d98  d99  d100  d101  d102  d103  d104  d105  d106  d107  d108  d109  d110  d111  d112  d113  d114  d115  d116  d117  d118  d119  d120  d121  d122  d123  d124  d125  d126  d127  d128  d129  d130  d131  d132  d133  d134  d135  d136  d137  d138  d139  d140  d141  d142  d143  d144  d145  d146  d147  d148  d149  d150  d151  d152  d153  d154  d155  d156  d157  d158  d159  d160  d161  d162  d163  d164  d165  d166  d167  d168  d169  d170  d171  d172  d173  d174  d175  d176  d177

### Latihan

Lakukan query:
- match "mumbai"
- match frase "freedom fighter"
- boolean query "bomb AND bank"
- boolean query "minister AND NOT prime"
- boolean query "worker OR union"
- fuzzy query "polo" 

In [None]:
# Match query for 'mumbai' on en_headlines

query = """
  {"query": {"match": {"text": "mumbai"}}}
"""

result = es.search(body=query, index="en_headlines")

formatted = json.dumps(result, indent=4)
print(formatted)

{
    "took": 8,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 6,
            "relation": "eq"
        },
        "max_score": 5.1298814,
        "hits": [
            {
                "_index": "en_headlines",
                "_type": "headline",
                "_id": "90",
                "_score": 5.1298814,
                "_source": {
                    "text": "Stephen Hawking keeps date with Mumbai"
                }
            },
            {
                "_index": "en_headlines",
                "_type": "headline",
                "_id": "457",
                "_score": 4.810625,
                "_source": {
                    "text": "Mumbai firm to be sentenced in US"
                }
            },
            {
                "_index": "en_headlines",
                "_type": "headline",
                "_id": "843",
   

In [None]:
# Phrase match: the words must appear together, in order.
# NOTE(review): the query text ends with an apostrophe ("fighters'");
# presumably copied from the headline itself -- confirm it is intended.

query = """
  {
    "query": {
      "match_phrase":{
        "text": "freedom fighters'"
      }
    }
  }
"""

result = es.search(body=query, index="en_headlines")
formatted = json.dumps(result, indent=4)
print(formatted)

{
    "took": 23,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 3,
            "relation": "eq"
        },
        "max_score": 10.563674,
        "hits": [
            {
                "_index": "en_headlines",
                "_type": "headline",
                "_id": "8",
                "_score": 10.563674,
                "_source": {
                    "text": "Move to stop freedom fighters' pension flayed"
                }
            },
            {
                "_index": "en_headlines",
                "_type": "headline",
                "_id": "34",
                "_score": 10.563674,
                "_source": {
                    "text": "Move to stop freedom fighters' pension flayed"
                }
            },
            {
                "_index": "en_headlines",
                "_type": "headline",
              

In [None]:
# Boolean query: bomb AND bank (both terms as `must` clauses)

query = """
  {
    "query": {
      "bool": {
        "must": [
          {"term": {"text": "bomb"}},
          {"term": {"text": "bank"}}
        ]
      }
    }
  }
"""

result = es.search(body=query, index="en_headlines")
formatted = json.dumps(result, indent=4)
print(formatted)

{
    "took": 5,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 1,
            "relation": "eq"
        },
        "max_score": 11.125051,
        "hits": [
            {
                "_index": "en_headlines",
                "_type": "headline",
                "_id": "62",
                "_score": 11.125051,
                "_source": {
                    "text": "Bomb hoax triggers panic in Canara Bank"
                }
            }
        ]
    }
}


In [None]:
# Boolean query: minister AND NOT prime (`must` plus `must_not`)

query = """
  {
    "query": {
      "bool": {
        "must": [
          {"term": {"text": "minister"}}
        ],
        "must_not": [
          {"term": {"text": "prime"}}
        ]
      }
    }
  }
"""

result = es.search(body=query, index="en_headlines")
formatted = json.dumps(result, indent=4)
print(formatted)

{
    "took": 7,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 7,
            "relation": "eq"
        },
        "max_score": 5.3384237,
        "hits": [
            {
                "_index": "en_headlines",
                "_type": "headline",
                "_id": "944",
                "_score": 5.3384237,
                "_source": {
                    "text": "At home with the Minister"
                }
            },
            {
                "_index": "en_headlines",
                "_type": "headline",
                "_id": "256",
                "_score": 4.9841404,
                "_source": {
                    "text": "Civic polls: Minister refutes BJP charges"
                }
            },
            {
                "_index": "en_headlines",
                "_type": "headline",
                "_id": "865",
      

In [None]:
# Boolean query: worker OR union (`should` clauses only)

query = """ 
{
  "query": {
    "bool": {
      "should": [
        {"term": {"text": "worker"}},
        {"term": {"text": "union"}}
      ]
    }
  }
}
"""

result = es.search(body=query, index="en_headlines")
formatted = json.dumps(result, indent=4)
print(formatted)

{
    "took": 2,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 1,
            "relation": "eq"
        },
        "max_score": 6.6232715,
        "hits": [
            {
                "_index": "en_headlines",
                "_type": "headline",
                "_id": "699",
                "_score": 6.6232715,
                "_source": {
                    "text": "Sramik Union supports Baranagar mill violence"
                }
            }
        ]
    }
}


In [None]:
# Fuzzy match query for "polo": "fuzziness": "AUTO" also matches terms
# that are close in spelling.

query = """ 
{
  "query": {
   "match": {
      "text": {
        "query": "polo",
        "fuzziness": "AUTO"
      }
    }
  }
}
"""

result = es.search(body=query, index="en_headlines")
formatted = json.dumps(result, indent=4)
print(formatted)

{
    "took": 6,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 4,
            "relation": "eq"
        },
        "max_score": 5.8471775,
        "hits": [
            {
                "_index": "en_headlines",
                "_type": "headline",
                "_id": "233",
                "_score": 5.8471775,
                "_source": {
                    "text": "Ramgarh beat Royal Kashmir in Hanut Cup polo"
                }
            },
            {
                "_index": "en_headlines",
                "_type": "headline",
                "_id": "847",
                "_score": 4.3853836,
                "_source": {
                    "text": "New car pool portal to solve traffic chaos"
                }
            },
            {
                "_index": "en_headlines",
                "_type": "headline",
                

### Implementasi tokenisasi, penghapusan stop word, dan stemming di elasticsearch

- https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenizers.html
- https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-stop-tokenfilter.html

In [None]:
%%bash

# Create en_headlines_clean with a custom default analyzer:
#   - "letter" tokenizer: splits the text into runs of letters
#   - "lowercase" filter: normalizes case first. The stop filter's word list
#     is lowercase, so without this step capitalized stop words such as "A"
#     stay in the index and terms remain case-sensitive.
#   - "stop" filter: removes stop words
#   - "stemmer" filter: reduces words to their stems
curl -X PUT 'localhost:9200/en_headlines_clean' -H 'Content-Type: application/json' -d '
{
  "settings": {
    "analysis": {
      "analyzer": {
        "default": {
          "tokenizer": "letter",
          "filter": ["lowercase", "stop", "stemmer"]
        }
      }
    }
  }
}'

{"acknowledged":true,"shards_acknowledged":true,"index":"en_headlines_clean"}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   180    0     0  100   180      0    895 --:--:-- --:--:-- --:--:--   891100   257  100    77  100   180    359    841 --:--:-- --:--:-- --:--:--  1195


In [None]:
# Index every English headline into en_headlines_clean, using the DataFrame
# row label as the document id.
# json.dumps builds the request body so that double quotes, backslashes or
# control characters inside a headline are escaped into valid JSON; plain
# string concatenation produced a broken body for such headlines.
for index, row in tqdm(english_df.iterrows()):
  headline_text = json.dumps({"text": row['headline_text']})
  es.index(index="en_headlines_clean", doc_type="headline", id=index, body=headline_text)

1000it [00:10, 95.28it/s]


In [None]:
%%bash

# List the indexed terms of the "text" field in en_headlines_clean and align
# the comma-separated output into a readable table.
inelastic -i en_headlines_clean -f text \
  | column -s , -t

term           freq  doc_count  d0   d1   d2   d3   d4   d5   d6   d7   d8   d9   d10  d11  d12  d13  d14  d15  d16  d17  d18  d19  d20  d21  d22  d23  d24  d25  d26  d27  d28  d29  d30  d31  d32  d33  d34  d35  d36  d37  d38  d39  d40  d41  d42  d43  d44  d45  d46  d47  d48  d49  d50  d51  d52  d53  d54  d55  d56  d57  d58  d59  d60  d61  d62  d63  d64  d65  d66  d67  d68  d69  d70  d71  d72  d73  d74  d75  d76  d77  d78  d79  d80  d81
A              20    20         121  130  131  138  161  167  180  181  188  238  263  342  386  58   734  759  769  782  813  880
ABVP           2     2          564  614
AI             1     1          850
AIDS           3     3          600  70   815
AIIMS          3     3          16   42   954
AP             3     3          103  21   47
ASI            1     1          217
ATR            1     1          907
Aamir          2     2          467  474
Abode          1     1          540
About          1     1          148
Abroad         1 

### Latihan untuk dataset Bahasa Indonesia

In [None]:
!gdown https://drive.google.com/uc?id=1vM6Skl6WDIcACRBgnkphcj3argPeqgnq

Downloading...
From: https://drive.google.com/uc?id=1vM6Skl6WDIcACRBgnkphcj3argPeqgnq
To: /content/indonesian-news-headlines.zip
100% 429M/429M [00:04<00:00, 92.1MB/s]


In [None]:
!unzip indonesian-news-headlines.zip

Archive:  indonesian-news-headlines.zip
  inflating: mq_news.csv             


In [None]:
# Load only the first 1000 rows of the extracted CSV

id_headlines_df = pd.read_csv(
    'mq_news.csv',
    nrows=1000,
)
id_headlines_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,item_id,url,title,content_html,published_at,source
0,0,5af9bdef421aa98c745d28cf,https://www.liputan6.com/news/read/2691369/tip...,Tips Agar Resolusi Tahun Baru Bisa Menjadi Aks...,"<div class=""article-content-body__item-page "" ...",2017-01-01 00:00:00,liputan6.com
1,1,5af9bdeb421aa98c745d28ce,https://www.liputan6.com/news/read/2692804/tra...,Tragedi Mendebarkan di Penghujung 2016,"<div class=""article-content-body__item-page "" ...",2017-01-01 00:01:00,liputan6.com
2,2,5af85c6b421aa922e201b8e2,http://megapolitan.kompas.com/read/2017/01/01/...,"Pantau Tahun Baru, Kapolda Metro Jaya Bonceng ...","<p><strong>JAKARTA, KOMPAS.com</strong> - Kapo...",2017-01-01 00:03:09,kompas.com
3,3,5af9bde7421aa98c745d28cd,https://www.liputan6.com/news/read/2693107/ter...,Terompet dan Kembang Api Bersahutan Sambut Tah...,"<div class=""article-content-body__item-page "" ...",2017-01-01 00:04:21,liputan6.com
4,4,5af9bde4421aa98c745d28cc,https://www.liputan6.com/news/read/2693110/awa...,Awas! Copet Berkeliaran di Ancol Malam Tahun B...,"<div class=""article-content-body__item-page "" ...",2017-01-01 00:05:08,liputan6.com


In [None]:
%%bash

# Create a new index named id_headlines_clean whose default analyzer performs
# tokenisation, stop-word removal and stemming for Indonesian text.
#
# Filter order matters: "lowercase" runs first so that the case-sensitive stop
# filter and the stemmer both receive lowercased tokens.
#
# The stop filter selects its word list through the "stopwords" parameter
# (the stemmer filter is the one configured through "language"); with the wrong
# key the filter falls back to its default English list.
curl -X PUT 'localhost:9200/id_headlines_clean' -H 'Content-Type: application/json' -d '
{
  "settings": {
    "analysis": {
      "analyzer": {
        "default": {
          "tokenizer": "standard",
          "filter": ["lowercase", "stopwords_id", "stemmer_id"]
        }
      },
      "filter": {
        "stopwords_id": {
          "type": "stop",
          "stopwords": "_indonesian_"
        },
        "stemmer_id": {
          "type": "stemmer",
          "language": "indonesian"
        }
      }
    }
  }
}'

{"acknowledged":true,"shards_acknowledged":true,"index":"id_headlines_clean"}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   508  100    77  100   431    329   1841 --:--:-- --:--:-- --:--:--  2170


In [None]:
# Index every Indonesian headline (the "title" column) into id_headlines_clean,
# using the DataFrame row label as the document id.
skipped_rows = []  # (row label, reason) for every row that could not be indexed
for index, row in tqdm(id_headlines_df.iterrows()):
  title = row['title']
  if not isinstance(title, str):
    # e.g. a missing title; there is no text to index for this row.
    skipped_rows.append((index, f"title is not a string: {title!r}"))
    continue
  # json.dumps escapes quotes, backslashes and control characters, so any
  # title yields a valid JSON body (plain string concatenation did not).
  headline_text = json.dumps({"text": title})
  try:
    es.index(index="id_headlines_clean", doc_type="headline", id=index, body=headline_text)
  except Exception as exc:
    # Stay best-effort (one bad row must not abort the loop), but remember
    # the failure instead of silently discarding it.
    skipped_rows.append((index, repr(exc)))

# Report what was skipped so indexing problems are visible in the cell output.
print(f"Skipped {len(skipped_rows)} row(s)")
for skipped_index, reason in skipped_rows:
  print(f"  row {skipped_index}: {reason}")

1000it [00:10, 94.72it/s]


In [None]:
%%bash

# List the indexed terms of the "text" field in id_headlines_clean and align
# the comma-separated output into a readable table.
inelastic -i id_headlines_clean -f text \
  | column -s , -t

term              freq  doc_count  d0   d1   d2   d3   d4   d5   d6   d7   d8   d9   d10  d11  d12  d13  d14  d15  d16  d17  d18  d19  d20  d21  d22  d23  d24  d25  d26  d27  d28  d29  d30  d31  d32  d33  d34  d35  d36  d37  d38  d39  d40  d41  d42  d43  d44  d45  d46  d47  d48  d49  d50  d51  d52  d53  d54  d55  d56  d57  d58  d59  d60  d61  d62  d63  d64  d65  d66  d67  d68  d69  d70  d71  d72  d73  d74  d75  d76  d77  d78  d79  d80  d81  d82  d83  d84  d85  d86  d87  d88  d89  d90  d91  d92  d93  d94  d95  d96  d97  d98  d99  d100  d101  d102  d103  d104  d105  d106  d107  d108  d109  d110  d111  d112  d113  d114  d115  d116  d117  d118  d119  d120  d121  d122  d123  d124  d125  d126  d127  d128  d129  d130  d131  d132  d133  d134  d135  d136  d137  d138  d139  d140  d141  d142  d143  d144  d145  d146  d147  d148  d149  d150  d151  d152  d153  d154  d155  d156  d157  d158  d159  d160  d161  d162  d163  d164  d165  d166  d167  d168  d169  d170  d171  d172  d173  d174  d175  d176  d17

### Latihan Query dari index `id_headlines_clean`

In [None]:
# Query for the Indonesian index:
# - Phrase query "tahun baru"
#   match_phrase analyses the query text and requires the resulting terms to
#   appear next to each other, in this order, in the "text" field.

query = """
  {
    "query": {
      "match_phrase":{
        "text": "tahun baru"
      }
    }
  }
"""

# Run the search and pretty-print the raw JSON response.
result = es.search(index="id_headlines_clean", body=query)
print(json.dumps(result, indent=4))

{
    "took": 17,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 124,
            "relation": "eq"
        },
        "max_score": 4.311252,
        "hits": [
            {
                "_index": "id_headlines_clean",
                "_type": "headline",
                "_id": "182",
                "_score": 4.311252,
                "_source": {
                    "text": "Warga dunia rayakan Tahun Baru 2017"
                }
            },
            {
                "_index": "id_headlines_clean",
                "_type": "headline",
                "_id": "334",
                "_score": 4.311252,
                "_source": {
                    "text": "Ahok Isi Tahun Baru dengan Ibadah"
                }
            },
            {
                "_index": "id_headlines_clean",
                "_type": "headline",
                

In [None]:
# Query for the Indonesian index:
# - Boolean query "kembang AND NOT api"
#   "must" lists the term a document has to contain,
#   "must_not" lists the term that excludes a document.

query = """
  {
    "query": {
      "bool": {
        "must": [
          {"term": {"text": "kembang"}}
        ],
        "must_not": [
          {"term": {"text": "api"}}
        ]
      }
    }
  }
"""

# Send the query to the cleaned Indonesian index, then pretty-print the response.
result = es.search(body=query, index="id_headlines_clean")
pretty_result = json.dumps(result, indent=4)
print(pretty_result)

{
    "took": 11,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 1,
            "relation": "eq"
        },
        "max_score": 3.3981433,
        "hits": [
            {
                "_index": "id_headlines_clean",
                "_type": "headline",
                "_id": "800",
                "_score": 3.3981433,
                "_source": {
                    "text": "Malaysia Kembangkan Pembatas Jalan Sistem Rol, untuk Minimalkan Korban Tewas Akibat Laka"
                }
            }
        ]
    }
}


In [None]:
# Query for the Indonesian index:
# - Boolean query "api AND NOT kembang"
#   "must" lists the term a document has to contain,
#   "must_not" lists the term that excludes a document.

query = """
  {
    "query": {
      "bool": {
        "must": [
          {"term": {"text": "api"}}
        ],
        "must_not": [
          {"term": {"text": "kembang"}}
        ]
      }
    }
  }
"""

# Send the query to the cleaned Indonesian index, then pretty-print the response.
result = es.search(body=query, index="id_headlines_clean")
pretty_result = json.dumps(result, indent=4)
print(pretty_result)

{
    "took": 3,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 10,
            "relation": "eq"
        },
        "max_score": 3.8502192,
        "hits": [
            {
                "_index": "id_headlines_clean",
                "_type": "headline",
                "_id": "384",
                "_score": 3.8502192,
                "_source": {
                    "text": "Api Diduga Berasal dari Bagian Mesin Kapal"
                }
            },
            {
                "_index": "id_headlines_clean",
                "_type": "headline",
                "_id": "604",
                "_score": 3.8502192,
                "_source": {
                    "text": "Kepanikan Saat Api Melalap Kapal Zahro Express"
                }
            },
            {
                "_index": "id_headlines_clean",
                "_type": "headli