# Use Elasticsearch with magi_dataset
References:
- [Elasticsearch-python](https://elasticsearch-py.readthedocs.io/en/v8.6.2/#)
- [magi_dataset](https://github.com/Enoch2090/magi_dataset)
## Preparations

In [None]:
# Install dependencies
!pip3 install magi_dataset elasticsearch terminaltables

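Start Elasticsearch with Docker (on first start the container prints the generated password for the `elastic` user; note it down, it is needed below):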
```bash
docker network create elastic
sudo sysctl -w vm.max_map_count=262144  # run this when using WSL2 Docker
docker run --name es01 --net elastic -p 9200:9200 -it docker.elastic.co/elasticsearch/elasticsearch:8.6.2
```

Copy the CA certificate from the container to the current directory:
```bash
docker cp es01:/usr/share/elasticsearch/config/certs/http_ca.crt .
```

Test the connection (curl prompts for the `elastic` user's password):
```bash
curl --cacert http_ca.crt -u elastic https://localhost:9200
```

In [None]:
# Password of the built-in `elastic` user. It is deliberately not stored in the
# notebook: export ELASTIC_PASSWORD before starting Jupyter, or type it at the
# prompt (the input is not echoed and does not end up in the saved notebook).
import os
from getpass import getpass

ELASTIC_PASSWORD = os.environ.get('ELASTIC_PASSWORD') or getpass('Password for the "elastic" user: ')

In [175]:
# Imports & inits
from magi_dataset import *
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from dataclasses import asdict
from tqdm.auto import tqdm
from terminaltables import AsciiTable

# Connect to the local node over HTTPS, verifying it against the CA certificate
# file in the working directory and authenticating as the `elastic` user.
es = Elasticsearch(
    'https://localhost:9200',
    basic_auth=('elastic', ELASTIC_PASSWORD),
    ca_certs='./http_ca.crt',
)
# Last expression of the cell: show the cluster info as the cell output.
es.info().body

ObjectApiResponse({'name': '2abb07847195', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'h5XcXgu-S_6-e9a6XuJKqw', 'version': {'number': '8.6.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '2d58d0f136141f03239816a4e360a8d17b6d8f29', 'build_date': '2023-02-13T09:35:20.314882762Z', 'build_snapshot': False, 'lucene_version': '9.4.2', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [None]:
# Defining helper functions
def upload_to_es(es_instance, data, index:str, batch_size=1000):
    """Upload every item of ``data`` to ``index`` in batches.

    Each item is converted with ``dataclasses.asdict`` and stored under an
    ``_id`` equal to its zero-based position in ``data``.

    Parameters
    ----------
    es_instance
        Client object handed to ``bulk``; its ``indices.refresh`` and
        ``cat.count`` methods are called once the upload is finished.
    data
        Iterable of dataclass instances (wrapped in ``tqdm`` for progress).
    index
        Name of the target index.
    batch_size
        Number of documents collected before each intermediate ``bulk`` call.

    Returns
    -------
    The result of ``es_instance.cat.count(index=index, format='json')``.
    """
    pending = []
    for position, repo in enumerate(tqdm(data), start=1):
        action = {
            '_index': index,
            '_id': position - 1,
            '_source': asdict(repo),
        }
        pending.append(action)
        if position % batch_size != 0:
            continue
        # A full batch has been collected: send it and start a new one.
        bulk(es_instance, pending)
        pending = []
    # Send whatever is left over from the last, partially filled batch.
    bulk(es_instance, pending)
    es_instance.indices.refresh(index=index)
    return es_instance.cat.count(index=index, format='json')

def print_query(response, additonal_fields=()):
    """Print the hits of a search response as an ASCII table.

    Parameters
    ----------
    response
        Search response; only ``response['hits']['hits']`` is read. Every hit
        must provide ``_score`` and a ``_source`` with ``name``, ``lang`` and
        ``link``.
    additonal_fields
        Names of extra top-level keys of each hit (for example ``'highlight'``)
        that are appended as additional columns. The historical spelling of
        the parameter name is kept so existing keyword callers keep working.

    Notes
    -----
    The description column is cut to 100 characters. A missing or ``None``
    description is shown as an empty cell, and so is an extra field that a
    particular hit does not carry, instead of aborting the whole table.
    """
    # Materialise once: the names are needed for the header and for every row.
    extra_fields = list(additonal_fields)
    table_data = [
        ['name', 'lang', 'link', 'description', 'score', *extra_fields]
    ]
    for hit in response['hits']['hits']:
        source = hit['_source']
        # `or ''` guards against a null description, which cannot be sliced.
        description = (source.get('description') or '')[:100]
        table_data.append(
            [
                source['name'],
                source['lang'],
                source['link'],
                description,
                hit['_score'],
                *[hit.get(field, '') for field in extra_fields],
            ]
        )
    table = AsciiTable(table_data)
    print(table.table)
        
def print_sql(response):
    """Print an SQL response as an ASCII table.

    The header row is built from the ``name`` of every entry in
    ``response['columns']``; the rows of ``response['rows']`` follow unchanged.
    """
    header = [column['name'] for column in response['columns']]
    grid = [header]
    grid.extend(response['rows'])
    print(AsciiTable(grid).table)
    

In [25]:
# Optional cleanup, kept disabled: uncomment to delete previously created indices
# before uploading again.
# NOTE(review): the list below omits 'JavaScript', which the upload cell also indexes,
# and deleting a non-existent index is normally reported as 404 rather than 400 --
# confirm the list and the ignored status before enabling this.
# for lang in ['Python', 'C++']:
#     lang_safe = lang.lower().replace('++', 'pp')
#     es.options(ignore_status=400).indices.delete(index=f'{lang_safe}-index')

In [196]:
# GitHubDataset data shape: load the 'rust-latest' data and show how many entries it holds
data = GitHubDataset(file_path='rust-latest', empty=False)
len(data)

1543

In [199]:
# Attributes of the first entry, shown as a plain dict
vars(data[0])

{'name': 'denoland/deno',
 'link': 'https://github.com/denoland/deno',
 'tags': ['deno', 'typescript', 'javascript', 'rust'],
 'stars': 88041,
 'description': 'A modern runtime for JavaScript and TypeScript.',
 'lang': 'Rust',
 'repo_lang': '',
 'readme': '# Deno\n\n[![Build Status - Cirrus][]][Build status] [![Twitter handle][]][Twitter badge]\n[![Discord Chat](https://img.shields.io/discord/684898665143206084?logo=discord&style=social)](https://discord.gg/deno)\n\n<img align="right" src="https://deno.land/logo.svg" height="150px" alt="the deno mascot dinosaur standing in the rain">\n\nDeno is a _simple_, _modern_ and _secure_ runtime for **JavaScript** and\n**TypeScript** that uses V8 and is built in Rust.\n\n### Features\n\n- Secure by default. No file, network, or environment access, unless explicitly\n  enabled.\n- Supports TypeScript out of the box.\n- Ships only a single executable file.\n- [Built-in utilities.](https://deno.land/manual/tools#built-in-tooling)\n- Set of reviewed

In [176]:
# Create indices and upload data: one index per language
for lang in ('Python', 'C++', 'JavaScript'):
    # Lower-case the language and spell '++' as 'pp' for index and file names.
    lang_safe = lang.lower().replace('++', 'pp')
    index_name = f'{lang_safe}-index'
    # A 400 response is ignored -- presumably "index already exists", so the
    # cell can be re-run; TODO confirm.
    es.options(ignore_status=400).indices.create(index=index_name)
    data = GitHubDataset(empty=False, file_path=f'{lang_safe}-latest')
    doc_count = upload_to_es(es, data, index=index_name, batch_size=3000)
    print(doc_count)

100%|██████████| 9720/9720 [00:14<00:00, 669.50it/s]


[{'epoch': '1679871016', 'timestamp': '22:50:16', 'count': '9720'}]


100%|██████████| 3788/3788 [00:05<00:00, 638.05it/s]


[{'epoch': '1679871027', 'timestamp': '22:50:27', 'count': '3788'}]


100%|██████████| 10976/10976 [00:13<00:00, 815.42it/s]


[{'epoch': '1679871051', 'timestamp': '22:50:51', 'count': '10976'}]


In [202]:
# Retrieval by ID: fetch the document stored under _id 99 in the Python index
resp = es.get(id=99, index='python-index')
resp.body

{'_index': 'python-index',
 '_id': '99',
 '_version': 3,
 '_seq_no': 19539,
 '_primary_term': 1,
 'found': True,
 '_source': {'name': 'Textualize/textual',
  'link': 'https://github.com/Textualize/textual',
  'tags': ['terminal', 'python', 'tui', 'rich', 'cli', 'framework'],
  'stars': 17744,
  'description': 'Textual is a TUI (Text User Interface) framework for Python inspired by modern web development.',
  'lang': 'Python',
  'repo_lang': '',
  'readme': '# Textual\n\n![Textual splash image](https://raw.githubusercontent.com/Textualize/textual/main/imgs/textual.png)\n\nTextual is a *Rapid Application Development* framework for Python.\n\nBuild sophisticated user interfaces with a simple Python API. Run your apps in the terminal and (coming soon) a web browser!\n\n<details>  \n  <summary> 🎬 Demonstration </summary>\n  <hr>\n  \nA quick run through of some Textual features.\n  \n\n\nhttps://user-images.githubusercontent.com/554369/197355913-65d3c125-493d-4c05-a590-5311f16c40ff.mov\n\n\

In [139]:
# Simple query: full-text match against the `readme` field of the Python index
resp = es.search(
    index='python-index',
    query={'match': {'readme': 'web archiving service'}},
)
print_query(resp)

+--------------------------------+--------+---------------------------------------------------+------------------------------------------------------------------------------------------------------+------------+
| name                           | lang   | link                                              | description                                                                                          | score      |
+--------------------------------+--------+---------------------------------------------------+------------------------------------------------------------------------------------------------------+------------+
| internetarchive/brozzler       | Python | https://github.com/internetarchive/brozzler       | brozzler - distributed browser-based web crawler                                                     | 16.555544  |
| ArchiveBox/ArchiveBox          | Python | https://github.com/ArchiveBox/ArchiveBox          | 🗃 Open source self-hosted web archiving. Takes URLs/brow

In [100]:
# Query over indices: the '*-index' pattern addresses every index whose name ends in '-index'
resp = es.search(
    index='*-index',
    query={'match': {'readme': 'web archiving service'}},
)
print_query(resp)

+--------------------------+------------+---------------------------------------------+------------------------------------------------------------------------------------------------------+------------+
| name                     | lang       | link                                        | description                                                                                          | score      |
+--------------------------+------------+---------------------------------------------+------------------------------------------------------------------------------------------------------+------------+
| internetarchive/brozzler | Python     | https://github.com/internetarchive/brozzler | brozzler - distributed browser-based web crawler                                                     | 16.555544  |
| ArchiveBox/ArchiveBox    | Python     | https://github.com/ArchiveBox/ArchiveBox    | 🗃 Open source self-hosted web archiving. Takes URLs/browser history/bookmarks/Pocket/Pinboard/et

In [208]:
# Index boosting: same query as before, with a boost factor attached to the
# JavaScript and C++ indices so their hits are ranked differently
resp = es.search(
    index='*-index',
    query={'match': {'readme': 'web archiving service'}},
    indices_boost=[
        {'javascript-index': 1.5},
        {'cpp-index': 1.3},
    ],
)
print_query(resp)

+-----------------------------+------------+------------------------------------------------+------------------------------------------------------------------------------------------------------+-----------+
| name                        | lang       | link                                           | description                                                                                          | score     |
+-----------------------------+------------+------------------------------------------------+------------------------------------------------------------------------------------------------------+-----------+
| webrecorder/pywb            | JavaScript | https://github.com/webrecorder/pywb            | Core Python Web Archiving Toolkit for replay and recording of web archives                           | 20.327332 |
| croqaz/clean-mark           | JavaScript | https://github.com/croqaz/clean-mark           | Convert an article into clean text                                    

In [174]:
# More complex query: search several fields at once, each with its own `^N`
# weight, then rerun with different weights to compare the rankings.
weightings = [
    ('hn_comments > description > readme', ['readme^3', 'description^5', 'hn_comments^10']),
    ('readme > hn_comments > description', ['readme^10', 'description^3', 'hn_comments^5']),
]
for run, (heading, boosted_fields) in enumerate(weightings):
    if run > 0:
        print()  # blank line between the two tables
    resp = es.search(
        index='python-index',
        query={
            'multi_match': {
                'query': 'find slowest part of program',
                'fields': boosted_fields,
                'type': 'best_fields',
            }
        },
    )
    print(heading)
    print_query(resp)

hn_comments > description > readme
+-------------------------------+--------+--------------------------------------------------+------------------------------------------------------------------------------------------------------+-----------+
| name                          | lang   | link                                             | description                                                                                          | score     |
+-------------------------------+--------+--------------------------------------------------+------------------------------------------------------------------------------------------------------+-----------+
| joerick/pyinstrument          | Python | https://github.com/joerick/pyinstrument          | 🚴 Call stack profiler for Python. Shows you why your code is slow!                                  | 120.22317 |
| closeio/ciso8601              | Python | https://github.com/closeio/ciso8601              | Fast ISO8601 date time parser for Py

In [140]:
# Highlighting: request highlight data for `readme` and show each hit's
# 'highlight' entry as an extra table column
resp = es.search(
    index='*-index',
    query={'match': {'readme': 'web archiving service'}},
    highlight={'fields': {'readme': {}}},
)
print_query(resp, additonal_fields=['highlight'])

+--------------------------+------------+---------------------------------------------+------------------------------------------------------------------------------------------------------+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| name                     | lang       | link                                        | description                                            

In [92]:
# SQL: repositories whose name contains "archive" and that have more than 1000 stars.
# Both conditions filter individual rows, so both belong in WHERE; HAVING is meant
# for filtering groups/aggregates and this query has neither.
resp = es.sql.query(
    query = '''
    SELECT "name", "lang", "link", "description" FROM "python-index" WHERE "name" LIKE '%archive%' AND "stars" > 1000
    '''
)
print_sql(resp)

+-------------------------------------------+--------+--------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| name                                      | lang   | link                                                         | description                                                                                                                                                                                                                               |
+-------------------------------------------+--------+--------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [144]:
# Use SQL to find insights: per index, count the entries whose "readme_type" is 'rst'
for lang in ('Python', 'C++', 'JavaScript'):
    lang_safe = lang.lower().replace('++', 'pp')
    count_rst_sql = f'''
        SELECT COUNT("name") FROM "{lang_safe}-index" WHERE "readme_type"='rst'
        '''
    resp = es.sql.query(query=count_rst_sql)
    print(lang)
    print_sql(resp)
    print()

Python
+---------------+
| COUNT("name") |
+---------------+
| 1480          |
+---------------+

C++
+---------------+
| COUNT("name") |
+---------------+
| 68            |
+---------------+

JavaScript
+---------------+
| COUNT("name") |
+---------------+
| 45            |
+---------------+



In [211]:
# Weighted multi-field search for a free-text query against the Python index
SEARCH_QUERY = 'terminal print color and rich text'
resp = es.search(
    index='python-index',
    query={
        'multi_match': {
            'query': SEARCH_QUERY,
            'fields': ['readme^3', 'description^5', 'hn_comments^10'],
            'type': 'best_fields',
        }
    },
)
print_query(resp)

+-----------------------------+--------+------------------------------------------------+------------------------------------------------------------------------------------------------------+-----------+
| name                        | lang   | link                                           | description                                                                                          | score     |
+-----------------------------+--------+------------------------------------------------+------------------------------------------------------------------------------------------------------+-----------+
| Textualize/rich             | Python | https://github.com/Textualize/rich             | Rich is a Python library for rich text and beautiful formatting in the terminal.                     | 184.1525  |
| colour-science/colour       | Python | https://github.com/colour-science/colour       | Colour Science for Python                                                                 