# Librerías

In [1]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import json
import pandas as pd

# Conexión con elastic

In [2]:
elastic = Elasticsearch(["http://127.0.0.1:9200"])

In [3]:
!curl -XGET "0.0.0.0:9200"    # comprobando conexión

{
  "name" : "Lenovo",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "ZW_929WhR9CoGgJdthcJcA",
  "version" : {
    "number" : "7.7.0",
    "build_flavor" : "default",
    "build_type" : "tar",
    "build_hash" : "81a1e9eda8e6183f5237786246f6dced26a10eaf",
    "build_date" : "2020-05-12T02:01:37.602180Z",
    "build_snapshot" : false,
    "lucene_version" : "8.5.1",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


# Creando un nuevo índice

## 1. Carga de datos en Python

In [4]:
path = "../data/cncf_git_data.json"

data = []
for line in open(path, 'r'):
    data.append(json.loads(line))

In [5]:
print(data[0])

{'_index': 'git_cncf_190802_enriched_191007', '_type': 'items', '_id': '91d824e7ddb3a16bbe0e229fa62cc88a7415c3e2', '_score': 1, '_source': {'Author_gender': 'Unknown', 'demography_min_date': '2018-12-28T11:53:52.000Z', 'metadata__gelk_backend_name': 'GitEnrich', 'tz': -2, 'project': 'Graduated', 'metadata__timestamp': '2019-08-02T14:22:43.155789+00:00', 'uuid': '91d824e7ddb3a16bbe0e229fa62cc88a7415c3e2', 'Author_user_name': 'Unknown', 'cm_title': 'Graduated', 'Commit_id': '183f4592bb7b37d6053a6a9ce9980c599c766ba2', 'Commit_user_name': 'Unknown', 'metadata__enriched_on': '2019-10-07T19:41:38.181669+00:00', 'author_date': '2018-12-28T09:53:52', 'Author_domain': 'loadsmart.com', 'tag': 'https://github.com/kubernetes/minikube.git', 'author_bot': False, 'message_analyzed': 'Adding ReportError test and HTTP Test server reuse', 'author_org_name': 'Unknown', 'hash_short': 'f948bf', 'repository_labels': [], 'commit_date': '2018-12-28T10:04:45', 'Author_org_name': 'Unknown', 'project_1': 'Gradua

## 2. Carga de datos en Elastic

En versiones anteriores a Elasticssearch 7.0, un **index** podía tener más de un **type** pero en versiones 7.0 y superiores solo puede haber un **type** por **index**. Esta operacion puede demorarse mucho ya que la base de datos es muy grande.

In [11]:
index_name = 'cncf_dataset'

print('Starting indexing...', '\n')
for idx, doc in enumerate(data):
    elastic.index(index = index_name, id = doc['_id'], body = doc['_source'])
    if idx%50000 == 0:
        print('Processed {} documents, {} remaining'.format(idx, len(data)-idx))
print('\n', 'Processing done')

Starting indexing... 

Processed 0 documents, 491703 remaining


KeyboardInterrupt: 

# Acciones con índices

## 1. Mostrando índices creados

Si queréis ver los índices que hay en vustro nodo de elastic podéis ejecutar la siguiente linea de código.

In [12]:
indexes = elastic.indices.get_alias("*").keys()
[print(i) for i in indexes];

company
cncf_dataset
.kibana_1
my_first_index
.apm-custom-link
.apm-agent-configuration
.kibana_task_manager_1


## 2. Creando y borrando índices

Por otro lado puede seros interesante aprender a borrar un índice previamente creado si el proceso de carga se ha quedado a medias por falta de memoria por ejemplo.

In [16]:
new_index = "my_first_index"    # poner siempre en minúsculas
elastic.indices.create(index = new_index)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_first_index'}

In [17]:
!curl -XGET "0.0.0.0:9200/my_first_index"    # comprobando creación del nuevo índice

{"my_first_index":{"aliases":{},"mappings":{},"settings":{"index":{"creation_date":"1592950794822","number_of_shards":"1","number_of_replicas":"1","uuid":"vZgWEmXVRUaMN31IsXGxmw","version":{"created":"7070099"},"provided_name":"my_first_index"}}}}

In [18]:
elastic.indices.delete(index = new_index)

{'acknowledged': True}

# Exportar de Mapping

In [54]:
mapping = elastic.indices.get_mapping('cncf_dataset') # extraemos el mapping

In [55]:
mapping_keys = mapping[index_name]["mappings"].keys()
doc_type = list(mapping_keys)[0]
schema = mapping['cncf_dataset']['mappings']['properties']
print(json.dumps(schema, indent=4))

# guardado de mapping
with open('../data/schema.json', 'w') as outfile:
    json.dump(schema, outfile)

{
    "Author_bot": {
        "type": "boolean"
    },
    "Author_domain": {
        "type": "text",
        "fields": {
            "keyword": {
                "type": "keyword",
                "ignore_above": 256
            }
        }
    },
    "Author_gender": {
        "type": "text",
        "fields": {
            "keyword": {
                "type": "keyword",
                "ignore_above": 256
            }
        }
    },
    "Author_gender_acc": {
        "type": "long"
    },
    "Author_id": {
        "type": "text",
        "fields": {
            "keyword": {
                "type": "keyword",
                "ignore_above": 256
            }
        }
    },
    "Author_multi_org_names": {
        "type": "text",
        "fields": {
            "keyword": {
                "type": "keyword",
                "ignore_above": 256
            }
        }
    },
    "Author_name": {
        "type": "text",
        "fields": {
            "keyword": {
                "

In [57]:
template = elastic.indices.get_template('cncf_dataset')

NotFoundError: NotFoundError(404, '{}')