# ElasticSearch Data Loading

In [1]:
import json, time, os
from elasticsearch import Elasticsearch

### ES Index
Build ES data index from JSONS

In [2]:
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

In [3]:
settings = {
    
    "test_text":{
        "mappings":{
            "dynamic":"strict",
            "properties":{
                "author":{"type":"text"},
                "content":{"type":"text"},
                "date":{"type":"text"},
                "id":{"type":"text"},
                "publication":{"type":"text"},
                "title":{"type":"text"}
                }
            }
        }
    }

In [4]:
index_name = "corpus1.1" # Name of the index

In [5]:
es.create(index=index_name, id=5, body=settings)

{'_index': 'corpus1.1',
 '_type': '_doc',
 '_id': '5',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

In [6]:
os.chdir("C:/Users/hkhan/OneDrive - LMI/Desktop/Workspace/epic/all-the-news-json")

In [7]:
## Running through nested json files and indexing each on elastic

for i1, file in enumerate(os.listdir()):
    
    i1+=1
    
    with open(file) as json_file:

        f = json.load(json_file)

        for i2, j in enumerate(f):
            es.index(index=index_name, id=int(f"{i1}{i2}"), body=j)
            print("Progress: {:2.2%}".format(round(((i2/len(f))), 3)), end="\r")
            
    

Progress: 100.00%

### ES Snapshots
Create and restore ES snapshots

In [None]:
## Close ES index

es.indices.close(index='corpus_one_doc')

In [None]:
## Create repository

repository_body = {
    "type": "fs",
    "settings": {
            "location": "C:/Users/hkhan/OneDrive - LMI/Desktop/Workspace/epic/es-backup"
    }
}

es.snapshot.create_repository(repository='corpus_repository', body=repository_body)

In [None]:
## Create snapshot in repository

snapshot_body = {
  "indices": "corpus_one_doc",
  "ignore_unavailable": True,
  "include_global_state": False,
  "metadata": {
    "taken_by": "Hasan",
    "taken_because": "Creating snapshot for git repo"
  }
}

es.snapshot.create(repository='corpus_repository', snapshot='corpus_snapshot', body=snapshot_body)

In [None]:
## Retrive snapshot


restore_body = {
  "indices": "corpus_one_doc",
  "ignore_unavailable": True,
  "include_global_state": True,
  "rename_pattern": "index_(.+)",
  "rename_replacement": "restored_index_$1"
}

es.snapshot.restore(repository='corpus1_repository', snapshot='corpus1_snapshot', body=restore_body)
