# I. Managing indices, mappings, analysis, ranking

## 1. Create index

In [None]:
import elasticsearch

# No hosts or credentials are passed, so the client's own defaults apply.
es = elasticsearch.Elasticsearch()

index_name = "codegrepper"
type_name = "answer"

# Start from a clean slate so this cell can be re-run: drop any index that
# already exists under the same name before creating it again.
# The index is passed by keyword, matching the other indices.* calls in this
# notebook (newer client releases no longer accept it positionally).
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

es.indices.create(index=index_name)
# Ask the cluster health API to wait for at least "yellow" status before
# moving on to the cells that reconfigure the new index.
es.cluster.health(wait_for_status="yellow")

## 2. Analysis

In [None]:
# The analysis settings are applied while the index is closed, so close it
# first and reopen it afterwards.
es.indices.close(index=index_name)

try:
    es.indices.put_settings(
        index=index_name,
        body={
            "analysis": {
                "filter": {},
                "analyzer": {
                    # Custom analyzer: "keyword" tokenizer followed by the
                    # lowercase, asciifolding and trim token filters.
                    "keyword_analyzer": {
                        "filter": [
                            "lowercase",
                            "asciifolding",
                            "trim"
                        ],
                        "char_filter": [],
                        "type": "custom",
                        "tokenizer": "keyword"
                    },
                    # Index-time analyzer: edge n-grams (see the tokenizer
                    # below), lowercased.
                    "edge_ngram_analyzer": {
                        "filter": [
                            "lowercase"
                        ],
                        "tokenizer": "edge_ngram_tokenizer"
                    },
                    # Analyzer that only applies the "lowercase" tokenizer;
                    # the mapping uses it as search_analyzer for the
                    # edge n-gram sub-field.
                    "edge_ngram_search_analyzer": {
                        "tokenizer": "lowercase"
                    }
                },
                "tokenizer": {
                    # Edge n-grams of 2 to 5 characters, built from letters
                    # only.
                    "edge_ngram_tokenizer": {
                        "type": "edge_ngram",
                        "min_gram": 2,
                        "max_gram": 5,
                        "token_chars": [
                            "letter"
                        ]
                    }
                }
            }
        })
finally:
    # Reopen the index even when the settings update raises; otherwise a
    # failed update would leave the index closed for every later cell.
    es.indices.open(index=index_name)

## 3. Mapping (explicit)

In [None]:
# Plain text field that also stores term vectors with positions and offsets.
# Every use below takes its own copy, so no dict object is shared between
# fields.
term_vector_text = {
    "type": "text",
    "term_vector": "with_positions_offsets",
}

# Additional sub-fields indexed for "title". The analyzers named here are the
# custom ones registered in the index analysis settings.
title_subfields = {
    "keywordstring": {
        "type": "text",
        "analyzer": "keyword_analyzer",
    },
    "edgengram": {
        "type": "text",
        "analyzer": "edge_ngram_analyzer",
        "search_analyzer": "edge_ngram_search_analyzer",
    },
    "completion": {
        "type": "completion",
        # Category context named "framework" whose path points at "tags".
        "contexts": [
            {
                "name": "framework",
                "type": "category",
                "path": "tags",
            },
        ],
    },
}

answer_properties = {
    "title": {
        **term_vector_text,
        "fields": title_subfields,
        "analyzer": "standard",
    },
    "code": dict(term_vector_text),
    "date": {"type": "date", "format": "MMM dd yyyy"},
    "vote": {"type": "integer"},
    "tags": {"type": "keyword"},
    "posted_by": dict(term_vector_text),
    "source_name": dict(term_vector_text),
    "source_url": dict(term_vector_text),
}

# Explicit mapping for the typed document (include_type_name=True, so the
# body is keyed by the type name).
es.indices.put_mapping(
    index=index_name,
    doc_type=type_name,
    include_type_name=True,
    body={type_name: {"properties": answer_properties}},
)

## 4. Bulk insert scraped data

In [None]:
import json

from elasticsearch.helpers import bulk


def read_data(file):
    """Load a JSON array from *file* and return it without duplicate entries.

    When an entry occurs more than once, only its last occurrence is kept;
    the surviving entries stay in their original relative order.

    Two entries count as duplicates when their JSON serialisation with
    sorted keys is identical, so the key order inside an object does not
    matter.

    Args:
        file: Path of a UTF-8 encoded JSON file whose top-level value is an
            array.

    Returns:
        A new list with the de-duplicated entries.

    Raises:
        TypeError: If the top-level JSON value is not an array.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale.
    with open(file, 'r', encoding='utf-8') as f:
        jobs = json.load(f)

    if not isinstance(jobs, list):
        raise TypeError(
            f"expected a JSON array in {file!r}, got {type(jobs).__name__}"
        )

    # The entries are dicts/lists (unhashable), so a canonical JSON string is
    # used as the set key. Walking backwards makes the *last* occurrence of a
    # duplicate the one that is seen first and therefore kept.
    seen = set()
    unique_reversed = []
    for job in reversed(jobs):
        key = json.dumps(job, sort_keys=True)
        if key not in seen:
            seen.add(key)
            unique_reversed.append(job)

    unique_reversed.reverse()
    return unique_reversed


def gen_data(job_list):
    """Yield one bulk action dict per entry of *job_list*.

    Each action targets the module-level ``index_name`` and carries a
    ``_source`` holding the fixed set of answer fields. A field that is
    missing from an entry is emitted as ``None``; keys of the entry that are
    not in the field list are ignored.

    Args:
        job_list: Iterable of mappings (anything offering ``.get``).

    Yields:
        Dicts of the form ``{"_index": ..., "_source": {...}}``.
    """
    fields = (
        "title",
        "code",
        "date",
        "vote",
        "tags",
        "posted_by",
        "source_name",
        "source_url",
    )
    for job in job_list:
        # Build the document first, then wrap it into the action envelope.
        source = {field: job.get(field) for field in fields}
        yield {
            "_index": index_name,
            "_source": source,
        }


import os

# Walk ./data bottom-up (topdown=False) and bulk-index every file found,
# one bulk request per file. Each file is passed to read_data() and the
# resulting entries are turned into bulk actions by gen_data().
for root, dirs, files in os.walk('./data', topdown=False):
    for name in files:
        file = os.path.join(root, name)
        data = read_data(file)
        bulk(es, gen_data(data))
