# https://rebeccabilbro.github.io/intro-doc-similarity-with-elasticsearch/

In [None]:
import os

from sklearn.datasets.base import Bunch
from yellowbrick.download import download_all

## Root directory of the test data sets: "<current working directory>/data"
FIXTURES = os.path.join(os.getcwd(), "data")

## Registry of known datasets: dataset name -> directory under FIXTURES
datasets = {
    dataset: os.path.join(FIXTURES, dataset)
    for dataset in ("hobbies",)
}

In [None]:
def load_data(name, download=True):
    """
    Loads and wrangles the text corpus registered under ``name``.

    Every sub-directory of the corpus directory is treated as a category,
    and every entry inside a category directory is read as one document.

    Parameters
    ----------
    name : str
        Key into the module-level ``datasets`` mapping.
    download : bool, default True
        What to do when the corpus directory does not exist: call
        ``download_all()`` if True, raise ``ValueError`` if False.

    Returns
    -------
    Bunch
        ``categories`` holds the category directory names. ``files``,
        ``data`` and ``target`` are parallel lists with, for each document,
        its path, its text and the name of its category.

    Raises
    ------
    KeyError
        If ``name`` is not a key of ``datasets``.
    ValueError
        If the corpus directory is missing and ``download`` is False.
    """
    # Get the path from the datasets
    path = datasets[name]

    # Check if the data exists, otherwise download or raise
    if not os.path.exists(path):
        if download:
            download_all()
        else:
            raise ValueError((
                "'{}' dataset has not been downloaded, "
                "use the download.py module to fetch datasets"
            ).format(name))

    # Read the directories in the directory as the categories.
    categories = [
        cat for cat in os.listdir(path)
        if os.path.isdir(os.path.join(path, cat))
    ]

    files = []   # holds the path of each document
    data = []    # holds the text read from each file
    target = []  # holds the category name of each document

    # Load the data from the files in the corpus
    for cat in categories:
        # `filename` (not `name`) so the dataset-name parameter is not shadowed
        for filename in os.listdir(os.path.join(path, cat)):
            filepath = os.path.join(path, cat, filename)
            files.append(filepath)
            target.append(cat)

            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's locale encoding.
            # NOTE(review): assumes the corpus files are UTF-8 text - confirm.
            with open(filepath, 'r', encoding='utf-8') as f:
                data.append(f.read())

    # Return the data bunch for use similar to the newsgroups example
    return Bunch(
        categories=categories,
        files=files,
        data=data,
        target=target,
    )


In [None]:
# Load the "hobbies" corpus; download=True is load_data's default, spelled
# out here so the fetch-if-missing behaviour is visible at the call site.
corpus = load_data('hobbies', download=True)

In [None]:
# Preview every field of the corpus: print its name and size, then its first
# and last `nb` entries as (index, value) pairs.
nb = 2
for k in corpus.keys():
    values = corpus[k]
    count = len(values)
    print(k, count)

    # Leading entries. A plain loop is used because printing is a side
    # effect, not the construction of a list.
    for i in range(min(nb, count)):
        print('\t+ ' + str((i, values[i])))

    if count > nb:
        print('\t+ ', '...')
        # Start the tail no earlier than index `nb`, so a field holding fewer
        # than 2 * nb entries does not print the same entry twice.
        for i in range(max(nb, count - nb), count):
            print('\t+ ' + str((i, values[i])))
    print()

In [None]:
# Display the category names found in the corpus.
corpus.categories

In [None]:
# Group the documents by category: category name -> list of document texts.
# One pass over the corpus replaces a full rescan of every document per
# category.
hobby_types = {category: [] for category in corpus.categories}

for label, text in zip(corpus.target, corpus.data):
    # Only documents whose label is one of corpus.categories are collected.
    if label in hobby_types:
        # split() + join collapses every run of whitespace to a single space
        hobby_types[label].append(' '.join(text.split()))

In [None]:
# Take a copy of the cooking documents and print two sample stories.
food_stories = list(hobby_types['cooking'])
for sample in (5, 23):
    print(food_stories[sample])

In [None]:
from elasticsearch.helpers import bulk
from elasticsearch import Elasticsearch

class ElasticIndexer(object):
    """
    Create an ElasticSearch instance, and given categorized documents,
    index the documents into ElasticSearch (one index per category).
    """
    def __init__(self):
        # Client is constructed with no arguments, i.e. the library defaults.
        self.elastic_search = Elasticsearch()

    def make_documents(self, textdict):
        """
        Yield one bulk action per document.

        ``textdict`` is either a dictionary where each key is a document
        category and each value is a list of documents, or an iterable of
        ``(category, documents)`` pairs such as ``dict.items()``.

        Each yielded action is a dict that targets the index named after
        the category and stores the document text under "description".
        """
        # Accept the mapping itself as well as its items(): iterating a dict
        # directly would yield only the keys and break the unpacking below.
        pairs = textdict.items() if hasattr(textdict, "items") else textdict

        for category, docs in pairs:
            for document in docs:
                # NOTE(review): "_type" relates to mapping types, which newer
                # Elasticsearch versions deprecate - confirm against the
                # target cluster version.
                yield {
                    "_index": category,
                    "_type": "_doc",
                    "description": document
                }

    def index(self, textdict):
        """
        Bulk-index every document produced by ``make_documents(textdict)``
        through this instance's client.
        """
        bulk(self.elastic_search, self.make_documents(textdict))

In [None]:
# Build the indexer and send it the (category, documents) pairs to index.
indexer = ElasticIndexer()
category_documents = hobby_types.items()
indexer.index(category_documents)

In [None]:
from pprint import pprint

# Run a match_all query against the "cooking" index, then report the total
# hit count and pretty-print the first returned hit.
query = {"match_all": {}}
result = indexer.elastic_search.search(index="cooking", body=dict(query=query))

hits = result['hits']
print(f"{hits['total']['value']} hits \n")
print("First result:\n")
pprint(hits['hits'][0])


In [None]:
# Run a fuzzy query for "breakfast" on the "description" field of the
# "cooking" index, then report the total hit count and the first hit.
query = {"fuzzy": {"description": "breakfast"}}
result = indexer.elastic_search.search(index="cooking", body=dict(query=query))

hits = result['hits']
print(f"{hits['total']['value']} hits \n")
print("First result:\n")
pprint(hits['hits'][0])
