In [18]:
from elasticsearch import Elasticsearch
import os
from pathlib import Path
import pickle
import json

## Elasticsearch Indexer

In [19]:
class ElasticIndexer:
    """Index crawled pages (JSON stored in ``.txt`` files) into Elasticsearch.

    Connection settings may be overridden with the ``ES_URL``, ``ES_USERNAME``,
    ``ES_PASSWORD`` and ``ES_CA_CERTS`` environment variables. When a variable
    is unset, the value that used to be hard-coded here is used instead.
    """

    def __init__(self):
        """Locate the crawl folder, load the URL mapping and open the ES client."""
        # The crawl output is expected in a `crawled` folder that is a sibling
        # of the current working directory.
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled/'
        url_list_path = self.crawled_folder / 'url_list.pickle'
        print(url_list_path)
        # NOTE(security): pickle.load can execute arbitrary code; only load a
        # url_list.pickle produced by a crawler you trust.
        with open(url_list_path, 'rb') as f:
            self.file_mapper = pickle.load(f)
        # FIXME(security): the fallback password is kept only so existing local
        # setups keep working; set ES_PASSWORD, rotate the secret and drop the literal.
        self.es_client = Elasticsearch(
            os.environ.get('ES_URL', 'https://localhost:9200'),
            basic_auth=(
                os.environ.get('ES_USERNAME', "elastic"),
                os.environ.get('ES_PASSWORD', "6E0GWL_MEddnKJWCnk*M"),
            ),
            ca_certs=os.environ.get('ES_CA_CERTS', "./http_ca.crt"),
        )

    def run_indexer(self, index_name='simple'):
        """Rebuild ``index_name`` from every ``.txt`` file in the crawl folder.

        Each file is parsed as JSON, its ``url`` value is copied into an ``id``
        field, and the document is sent to Elasticsearch.

        Args:
            index_name: Name of the index to rebuild. Defaults to ``'simple'``.

        Raises:
            KeyError: If a crawled document has no ``url`` key.
            json.JSONDecodeError: If a ``.txt`` file does not contain valid JSON.
        """
        # Drop any previous copy first (400/404 responses are ignored, e.g. when
        # the index does not exist yet), then create it explicitly. The previous
        # create-then-delete order left no index behind and relied on
        # Elasticsearch auto-creating one on the first indexed document.
        self.es_client.options(ignore_status=[400, 404]).indices.delete(index=index_name)
        self.es_client.options(ignore_status=400).indices.create(index=index_name)
        # Sorted so documents are indexed in a reproducible order.
        for file in sorted(os.listdir(self.crawled_folder)):
            if not file.endswith(".txt"):
                continue
            # Context manager so the file handle is closed after each document.
            with open(os.path.join(self.crawled_folder, file)) as fh:
                j = json.load(fh)
            j['id'] = j['url']
            self.es_client.index(index=index_name, document=j)


In [21]:
# Build the index, then run one sample full-text query and list what came back.
s = ElasticIndexer()
s.run_indexer()

query = {"match": {"text": "WIL"}}
results = s.es_client.search(index='simple', query=query)

sample_hits = results['hits']
print("Got %d Hits :" % sample_hits['total']['value'])
for hit in sample_hits['hits']:
    sample_source = hit['_source']
    print("The title is {0} ({1})".format(sample_source['title'], sample_source['url']))

c:\SoftwareEngineer\Work\ir\crawled\url_list.pickle
Got 155 Hits :
The title is ศูนย์ WIL จัดกิจกรรมแนะนำตำแหน่งงานสำหรับสหกิจศึกษา (https://camt.cmu.ac.th/index.php/en/all-news-groups/24-ข่าวทั่วไป/1149-ศูนย์-wil-จัดกิจกรรมแนะนำตำแหน่งงานสำหรับสหกิจศึกษา.html)
The title is Sign In with CMU Account (https://oauth.cmu.ac.th/v1/Authorize.aspx?response_type=code&client_id=3pefrhnrAcsu4VSAexA4XW98a3d9cf4tE7a2QD09&redirect_uri=https://service.camt.cmu.ac.th/wilstu/home/callback&scope=cmuitaccount.basicinfo&state=xyx)
The title is ข่าวทั่วไป (https://camt.cmu.ac.th/index.php/th/หัวข้อกลุ่มข่าวทั้งหมด/24-ข่าวทั่วไป.html)
The title is หน้าหลัก (https://www.camt.cmu.ac.th/)
The title is หน้าหลัก (https://camt.cmu.ac.th/index.php/th/)
The title is ประกาศ (https://camt.cmu.ac.th/index.php/th/2-uncategorised/324-ประกาศ.html)
The title is หน้าหลัก (https://camt.cmu.ac.th/index.php/en/?p=&lang=th)
The title is หน้าหลัก (https://www.camt.cmu.ac.th)
The title is หน้าหลัก (https://camt.cmu.ac.th/)
The 

## Elasticsearch API

In [None]:
import time
from flask import Flask , request
import pandas as pd

# Flask application that exposes the Elasticsearch index over HTTP.
app = Flask(__name__)
# Connection settings can be overridden with ES_URL / ES_USERNAME / ES_PASSWORD /
# ES_CA_CERTS; the previous hard-coded values remain as fallbacks.
# FIXME(security): set ES_PASSWORD, rotate the secret and drop the literal fallback.
app.es_client = Elasticsearch(
    os.environ.get('ES_URL', 'https://localhost:9200'),
    basic_auth=(
        os.environ.get('ES_USERNAME', "elastic"),
        os.environ.get('ES_PASSWORD', "6E0GWL_MEddnKJWCnk*M"),
    ),
    ca_certs=os.environ.get('ES_CA_CERTS', "./http_ca.crt"),
)

@app.route('/search_es', methods=["GET"])
def search_es():
    """Search the ``simple`` Elasticsearch index for the ``query`` URL parameter.

    Returns:
        On success, a dict with ``status``, ``total_hit`` (the total reported by
        Elasticsearch), ``results`` (at most 100 records with ``title``, ``url``,
        the first 100 characters of ``text`` and ``score``) and ``elapse``
        (seconds spent in the search call).
        When ``query`` is missing, an error dict together with HTTP status 400,
        instead of an unhandled KeyError surfacing as a 500.
    """
    query_term = request.args.get('query')
    if query_term is None:
        return {'status': 'error', 'message': "missing required 'query' parameter"}, 400

    # perf_counter is monotonic, so the measured duration cannot go negative
    # if the wall clock is adjusted during the request.
    start = time.perf_counter()
    results = app.es_client.search(index='simple', source_excludes=['url_lists'], size=100,
                                   query={"match": {"text": query_term}})
    elapsed = time.perf_counter() - start

    # Build the records directly; a DataFrame round-trip adds nothing here.
    records = [
        {
            'title': hit["_source"]['title'],
            'url': hit["_source"]['url'],
            'text': hit["_source"]['text'][:100],  # short preview only
            'score': hit["_score"],
        }
        for hit in results['hits']['hits']
    ]

    return {
        'status': 'success',
        'total_hit': results['hits']['total']['value'],
        'results': records,
        'elapse': elapsed,
    }

### Quick exercise

In [None]:
from ManualIndexer import Indexer, preProcessor

# NOTE(review): this rebinds `app` to a brand-new Flask instance, so routes
# registered on the earlier `app` (e.g. /search_es) are not served by the
# `app.run(...)` call further down — confirm this is intended.
app = Flask(__name__)
# Connection settings can be overridden with ES_URL / ES_USERNAME / ES_PASSWORD /
# ES_CA_CERTS; the previous hard-coded values remain as fallbacks.
# FIXME(security): set ES_PASSWORD, rotate the secret and drop the literal fallback.
app.es_client = Elasticsearch(
    os.environ.get('ES_URL', 'https://localhost:9200'),
    basic_auth=(
        os.environ.get('ES_USERNAME', "elastic"),
        os.environ.get('ES_PASSWORD', "6E0GWL_MEddnKJWCnk*M"),
    ),
    ca_certs=os.environ.get('ES_CA_CERTS', "./http_ca.crt"),
)
# Build the manual index up front, before any request is served.
app.indexer = Indexer()
app.indexer.run_indexer()

@app.route('/manual_index', methods=["GET"])
def manual_index():
    """Search the manual indexer for the ``query`` URL parameter.

    Returns:
        On success, a dict with ``status``, ``total_hit`` (number of rows the
        indexer returned), ``results`` (the 100 highest-scoring rows as records,
        without the ``url_lists`` column) and ``elapse`` (seconds spent in the
        search call).
        When ``query`` is missing, an error dict together with HTTP status 400,
        instead of an unhandled KeyError surfacing as a 500.
    """
    query_term = request.args.get('query')
    if query_term is None:
        return {'status': 'error', 'message': "missing required 'query' parameter"}, 400

    # perf_counter is monotonic, so the measured duration cannot go negative
    # if the wall clock is adjusted during the request.
    start = time.perf_counter()
    # NOTE(review): presumably a pandas DataFrame with `score` and `url_lists`
    # columns (it is used via sort_values/drop/head) — confirm against ManualIndexer.
    results = app.indexer.search(query_term)
    elapsed = time.perf_counter() - start

    top_records = (
        results
        .sort_values("score", ascending=False)
        .drop("url_lists", axis=1)
        .head(100)
        .to_dict('records')
    )

    return {
        'status': 'success',
        'total_hit': len(results),
        'results': top_records,
        'elapse': elapsed,
    }

In [None]:
# Start Flask's built-in server for `app` with debug mode turned off.
app.run(debug=False)