In [13]:
from elasticsearch import Elasticsearch
import os
from pathlib import Path
import pickle
import json

## Elasticsearch Indexer

In [14]:
class ElasticIndexer:
    """Index crawled pages (JSON stored in ``.txt`` files) into Elasticsearch.

    Instance attributes set by ``__init__``:
      crawled_folder -- ``<parent of the working directory>/crawled``
      file_mapper    -- object unpickled from ``url_list.pickle`` in that
                        folder (not used by ``run_indexer``)
      es_client      -- ``Elasticsearch`` client used for all index operations
    """

    # Name of the index that run_indexer() rebuilds and fills.
    INDEX_NAME = 'simple'

    def __init__(self):
        """Locate the crawl output, load the URL list and connect to Elasticsearch."""
        # os.path.abspath('') is the current working directory; the crawl
        # output is expected in a sibling "crawled" folder.
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled/'
        url_list_path = self.crawled_folder / 'url_list.pickle'
        print(url_list_path)
        # NOTE(security): pickle.load can execute arbitrary code; only load a
        # url_list.pickle that was produced by a trusted crawler run.
        with open(url_list_path, 'rb') as f:
            self.file_mapper = pickle.load(f)
        # Connection settings may be overridden through environment variables.
        # NOTE(security): the literal fallbacks only keep the notebook running
        # as before; remove them (and rotate the password) before sharing.
        self.es_client = Elasticsearch(
            os.environ.get('ES_URL', 'https://localhost:9200'),
            basic_auth=(os.environ.get('ES_USER', 'elastic'),
                        os.environ.get('ES_PASSWORD', '6E0GWL_MEddnKJWCnk*M')),
            ca_certs=os.environ.get('ES_CA_CERTS', './http_ca.crt'))

    def run_indexer(self):
        """Rebuild the index from scratch and add every crawled ``.txt`` file.

        Each file must contain one JSON object with at least a ``url`` key;
        the URL is copied into an ``id`` field before the document is sent.

        Raises:
            KeyError: if a document has no ``url`` key.
            json.JSONDecodeError: if a ``.txt`` file is not valid JSON.
        """
        # Delete before create (404 = index did not exist yet). Doing it the
        # other way round leaves no index behind and makes indexing depend on
        # Elasticsearch's automatic index creation.
        self.es_client.options(ignore_status=[400, 404]).indices.delete(index=self.INDEX_NAME)
        self.es_client.options(ignore_status=400).indices.create(index=self.INDEX_NAME)

        # sorted() makes the indexing order reproducible across runs.
        for file in sorted(os.listdir(self.crawled_folder)):
            if not file.endswith(".txt"):
                continue
            # NOTE(review): files are read with the platform default encoding;
            # pass encoding='utf-8' if the crawler writes UTF-8 -- confirm.
            with open(os.path.join(self.crawled_folder, file)) as fh:
                j = json.load(fh)
            j['id'] = j['url']
            self.es_client.index(index=self.INDEX_NAME, document=j)

        # Newly indexed documents only become searchable after a refresh;
        # force one so a search issued right after this call sees all of them.
        self.es_client.indices.refresh(index=self.INDEX_NAME)


In [15]:
# Build the index, then run a sample full-text query against it.
s = ElasticIndexer()
s.run_indexer()

# Match documents whose "text" field contains the term "school".
query = {"match": {"text": "school"}}
results = s.es_client.search(index='simple', query=query)

hits = results['hits']
print("Got %d Hits :" % hits['total']['value'])
for hit in hits['hits']:
    source = hit['_source']
    print("The title is {0} ({1})".format(source['title'], source['url']))

c:\SoftwareEngineer\Work\ir\crawled\url_list.pickle
Got 19 Hits :
The title is Gifted School 2020 (https://go.camt.cmu.ac.th/index.php/th/2019-05-16-09-02-18/2019-05-16-09-05-06)
The title is Graduate School, Chiang Mai University (https://www.grad.cmu.ac.th/index.php?lang=en)
The title is Gift School 2023 (https://service.camt.cmu.ac.th/gifted)
The title is การจัดการความรู้และนวัตกรรม ป.เอก (https://go.camt.cmu.ac.th/index.php/th/major/graduate/doctoral-km)
The title is CMU-IPAS (https://www1.reg.cmu.ac.th/reg-ipas/main/index.php)
The title is วิศวกรรมซอฟต์แวร์ ป.โท (https://go.camt.cmu.ac.th/index.php/th/major/graduate/graduate-se)
The title is วิทยาลัยศิลปะ สื่อ และเทคโนโลยี (http://go.camt.cmu.ac.th)
The title is การจัดการความรู้และนวัตกรรม ป.โท (https://go.camt.cmu.ac.th/index.php/th/major/graduate/graduate-km)
The title is PRE COLLEGE 2020 (https://go.camt.cmu.ac.th/index.php/th/2019-05-16-09-02-18/2019-05-16-09-05-7)
The title is Young Mobile Dev (https://go.camt.cmu.ac.th/index

## Elasticsearch API

In [16]:
import time
from flask import Flask , request
import pandas as pd

# Flask application that serves the search endpoint(s) defined below.
app = Flask(__name__)
# The Elasticsearch client is stored on the app object so view functions can
# reach it as app.es_client. Connection settings may be overridden through
# environment variables.
# NOTE(security): the literal fallbacks only keep the notebook running as
# before; remove them (and rotate the password) before sharing.
app.es_client = Elasticsearch(
    os.environ.get('ES_URL', 'https://localhost:9200'),
    basic_auth=(os.environ.get('ES_USER', 'elastic'),
                os.environ.get('ES_PASSWORD', '6E0GWL_MEddnKJWCnk*M')),
    ca_certs=os.environ.get('ES_CA_CERTS', './http_ca.crt'))

@app.route('/search_es', methods=["GET"])
def search_es():
    """Search the ``simple`` Elasticsearch index for the ``query`` URL parameter.

    Returns a dict (serialised to JSON by Flask) with:
      status    -- 'success', or 'error' when ``query`` is missing
      total_hit -- ``hits.total.value`` from the Elasticsearch response
      results   -- up to 100 records: title, url, first 100 characters of
                   text, and score
      elapse    -- seconds spent in the Elasticsearch call

    A request without ``query`` is answered with HTTP 400 instead of failing
    with an unhandled KeyError.
    """
    # Only the first value is used when the parameter is repeated.
    query_term = request.args.get('query')
    if query_term is None:
        return {'status': 'error', 'message': "missing required parameter 'query'"}, 400

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments.
    start = time.perf_counter()
    results = app.es_client.search(index='simple', source_excludes=['url_lists'], size=100,
                                   query={"match": {"text": query_term}})
    end = time.perf_counter()

    # Build the records directly; only a 100-character preview of the page
    # text is returned to keep the payload small.
    records = [
        {
            'title': hit['_source']['title'],
            'url': hit['_source']['url'],
            'text': hit['_source']['text'][:100],
            'score': hit['_score'],
        }
        for hit in results['hits']['hits']
    ]

    return {
        'status': 'success',
        'total_hit': results['hits']['total']['value'],
        'results': records,
        'elapse': end - start,
    }

### Quick exercise

In [17]:
from ManualIndexer import Indexer, preProcessor

# This cell builds a new Flask app for the exercise.
app = Flask(__name__)
# Connection settings may be overridden through environment variables.
# NOTE(security): the literal fallbacks only keep the notebook running as
# before; remove them (and rotate the password) before sharing.
app.es_client = Elasticsearch(
    os.environ.get('ES_URL', 'https://localhost:9200'),
    basic_auth=(os.environ.get('ES_USER', 'elastic'),
                os.environ.get('ES_PASSWORD', '6E0GWL_MEddnKJWCnk*M')),
    ca_certs=os.environ.get('ES_CA_CERTS', './http_ca.crt'))
# Rebinding ``app`` drops the routes registered on the previous instance, so
# attach the earlier /search_es view to the new app as well; otherwise that
# endpoint would answer 404 once the server starts.
app.add_url_rule('/search_es', view_func=search_es, methods=["GET"])
# Build the manual index once at start-up so requests only pay for the search.
app.indexer = Indexer()
app.indexer.run_indexer()

@app.route('/manual_index', methods=["GET"])
def manual_index():
    """Search the manually built index for the ``query`` URL parameter.

    Returns a dict (serialised to JSON by Flask) with:
      status    -- 'success', or 'error' when ``query`` is missing
      total_hit -- number of rows returned by ``app.indexer.search``
      results   -- the 100 highest-scoring rows as records, without the
                   ``url_lists`` column
      elapse    -- seconds spent in ``app.indexer.search``

    A request without ``query`` is answered with HTTP 400 instead of failing
    with an unhandled KeyError.
    """
    # Only the first value is used when the parameter is repeated.
    query_term = request.args.get('query')
    if query_term is None:
        return {'status': 'error', 'message': "missing required parameter 'query'"}, 400

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments.
    start = time.perf_counter()
    # NOTE(review): presumably a pandas DataFrame with "score" and "url_lists"
    # columns (inferred from the calls below) -- confirm against ManualIndexer.
    results = app.indexer.search(query_term)
    end = time.perf_counter()

    top_hits = results.sort_values("score", ascending=False).drop("url_lists", axis=1).head(100)

    return {
        'status': 'success',
        'total_hit': len(results),
        'results': top_hits.to_dict('records'),
        'elapse': end - start,
    }



In [18]:
# Start Flask's built-in server with debug mode off. The call blocks this cell
# until interrupted (CTRL+C / kernel interrupt); per the captured output it
# listens on http://127.0.0.1:5000.
app.run(debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [12/Feb/2024 18:19:51] "GET /manual_index?query=school HTTP/1.1" 200 -
127.0.0.1 - - [12/Feb/2024 18:20:06] "GET /manual_index?query=school HTTP/1.1" 200 -
