In [16]:
import time
import pandas as pd
from flask import Flask , request
from elasticsearch import Elasticsearch
import numpy as np
from flask_cors import CORS

from pathlib import Path
import pickle
import os
import json


### Manual BM25 and PageRank indexer

In [17]:
class Pr:
    def __init__(self, alpha) : 
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled/'
        self.alpha = alpha
    
    def url_extractor(self):
        url_maps = {}
        all_urls = set([])

        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                j = json.load(open(os.path.join(self.crawled_folder, file)))
                all_urls.add(j['url'])
                for s in j['url_lists']:
                    all_urls.add(s)
                url_maps[j['url']] = list(set(j['url_lists']))
        all_urls = list(all_urls)
        return url_maps, all_urls
    
    def pr_calc(self):
        url_maps , all_urls = self.url_extractor()
        url_matrix = pd.DataFrame(columns=all_urls, index=all_urls)

        for url in url_maps:
            if len(url_maps[url]) > 0 and len(all_urls) > 0:
                url_matrix.loc[url] = (1 - self.alpha) * (1 / len(all_urls))
                url_matrix.loc[url, url_maps[url]] = url_matrix.loc[url, url_maps[url]] + (self.alpha * (1 / len(url_maps[url]) ))
                url_matrix.loc[url_matrix.isnull().all(axis=1), :] = (1 / len(all_urls))

                x0 = np.matrix([1/len(all_urls)] * len(all_urls))
                P = np.asmatrix(url_matrix.values)

                prev_Px = x0
                Px = x0 * P
                i = 0
                while(any(abs(np.asarray(prev_Px).flatten() - np.asarray(Px).flatten()) > 1e-8)):
                    i += 1
                    prev_Px = Px
                    Px = Px*P
                print('Converged in {0} iterations'.format(i))
                self.pr_result = pd.DataFrame(Px, columns=url_matrix.index, index=['score']).T.loc[list(url_maps.keys())]


In [18]:
from BM_ranking import BM25
from nltk.stem import PorterStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

def preProcessor(s):
    ps = PorterStemmer()
    s = re.sub(r'[^A-Za-z]', ' ', s)
    s = re.sub(r'\s+', ' ' , s)
    s = [word for word in s.split(' ') if len(word) > 2]
    s = [ps.stem(w) for w in s]
    s = ' '.join(s)
    return s


class Indexer:
    def __init__(self):
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled/'
        self.stored_file = 'resource/manual_indexer.pkl'
        if os.path.isfile(self.stored_file):
            with open(self.stored_file, 'rb') as f:
                cached_dict = pickle.load(f)
            self.__dict__.update(cached_dict)
        else:
            self.run_indexer()

    def run_indexer(self):
        self.pr = Pr(alpha=0.85)
        self.pr.pr_calc()
        documents = []
        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                try:
                    j = json.load(open(os.path.join(self.crawled_folder, file)))
                    j['id'] = j['url']
                    j['pagerank'] = self.pr.pr_result.loc[j['id']].score
                    print(j['id'] , j['pagerank'] )
                    documents.append(j)
                except:
                    continue
        self.documents = pd.DataFrame.from_dict(documents)

        # print(self.documents.apply(lambda s: ' '.join(s[['title', 'text']]), axis=1))
        tfidf_vecotorizor = TfidfVectorizer(preprocessor=preProcessor, stop_words=stopwords.words('english'), use_idf=True)
        self.bm25 = BM25(tfidf_vecotorizor)
        self.bm25.fit(self.documents.apply(lambda s: ' '.join(s[['title', 'text']]), axis=1))
        with open(self.stored_file, 'wb') as f:
            pickle.dump(self.__dict__, f)

    def search(self, query):
        score = self.bm25.transform(query)
        score = pd.DataFrame(score , columns=["score"])
        df = self.documents.join(score)
        df = df.sort_values(by=["score", "pagerank"], ascending=False)
        return df

In [19]:
app = Flask(__name__)
app.es_client = Elasticsearch('https://localhost:9200', 
                              basic_auth=("elastic", "6E0GWL_MEddnKJWCnk*M"),
                              ca_certs="./http_ca.crt")
app.indexer = Indexer()
app.indexer.run_indexer()
cors = CORS(app)
app.config['CORS_HEADERS'] = 'Content-Type'


@app.route('/search', methods=['GET'])
def search():
    start = time.time()
    response_object = {'status': 'success'}
    argList = request.args.to_dict(flat=False)
    query_term = argList['query'][0]
    results = app.es_client.search(index='simple', source_excludes=['url_lists'], size=100,
                                   query={"script_score": {"query": {"match": {"text": query_term}},
                                                           "script": {"source": "_score * doc['pagerank'].value"}}})
    end = time.time()
    total_hit = results['hits']['total']['value']
    results_df = pd.DataFrame([[hit["_source"]['title'], hit["_source"]['url'], hit["_source"]['text'][:100], hit["_score"]] 
                               for hit in results['hits']['hits']], columns=['title', 'url', 'text', 'score'])

    response_object['total_hit'] = total_hit
    response_object['results'] = results_df.sort_values("score", ascending=False).head(10).to_dict('records')
    response_object['elapse'] = end - start

    return response_object

@app.route('/manual_index', methods=["GET"])
def manual_index():
    
    start = time.time()
    res = {'status': 'success'}
    argList = request.args.to_dict(flat=False)
    query_term = argList['query'][0]
    results = app.indexer.search(query_term)
    end = time.time()
    total_hit = len(results)
    
    res['total_hit'] = total_hit
    res['results'] = results.sort_values("score", ascending=False).drop("url_lists", axis=1).head(10).to_dict('records')
    res['elapse'] = end-start

    return res

Converged in 1 iterations
Converged in 2 iterations
Converged in 2 iterations
Converged in 2 iterations
Converged in 2 iterations
Converged in 2 iterations
Converged in 2 iterations
Converged in 2 iterations
Converged in 2 iterations
Converged in 2 iterations
Converged in 3 iterations
Converged in 3 iterations
Converged in 3 iterations
Converged in 3 iterations
Converged in 3 iterations
Converged in 3 iterations
Converged in 4 iterations
Converged in 5 iterations
Converged in 5 iterations
Converged in 5 iterations
Converged in 5 iterations
Converged in 5 iterations
Converged in 5 iterations
Converged in 5 iterations
Converged in 5 iterations
Converged in 5 iterations
Converged in 5 iterations
Converged in 5 iterations
Converged in 6 iterations
Converged in 6 iterations
Converged in 6 iterations
Converged in 6 iterations
Converged in 6 iterations
Converged in 6 iterations
Converged in 6 iterations
Converged in 6 iterations
Converged in 6 iterations
Converged in 6 iterations
Converged in



In [20]:
app.run(debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [25/Feb/2024 22:08:35] "GET /manual_index?query=test HTTP/1.1" 200 -
127.0.0.1 - - [25/Feb/2024 22:08:58] "GET /manual_index?query=asd HTTP/1.1" 200 -
127.0.0.1 - - [25/Feb/2024 22:10:57] "GET /manual_index?query=camt HTTP/1.1" 200 -
127.0.0.1 - - [25/Feb/2024 22:11:08] "GET /manual_index?query=kan HTTP/1.1" 200 -
127.0.0.1 - - [25/Feb/2024 22:12:14] "GET /manual_index?query=camy HTTP/1.1" 200 -
127.0.0.1 - - [25/Feb/2024 22:12:28] "GET /manual_index?query=test HTTP/1.1" 200 -
127.0.0.1 - - [25/Feb/2024 22:12:52] "GET /manual_index?query=test HTTP/1.1" 200 -
127.0.0.1 - - [25/Feb/2024 22:12:59] "GET /manual_index?query=test HTTP/1.1" 200 -
127.0.0.1 - - [25/Feb/2024 22:13:17] "GET /manual_index?query=test HTTP/1.1" 200 -
