In [1]:
import numpy as np
x0 = np.matrix([1/7]*7)
P = np.matrix([
        [1/7, 1/7, 1/7, 1/7, 1/7, 1/7, 1/7], 
        [25/56, 3/140, 25/56, 3/140, 3/140, 3/140, 3/140], 
        [3/140, 3/140, 3/140, 3/140, 61/70, 3/140, 3/140], 
        [3/140, 3/140, 25/56, 3/140, 3/140, 3/140, 25/56], 
        [25/56, 3/140, 3/140, 3/140, 3/140, 25/56, 3/140], 
        [3/140, 3/140, 61/70, 3/140, 3/140, 3/140, 3/140], 
        [3/140, 3/140, 25/56, 3/140, 3/140, 25/56, 3/140],
    ]
)

prev_Px = x0
Px = x0*P
i = 0
while(any(abs(np.asarray(prev_Px).flatten() - np.asarray(Px).flatten()) > 1e-8)):
    i+=1
    prev_Px=Px
    Px = Px*P

print('Converged in {0} iterations: {1}'.format(i, np.asarray(Px).flatten()))

Converged in 39 iterations: [0.16911688 0.04196419 0.25324048 0.04196419 0.2572186  0.17669667
 0.05979897]


## PageRank score for crawled webpage

In [2]:
from pathlib import Path
import pickle
import os
import json
import pandas as pd

In [11]:
class Pr:
    def __init__(self, alpha) : 
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled/'
        self.alpha = alpha
    
    def url_extractor(self):
        url_maps = {}
        all_urls = set([])

        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                j = json.load(open(os.path.join(self.crawled_folder, file)))
                all_urls.add(j['url'])
                for s in j['url_lists']:
                    all_urls.add(s)
                url_maps[j['url']] = list(set(j['url_lists']))
        all_urls = list(all_urls)
        return url_maps, all_urls
    
    def pr_calc(self):
        url_maps , all_urls = self.url_extractor()
        url_matrix = pd.DataFrame(columns=all_urls, index=all_urls)

        for url in url_maps:
            if len(url_maps[url]) > 0 and len(all_urls) > 0:
                url_matrix.loc[url] = (1 - self.alpha) * (1 / len(all_urls))
                url_matrix.loc[url, url_maps[url]] = url_matrix.loc[url, url_maps[url]] + (self.alpha * (1 / len(url_maps[url]) ))
                url_matrix.loc[url_matrix.isnull().all(axis=1), :] = (1 / len(all_urls))

                x0 = np.matrix([1/len(all_urls)] * len(all_urls))
                P = np.asmatrix(url_matrix.values)

                prev_Px = x0
                Px = x0 * P
                i = 0
                while(any(abs(np.asarray(prev_Px).flatten() - np.asarray(Px).flatten()) > 1e-8)):
                    i += 1
                    prev_Px = Px
                    Px = Px*P
                print('Converged in {0} iterations: {1}'.format(i, np.asarray(Px).flatten()))
                self.pr_result = pd.DataFrame(Px, columns=url_matrix.index, index=['score']).T.loc[list(url_maps.keys())]


In [12]:
s = Pr(alpha=0.85)
s.pr_calc()

Converged in 1 iterations: [0.0005774172128754898 0.0005774172128754898 0.0005774172128754898 ...
 0.0005774172128754898 0.0005774172128754898 0.0005774172128754898]
Converged in 2 iterations: [0.0005771339527899262 0.0005771339527899262 0.0005771339527899262 ...
 0.0005771339527899262 0.0005771339527899262 0.0005771339527899262]
Converged in 2 iterations: [0.000576850970549038 0.000576850970549038 0.000576850970549038 ...
 0.000576850970549038 0.000576850970549038 0.000576850970549038]
Converged in 2 iterations: [0.0005765682656740766 0.0005765682656740766 0.0005765682656740766 ...
 0.0005765682656740766 0.0005765682656740766 0.0005765682656740766]
Converged in 2 iterations: [0.0005762858377545928 0.0005762858377545928 0.0005762858377545928 ...
 0.0005762858377545928 0.0005762858377545928 0.0005762858377545928]
Converged in 2 iterations: [0.0005760036863802227 0.0005760036863802227 0.0005760036863802227 ...
 0.0005760036863802227 0.0005760036863802227 0.0005760036863802227]
Converged 

In [6]:
s.pr_result.sort_values(by='score', ascending=False)

Unnamed: 0,score
https://camt.cmu.ac.th/index.php/en/,0.007178
https://camt.cmu.ac.th/,0.005282
https://camt.cmu.ac.th/index.php/th/,0.004654
http://go.camt.cmu.ac.th,0.004486
https://oauth.cmu.ac.th/v1/Authorize.aspx?response_type=code&client_id=3pefrhnrAcsu4VSAexA4XW98a3d9cf4tE7a2QD09&redirect_uri=https://service.camt.cmu.ac.th/wilstu/home/callback&scope=cmuitaccount.basicinfo&state=xyx,0.004486
...,...
https://camt.cmu.ac.th/index.php/ข้อมูลคณะ/รายชื่อบุคลากร.html,0.000508
https://www.instagram.com/camt.cmu,0.000508
https://linevoom.line.me/user/_dTwqRiMksixOngcEgNzK_TqY-lNLf_ucd7_uYt4?utm_medium=windows&utm_source=desktop&utm_campaign=OA_Profile,0.000508
https://cmu.to/wmBQl,0.000508


## Integrate PageRank with es

In [7]:
from elasticsearch import Elasticsearch

In [8]:
class ElasticIndexer:
    def __init__(self):
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled/'
        print(self.crawled_folder  / 'url_list.pickle')
        with open(self.crawled_folder / 'url_list.pickle', 'rb') as f:
            self.file_mapper = pickle.load(f) 
        self.es_client = Elasticsearch('https://localhost:9200', basic_auth=("elastic", "6E0GWL_MEddnKJWCnk*M"),
                    ca_certs="./http_ca.crt")

    def run_indexer(self):
        self.pr = Pr(alpha=0.85)
        self.pr.pr_calc()
        self.es_client.indices.create(index='simple', ignore=400)
        self.es_client.indices.delete(index='simple', ignore=[400, 404])

        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                j = json.load(open(os.path.join(self.crawled_folder, file)))
                j['id'] = j['url']
                j['pagerank'] = self.pr.pr_result.loc[j['id']].score
                print(j['id'] , j['pagerank'] )
                self.es_client.index(index='simple', body=j)

In [9]:
indexer = ElasticIndexer()
indexer.run_indexer()

c:\SoftwareEngineer\Work\ir\crawled\url_list.pickle
Converged in 1 iterations: [0.0005774172128754898 0.0005774172128754898 0.0005774172128754898 ...
 0.0005774172128754898 0.0005774172128754898 0.0005774172128754898]
Converged in 2 iterations: [0.0005771339527899262 0.0005771339527899262 0.0005771339527899262 ...
 0.0005771339527899262 0.0005771339527899262 0.0005771339527899262]
Converged in 2 iterations: [0.000576850970549038 0.000576850970549038 0.000576850970549038 ...
 0.000576850970549038 0.000576850970549038 0.000576850970549038]
Converged in 2 iterations: [0.0005765682656740766 0.0005765682656740766 0.0005765682656740766 ...
 0.0005765682656740766 0.0005765682656740766 0.0005765682656740766]
Converged in 2 iterations: [0.0005762858377545928 0.0005762858377545928 0.0005762858377545928 ...
 0.0005762858377545928 0.0005762858377545928 0.0005762858377545928]
Converged in 2 iterations: [0.0005760036863802227 0.0005760036863802227 0.0005760036863802227 ...
 0.0005760036863802227 0.0

  self.es_client.indices.create(index='simple', ignore=400)
  self.es_client.indices.delete(index='simple', ignore=[400, 404])


https://www.instagram.com/camt.cmu/?hl=th 0.0006590913556461115
https://go.camt.cmu.ac.th/index.php/th/apply-for-study/apply-master 0.0005291709411329443
https://camt.cmu.ac.th/index.php/en/all-news-groups/56-research-and-innovation-funding.html 0.0007574953121793482
https://camt.cmu.ac.th/index.php/en/all-download/category/50-ข้อบังคับ-ระเบียบ-ประกาศมหาวิทยาลัยเชียงใหม่-วิจัยและบริการวิชาการ.html 0.0005425314936301137
https://camt.cmu.ac.th/index.php/th/หัวข้อกลุ่มข่าวทั้งหมด/54-ข่าวประกาศจากวิทยาลัยฯ/1110-change_room_954474.html 0.0006106686302683983
https://camt.cmu.ac.th/index.php/en/all-news-groups/24-ข่าวทั่วไป/1125-นศ-camt-ชนะเลิศประกวดโครงร่างวิจัยพลังงาน-ระดับอุดมศึกษา-ประเภท-software-innovation-ภายใต้แนวคิด-new-gen-energy-research-showcase.html 0.0007169365138486879
https://support.google.com/maps/?hl=en&authuser=0&p=no_javascript 0.0042841841943746925
https://go.camt.cmu.ac.th/index.php/th/major/graduate/doctoral-km 0.0005291709411329443
https://camt.cmu.ac.th/index.php/en/s

## Integrate with Flask

In [14]:
import time
from flask import Flask , request

app = Flask(__name__)
app.es_client = Elasticsearch('https://localhost:9200', 
                              basic_auth=("elastic", "6E0GWL_MEddnKJWCnk*M"),
                              ca_certs="./http_ca.crt")

@app.route('/search', methods=['GET'])
def search():
    start = time.time()
    response_object = {'status': 'success'}
    argList = request.args.to_dict(flat=False)
    query_term = argList['query'][0]
    results = app.es_client.search(index='simple', source_excludes=['url_lists'], size=100,
                                   query={"script_score": {"query": {"match": {"text": query_term}},
                                                           "script": {"source": "_score * doc['pagerank'].value"}}})
    end = time.time()
    total_hit = results['hits']['total']['value']
    results_df = pd.DataFrame([[hit["_source"]['title'], hit["_source"]['url'], hit["_source"]['text'][:100], hit["_score"]] 
                               for hit in results['hits']['hits']], columns=['title', 'url', 'text', 'score'])

    response_object['total_hit'] = total_hit
    response_object['results'] = results_df.to_dict('records')
    response_object['elapse'] = end - start

    return response_object

In [15]:
app.run(debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [19/Feb/2024 16:57:18] "GET /search?query=camt HTTP/1.1" 200 -
127.0.0.1 - - [19/Feb/2024 16:57:27] "GET /search?query=กานต์ HTTP/1.1" 200 -
127.0.0.1 - - [19/Feb/2024 16:57:32] "GET /search?query=kan HTTP/1.1" 200 -
127.0.0.1 - - [19/Feb/2024 16:57:36] "GET /search?query=SE HTTP/1.1" 200 -
127.0.0.1 - - [19/Feb/2024 16:57:51] "GET /search?query=Pattama HTTP/1.1" 200 -
127.0.0.1 - - [19/Feb/2024 16:57:56] "GET /search?query=School HTTP/1.1" 200 -
