In [7]:
import json
from pathlib import Path
import os
import pickle
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from nltk.corpus import stopwords
# from util import create_custom_preprocessor, create_stem_cache
import BM_ranking
from nltk.stem import PorterStemmer

In [8]:
def preProcessor(s):
    ps = PorterStemmer()
    s = re.sub(r'[^A-Za-z]', ' ', s)
    s = re.sub(r'\s+', ' ' , s)
    s = [word for word in s.split(' ') if len(word) > 2]
    s = [ps.stem(w) for w in s]
    s = ' '.join(s)
    return s

In [9]:
class Indexer:
    def __init__(self):
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled/'
        self.stored_file = 'resource/manual_indexer.pkl'
        if os.path.isfile(self.stored_file):
            with open(self.stored_file, 'rb') as f:
                cached_dict = pickle.load(f)
            self.__dict__.update(cached_dict)
        else:
            self.run_indexer()

    def run_indexer(self):
        documents = []
        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                try:
                    j = json.load(open(os.path.join(self.crawled_folder, file)))
                    documents.append(j)
                except:
                    continue
        self.documents = pd.DataFrame.from_dict(documents)

        # print(self.documents.apply(lambda s: ' '.join(s[['title', 'text']]), axis=1))
        tfidf_vecotorizor = TfidfVectorizer(preprocessor=preProcessor, stop_words=stopwords.words('english'), use_idf=True)
        self.bm25 = BM_ranking.BM25(tfidf_vecotorizor)
        self.bm25.fit(self.documents.apply(lambda s: ' '.join(s[['title', 'text']]), axis=1))
        with open(self.stored_file, 'wb') as f:
            pickle.dump(self.__dict__, f)

    def search(self, query):
        score = self.bm25.transform(query)
        score = pd.DataFrame(score , columns=["score"])
        df = self.documents.join(score)
        df = df.sort_values(by=["score"], ascending=False)
        return df
        # print(self.documents.iloc[rank[:5]].to_markdown())



In [10]:
indexer = Indexer()
indexer.run_indexer()
indexer.search('school')



Unnamed: 0,url,title,text,url_lists,score
196,https://go.camt.cmu.ac.th/index.php/th/2019-05...,Gifted School 2020,"Choose your language ไทย li dir=""ltr"" ...","[http://www.go-camt.com/index.php/th/, http://...",5.105546
119,https://www.grad.cmu.ac.th/index.php?lang=en,"Graduate School, Chiang Mai University",MIdS : Multidisciplinary and Interdisciplinary...,"[https://cmu.to/admission/, https://w3.grad.cm...",5.014414
197,https://service.camt.cmu.ac.th/gifted,Gift School 2023,<< คลิกที่นี่ >> ระบบรับสมัคร Gifted School | ...,[https://service.camt.cmu.ac.th/gifted/gifted/...,4.851804
82,https://go.camt.cmu.ac.th/index.php/th/major/g...,การจัดการความรู้และนวัตกรรม ป.โท,Choose your language ไทย English (UK)...,"[https://go.camt.cmu.ac.th/index.php/th/, http...",4.006397
7,https://go.camt.cmu.ac.th/index.php/th/major/g...,การจัดการความรู้และนวัตกรรม ป.เอก,Choose your language ไทย English (UK)...,"[https://go.camt.cmu.ac.th/index.php/th/, http...",3.942288
...,...,...,...,...,...
83,https://camt.cmu.ac.th/index.php/en/all-downlo...,All Download,Home About us Back Visio...,"[https://camt.cmu.ac.th/index.php/en/, https:/...",0.000000
84,https://camt.cmu.ac.th/index.php/th/หัวข้อกลุ่...,นศ. CAMT ชนะเลิศประกวดโครงร่างวิจัยพลังงาน ระด...,หน้าหลัก รู้จักเรา Back ...,"[https://camt.cmu.ac.th/index.php/th/, https:/...",0.000000
85,http://www.faboba.com,Faboba - Agence web joomla,Connexion Le panier est vide Mon co...,"[https://camt.cmu.ac.th/en/, https://camt.cmu....",0.000000
86,https://camt.cmu.ac.th/index.php/en/index.php#...,Home,Home About us Back Visio...,"[https://camt.cmu.ac.th/index.php/en/, https:/...",0.000000
