In [5]:
import json
from pathlib import Path
import os
import pickle
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from nltk.corpus import stopwords
from util import create_custom_preprocessor, create_stem_cache
import BM_ranking
from nltk.stem import PorterStemmer

In [6]:
def preProcessor(s):
    ps = PorterStemmer()
    s = re.sub(r'[^A-Za-z]', ' ', s)
    s = re.sub(r'\s+', ' ' , s)
    s = [word for word in s.split(' ') if len(word) > 2]
    s = [ps.stem(w) for w in s]
    s = ' '.join(s)
    return s

In [7]:
class Indexer:
    def __init__(self):
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled/'
        self.stored_file = 'resource/manual_indexer.pkl'
        if os.path.isfile(self.stored_file):
            with open(self.stored_file, 'rb') as f:
                cached_dict = pickle.load(f)
            self.__dict__.update(cached_dict)
        else:
            self.run_indexer()

    def run_indexer(self):
        documents = []
        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                try:
                    j = json.load(open(os.path.join(self.crawled_folder, file)))
                    documents.append(j)
                except:
                    continue
        self.documents = pd.DataFrame.from_dict(documents)

        # print(self.documents.apply(lambda s: ' '.join(s[['title', 'text']]), axis=1))
        tfidf_vecotorizor = TfidfVectorizer(preprocessor=preProcessor, stop_words=stopwords.words('english'), use_idf=True)
        self.bm25 = BM_ranking.BM25(tfidf_vecotorizor)
        self.bm25.fit(self.documents.apply(lambda s: ' '.join(s[['title', 'text']]), axis=1))
        with open(self.stored_file, 'wb') as f:
            pickle.dump(self.__dict__, f)

    def search(self, query):
        score = self.bm25.transform(query)
        score = pd.DataFrame(score , columns=["score"])
        df = self.documents.join(score)
        df = df.sort_values("score", ascending=False)
        return df
        # print(self.documents.iloc[rank[:5]].to_markdown())



In [8]:
indexer = Indexer()
indexer.run_indexer()
indexer.search('school')



Unnamed: 0,url,title,text,url_lists,score
340,https://go.camt.cmu.ac.th/index.php/th/2019-05...,Gifted School 2020,"Choose your language ไทย li dir=""ltr"" ...","[http://www.go-camt.com/index.php/th/, http://...",5.369585
211,https://www.grad.cmu.ac.th/,"Graduate School, Chiang Mai University",MIdS : Multidisciplinary and Interdisciplinary...,"[https://cmu.to/admission/, https://w3.grad.cm...",5.229049
256,https://www.grad.cmu.ac.th,"Graduate School, Chiang Mai University",MIdS : Multidisciplinary and Interdisciplinary...,"[https://cmu.to/admission/, https://w3.grad.cm...",5.229049
31,https://www.grad.cmu.ac.th/index.php?lang=en,"Graduate School, Chiang Mai University",MIdS : Multidisciplinary and Interdisciplinary...,"[https://cmu.to/admission/, https://w3.grad.cm...",5.229049
135,https://service.camt.cmu.ac.th/gifted,Gift School 2023,<< คลิกที่นี่ >> ระบบรับสมัคร Gifted School | ...,[https://service.camt.cmu.ac.th/gifted/gifted/...,5.042515
...,...,...,...,...,...
117,https://camt.cmu.ac.th/index.php/th/หัวข้อกลุ่...,คณะผู้บริหารและอาจารย์ CAMT ร่วมถ่ายภาพหมู่แล...,หน้าหลัก รู้จักเรา Back ...,"[https://camt.cmu.ac.th/index.php/th/, https:/...",0.000000
116,https://camt.cmu.ac.th/index.php/th/เกี่ยวกับเ...,fin cafe ครั้งที่ 7 ระบบสำรวจความต้องการวัสดุ ...,หน้าหลัก เกี่ยวกับเรา Back ...,"[https://lin.ee/VlA5L51, https://camt.cmu.ac.t...",0.000000
115,https://www.google.com/forms/about/?utm_source...,Google Forms: Online Form Creator | Google Wor...,Jump to Content Forms Sign...,"[https://camt.cmu.ac.th/index.php/en/#content,...",0.000000
114,https://camt.cmu.ac.th/index.php/en/all-news-g...,CAMT ร่วม MOU โรงเรียนเชียงใหม่คริสเตียน,Home About us Back Visio...,"[https://camt.cmu.ac.th/index.php/en/, https:/...",0.000000
