# Information Retrieval Final Project

#### Elahe Habibzadeh Shojaee 9931014

--------------------------------------------------------------------------------------------------------------------------------------------------------

## Imports

In [1]:
import hazm
import parsivar
from string import punctuation
import json
import re
import numpy as np
from pathlib import Path
from collections import Counter
import collections

## Load Documents

In [2]:
def load_docs(path="IR_data_news_12k.json"):
    """Load the news corpus from a JSON file.

    The file must map a document id to an object holding at least the
    ``title``, ``content`` and ``url`` fields; any other field is dropped.

    Args:
        path: Location of the JSON corpus. Defaults to the project data file
            in the current working directory.

    Returns:
        tuple: ``(docs, contents, urls)`` where ``docs`` maps ``str(id)`` to a
        ``{'title', 'content', 'url'}`` dict, and ``contents`` / ``urls`` list
        the corresponding field of every document in file order.
    """
    docs = {}
    contents = []
    urls = []
    # Explicit UTF-8: the corpus is Persian text and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(path, 'r', encoding="utf-8") as file:
        raw_docs = json.load(file)
    for key, raw_doc in raw_docs.items():
        # index of files
        idx = str(key)
        # keep only the url, title and content of each doc
        docs[idx] = {
            'title': raw_doc['title'],
            'content': raw_doc['content'],
            'url': raw_doc['url'],
        }
        contents.append(raw_doc['content'])
        # the original returned `urls` without ever filling it
        urls.append(raw_doc['url'])
    return docs, contents, urls

In [3]:
docs, contents, urls = load_docs()
docs['6']

{'title': 'اعلام برنامه نشست خبری گل محمدی/ مجیدی هم باید به محل تمرین پرسپولیس برود!',
 'content': '\nبه گزارش خبرگزاری فارس و به نقل از\xa0 سایت باشگاه پرسپولیس، نشست خبری پیش از مسابقه سرمربیان دو تیم پرسپولیس و استقلال از هفته بیست و سوم لیگ برتر (جام خلیج فارس) با\xa0 مدیریت سازمان لیگ و هماهنگی\xa0باشگاه میزبان (پرسپولیس) در ورزشگاه شهید کاظمی به شرح زیر\xa0 برگزار می\u200cشود: چهارشنبه ۲۵ اسفند ساعت ۱۳ فرهاد مجیدی سرمربی استقلال ساعت ۱۳:۳۰ یحیی گل محمدی سرمربی پرسپولیس \xa0مسابقه دو تیم روز پنجشنبه در ورزشگاه آزادی برگزار می\u200cشود. \xa0به گزارش خبرگزاری فارس، پیش از این باشگاه استقلال اعلام کرده بود قرار است نشست خبری فرهاد مجیدی، سرمربی این تیم از ساعت ۱۵ در سازمان لیگ برگزار شود. انتهای پیام /\n\n\n',
 'url': 'https://www.farsnews.ir/news/14001224000865/اعلام-برنامه-نشست-خبری-گل-محمدی-مجیدی-هم-باید-به-محل-تمرین-پرسپولیس'}

In [4]:
import re

class DataNormalization:
    def __init__(self):
        # pattern for matching mi in start of token
        self.mi_patterns = r"\bن?می[آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی]+"
        
        # punctuation marks
        self.punc_after = r"\.:!،؛؟»\]\)\}"
        self.punc_before = r"«\[\(\{"
        self.all_punc_marks = r"[\.:!،؛؟»\'\]\)\}|«\[\(\/\{><+\-?!=_]"

        self.number_not_persian = "0123456789%٠١٢٣٤٥٦٧٨٩"
        self.number_persian = "۰۱۲۳۴۵۶۷۸۹٪۰۱۲۳۴۵۶۷۸۹"
        
        # fathe kasre ,....
        self.arabic_patterns = [
                    ("[\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652]", ""),
                    ("[ك]",'ک'),
                    ("[ي]","ی"),
                    ("[هٔ]","ه"),
                    ("[أ]","ا"),
            
                ]
        self.punctuation_spacing_patterns = [
                # remove space before and after quotation
                ('" ([^\n"]+) "', r'"\1"'),
                (" ([" + self.punc_after + "])", r"\1"),  # remove space before
                ("([" + self.punc_before + "]) ", r"\1"),  # remove space after
                # put space after . and :
                (
                    "([" + self.punc_after[:3] + "])([^ " + self.punc_after + r"\d۰۱۲۳۴۵۶۷۸۹])",
                    r"\1 \2",
                ),
                (
                    "([" + self.punc_after[3:] + "])([^ " + self.punc_after + "])",
                    r"\1 \2",
                ),  # put space after
                (
                    "([^ " + self.punc_before + "])([" + self.punc_before + "])",
                    r"\1 \2",
                ),  # put space before
                # put space after number
                (r"(\d)([آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی])", r"\1 \2"),
                # put space after number
                (r"([آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی])(\d)", r"\1 \2"),
            ]

        # some special unicodes which should be replaced by persian terms
        self.unicode_replacements = [
                    ("﷽"," بسم الله الرحمن الرحیم "),(" ﷼", " ریال"),
                    ("(ﷰ|ﷹ)", " صلی "),
                    (" ﷲ", " الله"),
                    (" ﷳ", " اکبر"),
                    (" ﷴ", " محمد"),
                    (" ﷵ", " صلعم"),
                    (" ﷶ", " رسول"),
                    (" ﷷ", " علیه"),
                    (" ﷸ", " وسلم"),
                    (" ﻵ|ﻶ|ﻷ|ﻸ|ﻹ|ﻺ|ﻻ|ﻼ", " لا"),
                ]
        # extra space patterns
        self.extra_space_patterns = [
            (r" {2,}", " "),           # remove extra spaces
            (r"\n{3,}", "\n\n"),       # remove extra newlines
            (r"\u200c{2,}", "\u200c"), # remove extra ZWNJs
            (r"\u200c{1,} ", " "),     # remove unneeded ZWNJs before space
            (r" \u200c{1,}", " "),     # remove unneeded ZWNJs after space
            (r"\b\u200c*\B", " "),      # remove unneeded ZWNJs at the beginning of words
            (r"\B\u200c*\b", " "),      # remove unneeded ZWNJs at the end of words
            (r"[ـ\r]", " "),           # remove keshide, carriage returns
        ]

        # space patterns for nimfasele
        self.spacing_patterns = [
            (r"\xa0"," "),  # remove no-break char
            (r"([^ ]) ی ", r"\1‌ی "),          # fix 'ی' space
            (r"(^| )(ن?می) ", r"\1\2‌"),        # fix 'می' and 'نمی' space
            (r"(?<=[^\n\d" + self.punc_after + self.punc_before + r"]{2}) (تر(ین?)?|گری?|های?)(?=[ \n" + self.punc_after + self.punc_before + r"]|$)", r"‌\1"),
            # fix suffix spacing
            (r"([^ ]ه) (ا(م|یم|ش|ند|ی|ید|ت))(?=[ \n" + self.punc_after + r"]|$)", r"\1‌\2"),  # fix verb conjugation spacing
            (r"(ه)(ها)", r"\1‌\2"),  
        ]

        # bons of verbs
        with Path('verbs.dat').open(encoding="utf8") as verbs_file:
                verbs = list(
                    reversed([verb.strip() for verb in verbs_file if verb]),
                )
                self.present_bons = {verb[1:].split("#")[0].strip() for verb in verbs[1:]}
                self.past_bons = {verb.split("#")[1] for verb in verbs}


    @staticmethod
    def regex_replace(patterns: list, text: str) -> str:
        for pattern, repl in patterns:
            text = re.sub(pattern, repl, text)
        return text

    # fix spacings
    def spacing_correction(self, text: str) -> str:
        text = self.regex_replace(self.extra_space_patterns, text)
        text = self.regex_replace(self.punctuation_spacing_patterns, text)
        text = self.regex_replace(self.spacing_patterns, text)
        return text

    # replace special characters
    def unicode_replacement(cls, text: str) -> str:
        for old, new in cls.unicode_replacements:
            text = re.sub(old, new, text)
        return text
        
    # convert numbers to persian numbers
    def persian_number(cls, text: str) -> str:
        translation_table = str.maketrans(
            cls.number_not_persian,
            cls.number_persian )
        translated_text = text.translate(translation_table)
        return translated_text
     
    # remove punctuation marks and arabic chars
    def remove_special_chars(cls, text: str) -> str:
        text = cls.remove_punc_marks(text)
        text = cls.remove_arabic_chars(text)
        return text

    # remove some arabic chars
    def remove_arabic_chars(cls, text: str) -> str:
        return cls.regex_replace(cls.arabic_patterns, text)

    # remove punctuation marks
    def remove_punc_marks(cls, text: str) -> str:
        return re.sub(cls.all_punc_marks, "", text)

    # separate "mi" at the start of verbs
    def seperate_mi(cls, text:str) -> str:
        matches = re.findall(cls.mi_patterns, text)
        for m in matches:
            r = re.sub("^(ن?می)", r"\1‌", m)
            # remove mi from token to check it contains the bon of a verb or not
            x = re.sub("^(ن?می)", "", m)
            for verb in cls.present_bons:
                if verb in x:
                    text = text.replace(m, r)
            for verb in cls.past_bons:
                if verb in x:
                    text = text.replace(m, r)
        return text

    # general normalization method to perform all above functions
    def normalize(cls, text:str) -> str:
        text = cls.remove_special_chars(text)
        text = cls.seperate_mi(text)
        text = cls.persian_number(text)
        text = cls.unicode_replacement(text)
        text = cls.spacing_correction(text)
        return text

## Normalization Test

In [5]:
normalizer = DataNormalization()
print(normalizer.seperate_mi("میرفتم"))
print(normalizer.spacing_correction("به نام های خدا‌ ی  درباره ی مهربان ترین"))
print(normalizer.normalize("به نام های  ﷲ خدا‌ ی 99 % درباره ی ً ٌ ٍسلامَ ُ ِ ّ ْ+ئئئ  مقابله أ ﷽مهربان >. ﷽  . ترین"))
print(normalizer.normalize("«العين التي تمتل بك لن تنظر لغيرك...»"))
print(normalizer.normalize("'ابقَ قویّا، خالهٔ فَقِصّتُکَ لم تَنتَهی بعد ..' "))


می‌رفتم
به نام‌های خدا‌ی درباره‌ی مهربان‌ترین
به نام‌های الله خدا‌ی ۹۹ ٪ درباره‌ی سلام ئئئ مقابله ا بسم الله الرحمن الرحیم مهربان بسم الله الرحمن الرحیم‌ترین
العین التی تمتل بک لن تنظر لغیرک
ابق قویا خالهه فقصتک لم تنتهی بعد 


## Preprocessing

In [6]:
class DataPreprocessing:
    # Stop-word table (token -> count). preprocess() assigns a new dict to
    # self.top_k on the instance; this class-level value is the initial default.
    top_k = {}
    # Tokenizer pattern: runs of ? / ! / ؟, runs of digits / '.' / ':', or a
    # single punctuation / bracket / quote / slash character. Tokenization()
    # surrounds every match with spaces.
    pattern = re.compile(r'([؟!?]+|[\d.:]+|[:.،؛»\])}"«\[({/\\])')
    # Auxiliary forms that FOLLOW the main part of a compound verb.
    # Tokenization() joins a token found in `verbe` with a following token
    # from this set using "_".
    after_verbs = {
                "ام",
                "ای",
                "است",
                "ایم",
                "اید",
                "اند",
                "بودم",
                "بودی",
                "بود",
                "بودیم",
                "بودید",
                "بودند",
                "باشم",
                "باشی",
                "باشد",
                "باشیم",
                "باشید",
                "باشند",
                   "شده",
            "نشده",
                "شوم",
                "شوی",
                "شود",
                "شویم",
                "شوید",
                "شوند",
                "شدم",
                "شدی",
                "شد",
                "شدیم",
                "شدید",
                "شدند",
                "نشوم",
                "نشوی",
                "نشود",
                "نشویم",
                "نشوید",
                "نشوند",
                "نشدم",
                "نشدی",
                "نشد",
                "نشدیم",
                "نشدید",
                "نشدند",
                "می‌شوم",
                "می‌شوی",
                "می‌شود",
                "می‌شویم",
                "می‌شوید",
                "می‌شوند",
                "می‌شدم",
                "می‌شدی",
                "می‌شد",
                "می‌شدیم",
                "می‌شدید",
                "می‌شدند",
                "نمی‌شوم",
                "نمی‌شوی",
                "نمی‌شود",
                "نمی‌شویم",
                "نمی‌شوید",
                "نمی‌شوند",
                "نمی‌شدم",
                "نمی‌شدی",
                "نمی‌شد",
                "نمی‌شدیم",
                "نمی‌شدید",
                "نمی‌شدند",
               
            }

    # Auxiliary forms that PRECEDE the main verb. Tokenization() joins any
    # token from this set with the token that follows it using "_".
    before_verbs = {
                "خواهم",
                "خواهی",
                "خواهد",
                "خواهیم",
                "خواهید",
                "خواهند",
                "نخواهم",
                "نخواهی",
                "نخواهد",
                "نخواهیم",
                "نخواهید",
                "نخواهند",
            }
    vere = {}
    def __init__(self):
        # save terms like گفته ، خورده which are bon mazi + ه
        with Path('verbs.dat').open(encoding="utf8") as verbs_file:
                verbs = list(
                    reversed([verb.strip() for verb in verbs_file if verb]),
                )
                DataPreprocessing.verbe = {(verb.split("#")[0] + 'ه') for verb in verbs}
     
    #Tokenization
    @staticmethod
    def Tokenization(text):
        text = DataPreprocessing.pattern.sub(r" \1 ", text.replace("\n", " ").replace("\t", " "))
        tokens = [word for word in text.split(" ") if word]
        tokens_cleaned = [token.strip('\xa0') for token in tokens if len(token.strip()) != 0]

        result = [""]
        # merge multi term verbs like خواهم رفت to خواهم_رفت
        for token in reversed(tokens_cleaned):
            if token in DataPreprocessing.before_verbs or (
                result[-1] in DataPreprocessing.after_verbs and token in DataPreprocessing.verbe
            ):
                result[-1] = token + "_" + result[-1]
            else:
                result.append(token)
        return list(reversed(result[1:]))
    
    #Normalization
    @staticmethod
    def Normalization(text):
        """Normalize ``text`` with a shared ``DataNormalization`` instance.

        The normalizer is created on first use and cached on the class.
        Building a new one per call re-read ``verbs.dat`` and rebuilt every
        pattern table for each document.

        Args:
            text: Raw text.

        Returns:
            The result of ``DataNormalization.normalize(text)``.
        """
        normalizer = getattr(DataPreprocessing, "_normalizer", None)
        if normalizer is None:
            normalizer = DataNormalization()
            DataPreprocessing._normalizer = normalizer
        return normalizer.normalize(text)
    
    #Stop_Words
    @staticmethod
    def Top_K_Frequent(tokens,k):
        token_counts = Counter(tokens)
        sorted_tokens = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
        stopwords_to_remove = [token for token, count in sorted_tokens[:k]]
        report = {token: count for token, count in sorted_tokens[:k]}
        return report
        
    # print top k frequent terms
    def print_top_k(self):
        for token, count in self.top_k.items():
            print(f"Token: {token}, Count: {count}")
            
    #Stemming
    @staticmethod
    def Stemming(tokens):
        """Map every token to its stem with ``parsivar.FindStems``.

        The stemmer is created on first use and cached on the class rather than
        rebuilt for every call (once per document during ``preprocess``).
        NOTE(review): assumes ``FindStems.convert_to_stem`` keeps no per-call
        state, so one instance can be shared; confirm against parsivar.

        Args:
            tokens: Iterable of token strings.

        Returns:
            list: ``convert_to_stem(token)`` for each token, in order.
        """
        stemmer = getattr(DataPreprocessing, "_stemmer", None)
        if stemmer is None:
            stemmer = parsivar.FindStems()
            DataPreprocessing._stemmer = stemmer
        return [stemmer.convert_to_stem(token) for token in tokens]
    
    #Remove Punctuations
    @staticmethod
    def Remove_Punctuations(text):
        return re.sub(f'[{punctuation}؟،٪×÷»«]+', '', text)

    # preprocess a text and return final tokens
    def simple_preprocess(self, content):
        """Preprocess one text (e.g. a query) and return its final tokens.

        Pipeline: ``Remove_Punctuations`` -> ``Normalization`` ->
        ``Tokenization`` -> ``Stemming``, then every token present in
        ``self.top_k`` (the stop words) is dropped.

        Args:
            content: Raw text.

        Returns:
            list: Stemmed tokens that are not stop words.
        """
        punctuated_content = self.Remove_Punctuations(content)
        normalized_content = self.Normalization(punctuated_content)
        tokens_of_a_sentence = self.Tokenization(normalized_content)
        final_tokens_of_a_sentence = self.Stemming(tokens_of_a_sentence)
        # read once: the stop-word table does not change during the filter
        stop_words = self.top_k
        return [token for token in final_tokens_of_a_sentence if token not in stop_words]
        
    # method to tokenize a text
    def tokenize(self, text):
        return self.Tokenization(text)

    # preprocess all given docs
    def preprocess(self, docs):
        """Preprocess every document in place and remove the corpus stop words.

        For each document the ``content`` string is replaced by its stemmed
        token list (punctuation removal -> normalization -> tokenization ->
        stemming). The 20 most frequent tokens of the whole corpus are then
        stored in ``self.top_k`` and filtered out of every document.

        Args:
            docs: Mapping of document id to a dict with a ``content`` entry.
                The mapping is modified in place.

        Returns:
            The same ``docs`` mapping, with ``content`` now a list of tokens.
        """
        corpus_tokens = []
        # prints whatever stop words are currently stored, before they are recomputed
        self.print_top_k()
        for processed, doc_key in enumerate(docs.keys(), start=1):
            entry = docs[str(doc_key)]
            without_punctuation = self.Remove_Punctuations(entry['content'])
            normalized = self.Normalization(without_punctuation)
            stems = self.Stemming(self.Tokenization(normalized))
            entry['content'] = stems
            corpus_tokens += stems
            # print progress
            if processed % 1000 == 0:
                print(processed, ' docs processed')
        # save top k frequent
        self.top_k = self.Top_K_Frequent(corpus_tokens, 20)
        # remove stop words from doc tokens
        for entry in docs.values():
            entry['content'] = [stem for stem in entry['content'] if stem not in self.top_k]
        return docs

In [7]:
global preprocessor
preprocessor = DataPreprocessing()

### Load and Preprocess Docs

In [8]:
docs, contents, urls = load_docs()

In [9]:
pre_processed_docs = preprocessor.preprocess(docs)

1000  docs processed
2000  docs processed
3000  docs processed
4000  docs processed
5000  docs processed
6000  docs processed
7000  docs processed
8000  docs processed
9000  docs processed
10000  docs processed
11000  docs processed
12000  docs processed


### Print Stop Words

In [10]:
preprocessor.print_top_k()

Token: و, Count: 219205
Token: در, Count: 164329
Token: به, Count: 133277
Token: از, Count: 92930
Token: این, Count: 82976
Token: که, Count: 76240
Token: با, Count: 68994
Token: را, Count: 67489
Token: کرد&کن, Count: 45104
Token: اس, Count: 44652
Token: برای, Count: 30996
Token: داشت&دار, Count: 30052
Token: تیم, Count: 27692
Token: شد&شو, Count: 26575
Token: کرد, Count: 22936
Token: هم, Count: 22393
Token: کشور, Count: 21730
Token: ما, Count: 19717
Token: یک, Count: 18733
Token: بود&باش, Count: 18028


### Example of Preprocessed Doc Content

In [11]:
pre_processed_docs['10']

{'title': 'تراکتور در آستانه بازگشت به خانه اصلی/ نواقص ورزشگاه یادگار امام بررسی شد',
 'content': ['گزارش',
  'خبرگزاری',
  'فارس',
  'نقل',
  'سایت',
  'باشگاه',
  'تراکتور',
  'میر',
  'معصوم',
  'سهراب',
  'مدیرعامل',
  'باشگاه',
  'همراه',
  'نمایندگانی',
  'استانداری',
  'آذربایجان',
  'شرقی',
  'معاون',
  'عملیات',
  'نیرو',
  'انتظامی',
  'استان',
  'نمایندگانی',
  'یگان',
  'ویژه',
  'استان',
  'مدیرکل',
  'ورزش',
  'جوان',
  'استان',
  'نماینده',
  'اداره',
  'ورزش',
  'جوان',
  'شهرستان',
  'تبریز',
  'مدیر',
  'ورزشگاه',
  'یادگار',
  'امام',
  'امروز',
  'سه',
  'شنبه',
  '۲۴',
  'اسفند',
  'بخش',
  'مختلف',
  'ورزشگاه',
  'یادگار',
  'امام',
  'ره',
  'تبریز',
  'بازدید',
  'کرده',
  'آخرین',
  'وضعیت',
  'نواقصات',
  'مجموعه',
  'میزبانی',
  'هوادار',
  'تراکتور',
  'بررسی',
  'طی',
  'بازدید',
  'تمهیدات',
  'لازم',
  'راستا',
  'آماده\u200cسازی',
  'ورزشگاه',
  'میزبانی',
  'بازی',
  'تراکتور',
  'اندیشیده_شد',
  'انتهای',
  'پیام'],
 'url': 'https://www.farsnews.ir/ne

## Positional Inverted Index

In [12]:
def Postings_List(Docs, champ_len):
    """Build the positional inverted index, tf-idf weights and champion lists.

    Args:
        Docs: Mapping of document id to a dict whose ``content`` entry is the
            document's token list.
        champ_len: Maximum number of documents kept in each champion list.

    Returns:
        tuple: ``(index, docs_vectors)``.

        ``index[term]`` holds ``frequency`` (total occurrences), ``docs``
        (doc id -> ``positions``, ``number_of_token``, ``tf_idf``) and
        ``champions_list`` (the documents with the highest ``number_of_token``,
        each with ``number_of_token`` and ``tf_idf``).

        ``docs_vectors[doc][term]`` holds ``tf_idf`` and ``tf``.
    """
    index = {}
    for doc_id, document in Docs.items():
        for position, token in enumerate(document['content']):
            # create the term entry / the posting on first sight, then update
            term_entry = index.setdefault(token, {'frequency': 0, 'docs': {}})
            posting = term_entry['docs'].setdefault(
                doc_id, {'positions': [], 'number_of_token': 0}
            )
            posting['positions'].append(position)
            posting['number_of_token'] += 1
            term_entry['frequency'] += 1

    total_docs = len(Docs)
    vectors = {}

    for term, term_entry in index.items():
        postings = term_entry['docs']
        doc_freq = len(postings)
        # tf-idf of this term for every document in its posting list
        for doc_id, posting in postings.items():
            term_count = posting['number_of_token']
            weight = (np.log10(total_docs / doc_freq)) * (1 + np.log10(term_count))
            posting['tf_idf'] = weight
            # keep the weight in the document vector for later query processing
            vectors.setdefault(doc_id, {})[term] = {'tf_idf': weight, 'tf': term_count}

        # champion list: postings ordered by descending term count,
        # cut to champ_len when the posting list is longer than that
        ranked_docs = sorted(
            postings,
            key=lambda doc: postings[doc]['number_of_token'],
            reverse=True,
        )
        if champ_len < doc_freq:
            ranked_docs = ranked_docs[:champ_len]
        term_entry['champions_list'] = {
            doc: {
                'number_of_token': postings[doc]['number_of_token'],
                'tf_idf': postings[doc]['tf_idf'],
            }
            for doc in ranked_docs
        }

    return index, vectors

### Create Dictionary

In [13]:
global dictionary, docs_vectors
dictionary, docs_vectors = Postings_List(pre_processed_docs, 20)

### Test Dictionary

In [14]:
docs_vectors['8535']

{'گزارش': {'tf_idf': 0.06586258585500605, 'tf': 1},
 'خبرگزاری': {'tf_idf': 0.007249774608743844, 'tf': 1},
 'ماه': {'tf_idf': 0.7600951597276173, 'tf': 1},
 'مراسم': {'tf_idf': 1.2864016614122344, 'tf': 1},
 'برگزار': {'tf_idf': 0.8213161665267152, 'tf': 2},
 'عنوان': {'tf_idf': 0.4918175114962707, 'tf': 1},
 'مقام': {'tf_idf': 1.11330316705667, 'tf': 1},
 'انتهای': {'tf_idf': 0.004723750559019523, 'tf': 1},
 'پیام': {'tf_idf': 0.003430351079835483, 'tf': 1},
 'جوان': {'tf_idf': 0.938136923221623, 'tf': 1},
 'برتر': {'tf_idf': 1.0520033409128284, 'tf': 2},
 'سازمان': {'tf_idf': 0.8321246883250831, 'tf': 1},
 'ملی': {'tf_idf': 0.5231873195159706, 'tf': 1},
 'سوم': {'tf_idf': 1.0765553869442086, 'tf': 1},
 'زیر': {'tf_idf': 0.805625092262702, 'tf': 1},
 'برنامه': {'tf_idf': 0.6990411943176393, 'tf': 1},
 'ورزش': {'tf_idf': 1.1264361823279525, 'tf': 1},
 'امور': {'tf_idf': 1.012712670310246, 'tf': 1},
 'پذیرفت&پذیر': {'tf_idf': 1.3192751545741883, 'tf': 1},
 'رقابت': {'tf_idf': 0.8514078

In [15]:
dictionary['فارس']['champions_list']

{'163': {'number_of_token': 35, 'tf_idf': 0.01201754284668332},
 '6404': {'number_of_token': 34, 'tf_idf': 0.011958074949524491},
 '11697': {'number_of_token': 33, 'tf_idf': 0.011896831631396633},
 '1322': {'number_of_token': 31, 'tf_idf': 0.011768571193969471},
 '1633': {'number_of_token': 31, 'tf_idf': 0.011768571193969471},
 '6755': {'number_of_token': 26, 'tf_idf': 0.011407731702494402},
 '7584': {'number_of_token': 26, 'tf_idf': 0.011407731702494402},
 '821': {'number_of_token': 24, 'tf_idf': 0.011243524183619751},
 '2388': {'number_of_token': 24, 'tf_idf': 0.011243524183619751},
 '7744': {'number_of_token': 24, 'tf_idf': 0.011243524183619751},
 '5395': {'number_of_token': 23, 'tf_idf': 0.011156213185640074},
 '525': {'number_of_token': 19, 'tf_idf': 0.010764263596368676},
 '8680': {'number_of_token': 16, 'tf_idf': 0.010411713000217027},
 '9687': {'number_of_token': 16, 'tf_idf': 0.010411713000217027},
 '10183': {'number_of_token': 16, 'tf_idf': 0.010411713000217027},
 '2098': {'n

In [16]:
dictionary['کمیسیون']

{'frequency': 6364,
 'docs': {'13': {'positions': [81, 94],
   'number_of_token': 2,
   'tf_idf': 1.0810483414120848},
  '89': {'positions': [295],
   'number_of_token': 1,
   'tf_idf': 0.8309173078368354},
  '100': {'positions': [157],
   'number_of_token': 1,
   'tf_idf': 0.8309173078368354},
  '106': {'positions': [36, 402, 418],
   'number_of_token': 3,
   'tf_idf': 1.2273656163202302},
  '140': {'positions': [488],
   'number_of_token': 1,
   'tf_idf': 0.8309173078368354},
  '461': {'positions': [449],
   'number_of_token': 1,
   'tf_idf': 0.8309173078368354},
  '465': {'positions': [283],
   'number_of_token': 1,
   'tf_idf': 0.8309173078368354},
  '472': {'positions': [299],
   'number_of_token': 1,
   'tf_idf': 0.8309173078368354},
  '479': {'positions': [55, 67],
   'number_of_token': 2,
   'tf_idf': 1.0810483414120848},
  '545': {'positions': [34, 68],
   'number_of_token': 2,
   'tf_idf': 1.0810483414120848},
  '708': {'positions': [13],
   'number_of_token': 1,
   'tf_idf':

### Dictionary Length

In [17]:
len(dictionary)

50440

### Save Dictionary 

#### Save Json to explore dictionary terms and structure

In [18]:
import json

# Split the dictionary into two parts because a single file would be too large
dictionary_terms = list(dictionary)
half_length = len(dictionary_terms) // 2
first_half = {key: dictionary[key] for key in dictionary_terms[:half_length]}
second_half = {key: dictionary[key] for key in dictionary_terms[half_length:]}

# ensure_ascii=False writes the Persian terms as readable UTF-8 text instead of
# \uXXXX escapes, which is the point of exporting the dictionary for browsing.

# Write the first half to the first file
with open('first_half.json', "w", encoding="utf-8") as first_file:
    json.dump(first_half, first_file, indent=4, ensure_ascii=False)

# Write the second half to the second file
with open('second_half.json', "w", encoding="utf-8") as second_file:
    json.dump(second_half, second_file, indent=4, ensure_ascii=False)

In [19]:
import pickle
db = {}
db['dictionary_12k'] = dictionary

# Binary mode is required for pickle. The file is opened with 'wb' (not 'ab'):
# appending would stack a new pickle after the old one on every re-run, and
# pickle.load() would keep returning the first, stale dictionary.
with open('dictionary_12k', 'wb') as dbfile:
    # source, destination
    pickle.dump(db, dbfile)

# Query Processing

In [20]:
import math
def calculate_tf_idf(f_td, N, n_t):
    """Return the log-scaled tf-idf weight ``(1 + log10(f_td)) * log10(N / n_t)``.

    Args:
        f_td: Number of occurrences of the term.
        N: Total number of documents.
        n_t: Number of documents containing the term.
    """
    log_term_frequency = 1 + np.log10(f_td)
    inverse_document_frequency = np.log10(N / n_t)
    weight = log_term_frequency * inverse_document_frequency
    return weight

In [21]:
def vector_length(vector_dict):
    """Return the Euclidean norm of the ``tf_idf`` values stored in ``vector_dict``.

    Args:
        vector_dict: Mapping of term to a dict with a ``tf_idf`` entry.
    """
    squared_total = 0
    for weights in vector_dict.values():
        squared_total += weights['tf_idf'] ** 2
    return math.sqrt(squared_total)

In [22]:
def query_scoring(query, total_number_of_docs, dictionary, k, champion_list = False):
    """Score documents against ``query`` with cosine and Jaccard-style measures.

    The query is preprocessed with the module-level ``preprocessor``; document
    vectors and token lists are read from the module-level ``docs_vectors`` and
    ``pre_processed_docs``.

    Args:
        query: Raw query text.
        total_number_of_docs: Number of documents in the collection.
        dictionary: Inverted index as returned by ``Postings_List``.
        k: Number of results to return per measure.
        champion_list: If true, only the documents in each term's champion
            list are scored; otherwise the full posting list is used.

    Returns:
        tuple: ``(cosine_top_k, jaccard_top_k)``, each a list of
        ``(doc_id, score)`` pairs (``doc_id`` as ``int``) sorted by descending
        score.
    """
    cosine_scores = {}
    jaccard_scores = {}
    query_tokens = preprocessor.simple_preprocess(query)
    query_tokens_count = dict(collections.Counter(query_tokens))
    print(query_tokens_count)
    query_terms_num = sum(query_tokens_count.values())

    for term, term_count in query_tokens_count.items():
        entry = dictionary.get(term)
        if entry is None:
            continue
        term_docs = entry['champions_list'] if champion_list else entry['docs']
        # The document frequency always comes from the FULL posting list. The
        # champion list is truncated, so its length is not the number of
        # documents containing the term and would inflate the idf.
        w_tq = calculate_tf_idf(term_count, total_number_of_docs, len(entry['docs']))
        for doc, posting in term_docs.items():
            doc_id = int(doc)
            # accumulate the dot product for cosine similarity
            cosine_scores[doc_id] = cosine_scores.get(doc_id, 0) + posting['tf_idf'] * w_tq
            # count the shared query terms for the Jaccard-style score
            jaccard_scores[doc_id] = jaccard_scores.get(doc_id, 0) + 1

    # calculate cosine score by dividing by doc vector length
    for doc_number in cosine_scores:
        doc_length = vector_length(docs_vectors[str(doc_number)])
        # a zero-length vector (every term has idf 0) would give NaN; score it 0
        cosine_scores[doc_number] = (
            cosine_scores[doc_number] / doc_length if doc_length else 0.0
        )

    # calculate jaccard score
    # NOTE(review): the denominator uses token counts with repetitions (document
    # length and total query length), not the number of distinct terms; confirm
    # this is the intended variant of Jaccard.
    for doc_number in jaccard_scores:
        shared_terms = jaccard_scores[doc_number]
        doc_tokens_num = len(pre_processed_docs[str(doc_number)]['content'])
        jaccard_scores[doc_number] = shared_terms / (doc_tokens_num + query_terms_num - shared_terms)

    # sort scores for top k
    sorted_doc_cosine = sorted(cosine_scores.items(), key=lambda x: x[1], reverse=True)
    sorted_doc_jaccard = sorted(jaccard_scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_doc_cosine[:k], sorted_doc_jaccard[:k]

In [23]:
def print_results(results):
    """Print rank, id, title and url of every result and return them as a dict.

    Titles and urls are looked up in the module-level ``docs`` mapping.

    Args:
        results: Sequence of ``(doc_id, score)`` pairs, best first.

    Returns:
        dict: ``rank -> {'docID', 'title', 'url'}`` with ranks starting at 1.
        Entries whose ``doc_id`` is ``None`` are skipped but still consume
        their rank.
    """
    dict_result = {}
    for rank, result in enumerate(results, start=1):
        doc_id = result[0]
        if doc_id is None:
            continue
        doc = docs[str(doc_id)]
        print(100*'-' + '\n')
        print(f'Rank: {rank}')
        print(f'ID: {doc_id}')
        print(f'{doc["title"]}')
        print(f'{doc["url"]}')
        dict_result[rank] = {'docID': doc_id,
                             'title': doc["title"],
                             'url'  : doc["url"]}
    return dict_result
        
def query_search(query, result_numbers = 5, champion_list = False):
    """Run ``query`` against the index and print the top results of both measures.

    Args:
        query: Raw query text.
        result_numbers: Number of results to show per measure.
        champion_list: Passed through to ``query_scoring``.

    Returns:
        tuple: ``(cosine_results, jaccard_results)`` as returned by
        ``print_results``; two empty dicts when nothing matched.
    """
    results1, results2 = query_scoring(query, len(docs), dictionary, result_numbers, champion_list)
    # Defaults for the "no match" case: r1 / r2 used to be unbound there, so the
    # final return raised UnboundLocalError.
    r1, r2 = {}, {}
    if len(results1) == 0 and len(results2) == 0:
        print("نتیجه ای یافت نشد")
    else:
        print("Cosine Scores:")
        print(100*'=')
        r1 = print_results(results1)
        print()
        print("Jaccard Scores:")
        print(100*'=')
        r2 = print_results(results2)
    return r1, r2

In [24]:
r1, r2 = query_search('کمیسیون اجتهاد', result_numbers = 5, champion_list = True)

{'کمیسیون': 1, 'اجتهاد': 1}
Cosine Scores:
----------------------------------------------------------------------------------------------------

Rank: 1
ID: 12141
«ماموستا عبدالسلام کریمی» مشاور رئیس جمهور در امور اقوام و اقلیت‌های دینی و مذهبی شد
https://www.farsnews.ir/news/14000725000846/ماموستا-عبدالسلام-کریمی-مشاور-رئیس-جمهور-در-امور-اقوام-و-اقلیت‌های
----------------------------------------------------------------------------------------------------

Rank: 2
ID: 9816
تحقیر زن در اندیشه غرب
https://www.farsnews.ir/news/14000923000557/تحقیر-زن-در-اندیشه-غرب
----------------------------------------------------------------------------------------------------

Rank: 3
ID: 12198
نقدی بر یادداشت «مرزبندی گفتمانی با طالبان»/ وارونه‌نمایی گفتمانی اصلاح‌طلبان
https://www.farsnews.ir/news/14000724000611/نقدی-بر-یادداشت-مرزبندی-گفتمانی-با-طالبان-وارونه‌نمایی-گفتمانی
----------------------------------------------------------------------------------------------------

Rank: 4
ID: 11935
تفکر نه