#Instaling essential libraries

In [None]:
!pip install hazm

#Importing libraries

In [2]:
import hazm
import nltk
import pandas as pd

ModuleNotFoundError: No module named 'hazm'

#English Dataset

In [None]:
!wget http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz
!tar -xf MovieSummaries.tar.gz

--2023-11-09 18:16:57--  http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz
Resolving www.cs.cmu.edu (www.cs.cmu.edu)... 128.2.42.95
Connecting to www.cs.cmu.edu (www.cs.cmu.edu)|128.2.42.95|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 48002242 (46M) [application/x-gzip]
Saving to: ‘MovieSummaries.tar.gz’


2023-11-09 18:17:13 (2.96 MB/s) - ‘MovieSummaries.tar.gz’ saved [48002242/48002242]



In [None]:
df2 = pd.read_csv("/content/MovieSummaries/plot_summaries.txt", delimiter = "\t",names=["id","text"])
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42303 entries, 0 to 42302
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      42303 non-null  int64 
 1   text    42303 non-null  object
dtypes: int64(1), object(1)
memory usage: 661.1+ KB


In [None]:
df2.head()

Unnamed: 0,id,text
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [None]:
df2.iloc[333:343]

Unnamed: 0,id,text
333,3182458,The stories follow a black formal tailcoat as ...
334,14901197,"Crossing the plains, a wagon train comes acros..."
335,474750,A new heroine has arrived in Gotham whose iden...
336,32777283,"Justin Bayard, a Northern Territory policeman,..."
337,6002416,The central character is the Post Master who i...
338,3322205,Pooja Dharamchand is the daughter of a rich M...
339,3006994,The film features two anonymous Scottish-accen...
340,9537791,"Sam, a college student in a small Northwestern..."
341,23910715,"She's in for a wild ride, when Jean Madison , ..."
342,45772,"On November 25, 1975, Rocky Balboa is introd..."


**import essential libraries**

In [None]:
import string
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

first we look for null values in english dataset

In [None]:
print(df2.isnull().sum())

id      0
text    0
dtype: int64


**step 1: tokenizing words**

then we tokenize the `text` column by words

In [None]:
df2['text'] = df2['text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
df2['tokenized_text'] = df2['text'].apply(lambda x: word_tokenize(x))

**step 2: normalizing tokens**

after tokenizing, we normalize the `tokenized_text` column. This includes lowecasing the tokens.

In [None]:
df2['normalized_tokens'] = df2['tokenized_text'].apply(lambda x: [token.lower() for token in x])

**step 3: stemming normalized tokens**

In [None]:
def stem_normalized_tokens(tokens):
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

df2['stemmed_tokens'] = df2['normalized_tokens'].apply(stem_normalized_tokens)

In [None]:
df2.loc[0, 'stemmed_tokens']

['shlykov',
 'a',
 'hardwork',
 'taxi',
 'driver',
 'and',
 'lyosha',
 'a',
 'saxophonist',
 'develop',
 'a',
 'bizarr',
 'loveh',
 'relationship',
 'and',
 'despit',
 'their',
 'prejudic',
 'realiz',
 'they',
 'arent',
 'so',
 'differ',
 'after',
 'all']

**step 4: removing stopwords**

In [None]:
stop_words = set(stopwords.words('english'))

def remove_stop_words(tokens):
    return [token for token in tokens if token not in stop_words and token.isalpha()]

df2['final_tokens'] = df2['stemmed_tokens'].apply(remove_stop_words)

In [None]:
df2.loc[0, 'final_tokens']

['shlykov',
 'hardwork',
 'taxi',
 'driver',
 'lyosha',
 'saxophonist',
 'develop',
 'bizarr',
 'loveh',
 'relationship',
 'despit',
 'prejudic',
 'realiz',
 'arent',
 'differ']

**building the inverted index**

In [None]:
inverted_index = {}

for index, row in df2.head(100).iterrows():
    document_id = index
    stemmed_tokens = row['stemmed_tokens']

    term_frequency = {}
    for term in stemmed_tokens:
        term_frequency[term] = term_frequency.get(term, 0) + 1

    for term, frequency in term_frequency.items():
        if term in inverted_index:
            inverted_index[term].append((frequency, document_id))
            inverted_index[term] = sorted(inverted_index[term], key=lambda x: x[0], reverse=True)

        else:
            inverted_index[term] = [(frequency, document_id)]



In [None]:
inverted_index

{'shlykov': [(1, 0)],
 'a': [(87, 98),
  (33, 63),
  (31, 3),
  (30, 72),
  (28, 14),
  (26, 1),
  (26, 26),
  (25, 8),
  (24, 40),
  (21, 34),
  (21, 81),
  (21, 93),
  (20, 6),
  (20, 13),
  (20, 42),
  (20, 74),
  (19, 59),
  (19, 84),
  (19, 90),
  (19, 95),
  (18, 75),
  (17, 54),
  (17, 76),
  (17, 77),
  (16, 41),
  (15, 18),
  (15, 70),
  (14, 7),
  (14, 24),
  (14, 35),
  (14, 96),
  (13, 37),
  (13, 82),
  (13, 94),
  (12, 4),
  (12, 5),
  (12, 21),
  (11, 51),
  (11, 86),
  (11, 88),
  (10, 36),
  (10, 64),
  (10, 65),
  (10, 68),
  (9, 28),
  (9, 46),
  (9, 60),
  (9, 97),
  (8, 17),
  (8, 57),
  (8, 66),
  (8, 69),
  (7, 2),
  (7, 25),
  (7, 33),
  (7, 49),
  (6, 9),
  (6, 29),
  (6, 30),
  (6, 47),
  (6, 58),
  (6, 71),
  (6, 89),
  (5, 11),
  (5, 32),
  (5, 44),
  (5, 80),
  (5, 83),
  (5, 87),
  (4, 10),
  (4, 20),
  (4, 48),
  (3, 0),
  (3, 16),
  (3, 19),
  (3, 38),
  (3, 45),
  (3, 52),
  (3, 92),
  (2, 15),
  (2, 31),
  (2, 53),
  (2, 55),
  (2, 56),
  (2, 67),
  (1

**building the term occurence matrix (binary coded)**

In [None]:
term_occurrences = {}

unique_doc_ids = []
for postings in inverted_index.values():
    for _, doc_id in postings:
        if doc_id not in unique_doc_ids:
            unique_doc_ids.append(doc_id)

for term, postings in inverted_index.items():
    term_occurrence_list = [1 if doc_id in [doc_id for _, doc_id in postings] else 0 for doc_id in unique_doc_ids]
    term_occurrences[term] = term_occurrence_list


**boolean IR**

In [None]:
import re

query = input("please enter your query: ")

substrings = re.split(r'([+.])', query)

terms = []
terms_bin = []
expressions = []
binary_code = ""

for sub in substrings:
    # terms
    if sub and sub not in ['+', '.']:
        if sub.startswith('!'):
            binary_code = term_occurrences[sub[1:]]
            binary_code = [1 if bit == 0 else 0 for bit in binary_code]

        else:
            binary_code = term_occurrences[sub]

        terms_bin.append(binary_code)
        terms.append(sub)

    # expressions
    elif sub:
        expressions.append(sub)

for i, exp in enumerate(expressions):
    if exp == '+':
        result_or = [bit1 | bit2 for bit1, bit2 in zip(terms_bin[i], terms_bin[i+1])]
        terms_bin[i+1] = result_or
    elif exp == '.':
        result_and = [bit1 & bit2 for bit1, bit2 in zip(terms_bin[i], terms_bin[i+1])]
        terms_bin[i+1] = result_and

result = terms_bin[-1]
documents = [index  for index, value in enumerate(result) if value == 1]

print(documents)


please enter your query: six
[53]


#Persian Dataset

In [3]:
!wget https://github.com/mohamad-dehghani/persian-pdf-books-dataset/raw/master/final_books.xlsx

--2023-11-10 07:36:39--  https://github.com/mohamad-dehghani/persian-pdf-books-dataset/raw/master/final_books.xlsx
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/mohamad-dehghani/persian-pdf-books-dataset/master/final_books.xlsx [following]
--2023-11-10 07:36:39--  https://raw.githubusercontent.com/mohamad-dehghani/persian-pdf-books-dataset/master/final_books.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1380625 (1.3M) [application/octet-stream]
Saving to: ‘final_books.xlsx’


2023-11-10 07:36:40 (31.1 MB/s) - ‘final_books.xlsx’ saved [1380625/1380625]



In [85]:
farsi_df = pd.read_excel('/content/final_books.xlsx')
farsi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2824 entries, 0 to 2823
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     2824 non-null   object
 1   date      2824 non-null   object
 2   content   2441 non-null   object
 3   category  2824 non-null   object
 4   author    2824 non-null   object
 5   comments  2824 non-null   object
dtypes: object(6)
memory usage: 132.5+ KB


In [67]:
farsi_df.head()

Unnamed: 0,title,date,content,category,author,comments
0,,۲ دی ۱۳۹۸,,,,بدون دیدگاه
1,,۱ دی ۱۳۹۸,,,,بدون دیدگاه
2,,۲۹ آذر ۱۳۹۸,,,,بدون دیدگاه
3,,۲۸ آذر ۱۳۹۸,,,,بدون دیدگاه
4,,۲۶ آذر ۱۳۹۸,,,,بدون دیدگاه


In [23]:
farsi_df.iloc[440:480]

Unnamed: 0,title,date,content,category,author,comments
440,صوتی هاشمی بدون روتوش,۱۸ بهمن ۱۳۹۶,کتاب صوتی «هاشمی بدون روتوش» مجموعه گفت‌و‌گوها...,دسته‌بندی نشده,صادق زیباکلام، فرشته‌السادات اتفاق‌فر,۱ دیدگاه
441,تاریخ زندگی اقتصادی روستاییان و طبقات اجتماعی...,۱۷ بهمن ۱۳۹۶,موضوع این کتاب چنانکه از نام آن برمی آید دربا...,تاریخ ایران,غلامرضا انصافپور,۱ دیدگاه
442,کاهش وزن سریع با ۳۰ تکنیک طلایی,۱۵ بهمن ۱۳۹۶,با کتاب کاهش وزن سریع با ۳۰ تکنیک طلایی نوشته...,پزشکی و سلامت,ابراهیم رجب زاده,۲ دیدگاه
443,صوتی روانشناسی زنان,۱۵ بهمن ۱۳۹۶,کتاب صوتی «روانشناسی زنان» (Feminine Psycholog...,روانشناسی,کارن هورنای,بدون دیدگاه
444,ایران در عهد باستان,۱۳ بهمن ۱۳۹۶,ایران در عهد باستان: در تاریخ اقوام و پادشاها...,تاریخ ایران,محمدجواد مشکور,بدون دیدگاه
445,صوتی ناکامی چپ در ایران : شورشیان آرمانخواه,۱۲ بهمن ۱۳۹۶,کتاب صوتی «ناکامی چپ در ایران : شورشیان آرمان...,دسته‌بندی نشده,مازیار بهروز,بدون دیدگاه
446,آن زندگی را انتخاب کنید که می خواهید,۱۰ بهمن ۱۳۹۶,اواسط ژانویه بود. در حیاط دانشگاه هاروارد به ...,روانشناسی,تال بن شاهار,بدون دیدگاه
447,درباره‌ی زمان,۹ بهمن ۱۳۹۶,مفهوم زمان چندان آشنا و نزدیک است، که نامفهوم...,فلسفه,شروین وکیلی,بدون دیدگاه
448,صوتی شهریار,۸ بهمن ۱۳۹۶,کتاب صوتی «شهریار» اثر نیکولو ماکیاولی (۱۵۲۷-۱...,دسته‌بندی نشده,نیکولو ماکیاولی,۲ دیدگاه
449,PHP به زبان ساده,۸ بهمن ۱۳۹۶,در این کتاب سعی شده است که مطالب به زبان و کده...,برنامه نویسی,یونس ابراهیمی,بدون دیدگاه


**import essential libraries**

In [101]:
from hazm import word_tokenize
from hazm import Normalizer
from hazm import Stemmer
from hazm import stopwords_list
import re

first we look for null values in english dataset

In [88]:
print(farsi_df.isnull().sum())

title         0
date          0
content     383
category      0
author        0
comments      0
all_info      0
dtype: int64


looks like 383 columns of `content` is missing. There are 2 options: we can delete the rows or just leave them blank. The first option is not efficeint since we are removing so much information. In next steps, we will us titles instead of content.

**remove ponctuations**

In [102]:
def remove_punctuation(text):
    if pd.notna(text):
        return re.sub(r'[^\w\s]|[.،؛]', '', text)
    else:
        return text

# df['content'] = df['content'].str.replace('_', '\u200c')
farsi_df['title'] = farsi_df['title'].apply(remove_punctuation)
farsi_df['content'] = farsi_df['content'].apply(remove_punctuation)
farsi_df['category'] = farsi_df['category'].apply(remove_punctuation)
farsi_df['author'] = farsi_df['author'].apply(remove_punctuation)


In [103]:
farsi_df['all_info'] = (
    farsi_df['title'].fillna('') +
    ' ' +
    farsi_df['content'].fillna('') +
    ' ' +
    farsi_df['category'].fillna('') +
    ' ' +
    farsi_df['author'].fillna('')
)

In [104]:
farsi_df.head(1)

Unnamed: 0,title,date,content,category,author,comments,all_info,tokens,normalized_tokens,stemmed_tokens
0,بهترین درسهای زندگی,۲ دی ۱۳۹۸,تی دی جیکس یک کشیش نویسنده سخنران آمریکایی اس...,روانشناسی,سارا رزولت,بدون دیدگاه,بهترین درسهای زندگی تی دی جیکس یک کشیش نویسن...,"[بهترین, درسهای, زندگی, تی, دی, جیکس, یک, کشیش...","[بهترین, درسهای, زندگی, تی, دی, جیکس, یک, کشیش...","[به, درس, زندگ, ت, د, جیکس, یک, کش, نویسنده, س..."


**step 1: tokenizing words**

then we tokenize the `text` column by words

In [105]:
farsi_df['tokens'] = farsi_df['all_info'].apply(lambda x: word_tokenize(x))

In [None]:
farsi_df.head(1)

**step 2: normalizing tokens**

after tokenizing, we normalize the `tokenized_text` column. This includes lowecasing the tokens.

In [106]:
normalizer = Normalizer()
normalize_tokens = lambda tokens: [normalizer.normalize(token) for token in tokens]
farsi_df['normalized_tokens'] = farsi_df['tokens'].apply(normalize_tokens)

In [None]:
farsi_df.head(1)

**step 3: stemming normalized tokens**

In [None]:
farsi_df.loc[30, 'normalized_tokens']

In [97]:
stemmer = Stemmer()

farsi_df['stemmed_tokens'] = farsi_df['normalized_tokens'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])

**step 4: removing stopwords**

In [120]:
persian_stopwords = set(stopwords_list())
custom_stopwords = ['از', 'به', 'با', 'در', 'برای', 'بر', 'آن', 'او', 'این', 'ها', 'بله', 'آره', 'نه', 'اون', 'که', 'بدون', 'هر', 'شاید', 'باید', 'خیلی']

farsi_df['final_tokens'] = farsi_df['stemmed_tokens'].apply(lambda tokens: [token for token in tokens if token not in persian_stopwords])
farsi_df['final_tokens'] = farsi_df['final_tokens'].apply(lambda tokens: [token for token in tokens if token not in custom_stopwords])

In [None]:
farsi_df.loc[0, 'final_tokens']

**building the inverted index**

In [123]:
inverted_index = {}

for index, row in farsi_df.head(100).iterrows():
    document_id = index
    stemmed_tokens = row['final_tokens']

    term_frequency = {}
    for term in stemmed_tokens:
        term_frequency[term] = term_frequency.get(term, 0) + 1

    for term, frequency in term_frequency.items():
        if term in inverted_index:
            inverted_index[term].append((frequency, document_id))
            inverted_index[term] = sorted(inverted_index[term], key=lambda x: x[0], reverse=True)

        else:
            inverted_index[term] = [(frequency, document_id)]

In [None]:
inverted_index

**building the term occurence matrix (binary coded)**

In [125]:
term_occurrences = {}

unique_doc_ids = []
for postings in inverted_index.values():
    for _, doc_id in postings:
        if doc_id not in unique_doc_ids:
            unique_doc_ids.append(doc_id)

for term, postings in inverted_index.items():
    term_occurrence_list = [1 if doc_id in [doc_id for _, doc_id in postings] else 0 for doc_id in unique_doc_ids]
    term_occurrences[term] = term_occurrence_list


**boolean IR**

In [126]:
import re

query = input("please enter your query: ")

substrings = re.split(r'([+.])', query)

terms = []
terms_bin = []
expressions = []
binary_code = ""

for sub in substrings:
    # terms
    if sub and sub not in ['+', '.']:
        if sub.startswith('!'):
            binary_code = term_occurrences[sub[1:]]
            binary_code = [1 if bit == 0 else 0 for bit in binary_code]

        else:
            binary_code = term_occurrences[sub]

        terms_bin.append(binary_code)
        terms.append(sub)

    # expressions
    elif sub:
        expressions.append(sub)

for i, exp in enumerate(expressions):
    if exp == '+':
        result_or = [bit1 | bit2 for bit1, bit2 in zip(terms_bin[i], terms_bin[i+1])]
        terms_bin[i+1] = result_or
    elif exp == '.':
        result_and = [bit1 & bit2 for bit1, bit2 in zip(terms_bin[i], terms_bin[i+1])]
        terms_bin[i+1] = result_and

result = terms_bin[-1]
documents = [index  for index, value in enumerate(result) if value == 1]

print(documents)


please enter your query: زندگ
[1, 3, 4, 7, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]
