# Installing essential libraries

In [1]:
!pip install hazm



# Importing libraries

In [2]:
import hazm
import nltk
import pandas as pd
import re
from google.colab import drive
import glob
import os
import numpy as np
import string

# Mutual Functions

**inverted index**

In [3]:
from collections import defaultdict

def build_inverted_index(df, column_name, max_documents=1001):
    """Build a term -> postings inverted index from a column of token lists.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document; each row's index label is used as the
        document id stored in the postings.
    column_name : str
        Name of the column that holds the iterable of tokens of each document.
    max_documents : int or None, optional
        Only the first ``max_documents`` rows are indexed. Defaults to 1001,
        the limit that used to be hard-coded in this function. Pass ``None``
        to index every row of ``df``.

    Returns
    -------
    collections.defaultdict
        Maps each term to a list of ``(frequency, document_id)`` tuples,
        sorted by descending frequency. Postings with equal frequency keep
        the order in which the documents were read.
    """
    documents = df if max_documents is None else df.head(max_documents)
    inverted_index = defaultdict(list)

    for document_id, row in documents.iterrows():
        # Count how often each term occurs in this document.
        term_frequency = defaultdict(int)
        for term in row[column_name]:
            term_frequency[term] += 1

        for term, frequency in term_frequency.items():
            inverted_index[term].append((frequency, document_id))

    # list.sort is stable (also with reverse=True), so ties keep document order.
    for postings in inverted_index.values():
        postings.sort(key=lambda posting: posting[0], reverse=True)

    return inverted_index


# English Dataset

**import essential libraries**

In [None]:
import string
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import pickle
import os

# Download the NLTK 'punkt' tokenizer data and the 'stopwords' corpus into the
# local nltk_data directory.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

**step 1: tokenizing words**

First we remove punctuation from the `content` column and tokenize it into words.

In [None]:
def tokenize_text(text):
    """Strip ASCII punctuation from ``text`` and split it into word tokens.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Tokens produced by NLTK's word tokenizer on the punctuation-free text.
    """
    # Delete every character listed in string.punctuation before tokenizing.
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    text_no_punct = text.translate(punctuation_stripper)
    # Go through the nltk package explicitly: the bare name `word_tokenize` is
    # imported again from hazm in the Persian section, which would silently
    # swap in the Persian tokenizer here once that cell has been run.
    return nltk.word_tokenize(text_no_punct)

**step 2: normalizing tokens**

After tokenizing, we normalize the `tokenized_text` column by lowercasing every token.

In [None]:
def normalize_text(text):
    """Return a new list holding the lowercase form of every token in ``text``."""
    lowered = []
    for raw_token in text:
        lowered.append(raw_token.lower())
    return lowered

**step 3: stemming normalized tokens**

In [None]:
def stem_normalized_tokens(tokens):
    """Stem every token with a Porter stemmer and return the stems as a list."""
    porter = PorterStemmer()
    stems = []
    for word in tokens:
        stems.append(porter.stem(word))
    return stems

Connecting Google Drive to get the English dataset

In [None]:
# Mount Google Drive at /content/drive; the novels folder used below lives under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Using the shared dataset to extract 28 novels and make a dataframe out of them

In [None]:
# Folder that holds the novel files; every entry inside it is treated as one document.
file_path = '/content/drive/MyDrive/Colab Notebooks/IR/Novels/'
# NOTE(review): glob returns matches in arbitrary order, so the row number given
# to each novel may differ between runs — confirm whether a sorted list is wanted.
file_list = glob.glob(file_path + "*")

# Empty frame with the two columns that getting_input() fills, one row per file.
novels_df = pd.DataFrame(columns=["novel_name", "content"])
novels_df

Unnamed: 0,novel_name,content


Read the text files and extract the content from them

In [None]:
def read_document(file_name, encoding="utf-8"):
    """Read a text file and return its base name together with its content.

    Parameters
    ----------
    file_name : str
        Path of the file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to UTF-8 so the result
        no longer depends on the locale of the machine running the notebook.

    Returns
    -------
    tuple of (str, str)
        The file name without directory and extension, and the whole file
        content as one string.
    """
    base_name = os.path.basename(file_name)
    title, _ = os.path.splitext(base_name)
    with open(file_name, encoding=encoding) as file:
        content = file.read()
    return title, content

Function that builds the final tokens step by step and also stores the number of final tokens for each row

In [None]:
def process_all_novels(df):
    """Run the English preprocessing pipeline on ``df`` in place.

    Adds the columns ``tokenized_text``, ``normalized_text``, ``stemmed_tokens``
    and ``final_tokens`` (each derived from the previous one, starting from
    ``content``) plus ``num_tokens``, the length of ``final_tokens``.
    Nothing is returned.
    """
    def _keep_long_tokens(tokens):
        # Drop empty tokens and tokens of one or two characters.
        return [token for token in tokens if token and len(token) > 2]

    # (new column, source column, transformation), applied in this order.
    pipeline = (
        ('tokenized_text', 'content', tokenize_text),
        ('normalized_text', 'tokenized_text', normalize_text),
        ('stemmed_tokens', 'normalized_text', stem_normalized_tokens),
        ('final_tokens', 'stemmed_tokens', _keep_long_tokens),
        ('num_tokens', 'final_tokens', len),
    )
    for target_column, source_column, transform in pipeline:
        df[target_column] = df[source_column].apply(transform)

Function to put the content from text files into the dataframe

In [None]:
def getting_input(df):
  """Fill ``df`` in place with one row per path in the global ``file_list``.

  Row ``i`` receives the base name of the i-th file in ``novel_name`` and the
  file's text in ``content``.
  """
  for row_label, path in enumerate(file_list):
      title, text = read_document(path)
      df.loc[row_label, "novel_name"] = title
      df.loc[row_label, "content"] = text

Function to add queries to the end of our dataframe so any processing would be applied to them as well

In [None]:
def adding_query(content, df):
  """Store the query text in ``df`` as a row named "query".

  The row label is ``len(file_list)``, i.e. the first label after the rows
  written by getting_input(). ``df`` is modified in place.
  """
  query_row = len(file_list)
  df.loc[query_row, "novel_name"] = "query"
  df.loc[query_row, "content"] = content

Function that, when called, fills the novels dataframe, appends the query to it, builds the TF and IDF dataframes, and finally combines them into the TF-IDF dataframe, which it returns

In [None]:
def tf_idf_df_making(query,df):
  """Run the whole English pipeline and return the TF-IDF table.

  Parameters
  ----------
  query : str
      Free-text query; it is appended to ``df`` as one more document.
  df : pandas.DataFrame
      Frame with ``novel_name`` and ``content`` columns. It is filled and
      extended in place with the novels, the query and the token columns.

  Returns
  -------
  pandas.DataFrame
      The frame produced by tf_idf_making(): one row per document
      (query included), one column per term.
  """
  getting_input(df)
  adding_query(query, df)
  process_all_novels(df)
  inverted_index_novels = build_inverted_index(df, 'final_tokens')
  tf_df = tf_making(inverted_index_novels, df)
  idf_df = idf_making(inverted_index_novels, df)
  # Use the frame that was passed in, not the notebook-level `novels_df`.
  tf_idf_df = tf_idf_making(inverted_index_novels, df, tf_df, idf_df)
  return tf_idf_df

Function that takes the inverted index and document dataframe and create the TF dataframe

In [None]:
def tf_making(inverted_index_novels, novels_df):
  """Build the term-frequency table (terms x documents).

  Parameters
  ----------
  inverted_index_novels : dict
      Maps each term to a list of ``(frequency, document_id)`` postings, where
      ``document_id`` is an index label of ``novels_df``.
  novels_df : pandas.DataFrame
      Must provide ``novel_name`` (used as column labels) and ``num_tokens``
      (token count of each document).

  Returns
  -------
  pandas.DataFrame
      Float frame with one row per term and one column per document. Each cell
      is ``frequency / num_tokens`` of that document, or 0 when the term does
      not occur in it.
  """
  terms = list(inverted_index_novels.keys())
  # Column position of every document id, computed once instead of searching
  # the frame again for each (term, document) pair.
  column_of = {document_id: position for position, document_id in enumerate(novels_df.index)}
  token_counts = novels_df['num_tokens']

  tf_values = np.zeros((len(terms), len(novels_df)))
  for row, term in enumerate(terms):
      for frequency, document_id in inverted_index_novels[term]:
          column = column_of.get(document_id)
          if column is None:
              # Posting of a document that is not part of novels_df: ignore it.
              continue
          tf_values[row, column] = frequency / token_counts[document_id]

  return pd.DataFrame(tf_values, index=terms, columns=novels_df['novel_name'])

Function that takes the inverted index and document dataframe and create the IDF dataframe

In [None]:
def idf_making(inverted_index_novels, novels_df):
  """Build the inverse-document-frequency table, one row per term.

  The single column ``0`` holds ``log10(N / DF)``, where ``N`` is the number
  of rows of ``novels_df`` and ``DF`` is the number of postings of the term.
  No ``+ 1`` smoothing is applied.
  """
  n_documents = len(novels_df)
  idf_df = pd.DataFrame(index=inverted_index_novels.keys(), columns=[0])

  for term in inverted_index_novels:
      # Document frequency: one posting per document that contains the term.
      document_frequency = len(inverted_index_novels[term])
      idf_df.at[term, 0] = np.log10(n_documents / (document_frequency))
  return idf_df

Function that takes the inverted index, document dataframe, TF dataframe, IDF dataframe and create the TF-IDF dataframe

In [None]:
def tf_idf_making(inverted_index_novels, novels_df, tf_df, idf_df):
  """Combine the TF and IDF tables into a documents x terms TF-IDF table.

  Each cell is ``tf_df.at[term, novel_name] * idf_df.at[term, 0]``. Rows are
  labelled with ``novels_df['novel_name']``, columns with the terms of the
  inverted index.
  """
  novel_names = novels_df['novel_name']
  tf_idf_df = pd.DataFrame(index=novel_names, columns=inverted_index_novels.keys())

  for term in inverted_index_novels:
      for name in novel_names:
          # Note the transposition: TF is stored term-major, the result document-major.
          tf_idf_df.at[name, term] = tf_df.at[term, name] * idf_df.at[term, 0]
  return tf_idf_df

Function to get the Cosine similarity for the query and documents then returns the 10 most similar documents

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
def get_cos_sim(df, tf_idf_df, top_n=10):
  """Rank the documents by cosine similarity to the row labelled "query".

  Parameters
  ----------
  df : pandas.DataFrame
      Not used; kept so existing calls keep working.
  tf_idf_df : pandas.DataFrame
      Documents x terms TF-IDF table that contains a row labelled "query".
  top_n : int, optional
      Number of documents to return (default 10).

  Returns
  -------
  pandas.Series
      The ``top_n`` highest similarities, indexed by document name and named
      "query". The query itself is never part of the result.
  """
  cos_sim_matrix = cosine_similarity(tf_idf_df)
  cos_sim_df = pd.DataFrame(cos_sim_matrix, index=tf_idf_df.index, columns=tf_idf_df.index)
  query_cosine_similarity = cos_sim_df["query"]
  # Remove the query by label instead of assuming it is ranked first: with an
  # all-zero query vector its self-similarity is 0, and a document identical
  # to the query ties with it, so slicing off rank 1 could drop a real document.
  return query_cosine_similarity.drop("query").nlargest(top_n)

Function to calculate the jaccard score

In [None]:
# Calculate Jaccard similarity
def jaccard_similarity(doc1, doc2):
    """Weighted Jaccard similarity of two equal-length weight vectors.

    Computed as ``sum(min(a_i, b_i)) / sum(max(a_i, b_i))``.

    Parameters
    ----------
    doc1, doc2 : sequence of numbers
        Lists, NumPy arrays or pandas Series of the same length. Values are
        compared position by position; Series labels are ignored.

    Returns
    -------
    float
        The similarity, or 0 when the denominator is 0 (for example two
        all-zero vectors), instead of dividing by zero.
    """
    # Converting to arrays avoids integer-position lookups on label-indexed
    # Series and replaces the element-by-element Python loop.
    first = np.asarray(doc1, dtype=float)
    second = np.asarray(doc2, dtype=float)
    union = np.maximum(first, second).sum()
    if union == 0:
        return 0
    return float(np.minimum(first, second).sum() / union)

Function to get the Jaccard similarity for the query and documents then returns the 10 most similar documents

In [None]:
def get_jac_sim(df, tf_idf_df, top_n=10):
  """Rank the documents by Jaccard similarity to the row labelled "query".

  Only the similarities between the query and each document are computed;
  document-to-document pairs are never needed for the ranking.

  Parameters
  ----------
  df : pandas.DataFrame
      Not used; kept so existing calls keep working.
  tf_idf_df : pandas.DataFrame
      Documents x terms TF-IDF table that contains a row labelled "query".
  top_n : int, optional
      Number of documents to return (default 10).

  Returns
  -------
  pandas.Series
      The ``top_n`` highest similarities as floats, indexed by document name
      and named "query". The query itself is never part of the result.
  """
  query_vector = tf_idf_df.loc["query"]
  # Work with row positions so each document is read exactly once.
  document_positions = [
      position for position, label in enumerate(tf_idf_df.index) if label != "query"
  ]
  scores = [
      jaccard_similarity(tf_idf_df.iloc[position], query_vector)
      for position in document_positions
  ]
  # An explicit float dtype keeps nlargest() usable; an object-dtype Series is rejected by it.
  query_jaccard_similarity = pd.Series(
      scores, index=tf_idf_df.index[document_positions], name="query", dtype=float
  )
  return query_jaccard_similarity.nlargest(top_n)

Final cell for the English dataset: gets the query from the user and prints the 10 most similar documents based on cosine and Jaccard similarity

In [None]:
# Read a free-text query from the user.
user_query = input("Enter the query: ")
# Load the novels, append the query as one more row and build the TF-IDF table.
# novels_df is filled and extended in place by this call.
tf_idf_df = tf_idf_df_making(user_query,novels_df)
# Rank the novels against the query with cosine similarity.
top_cos_similarities = get_cos_sim(novels_df, tf_idf_df)
print("Top documents based on Cosine similarities are:")
print(top_cos_similarities)
# Rank the novels against the query with Jaccard similarity.
top_jac_similarities = get_jac_sim(novels_df, tf_idf_df)
print("Top documents based on Jaccard similarities are:")
print(top_jac_similarities)

Enter the query: Hello there im farid
Top documents based on Cosine similarities are:
novel_name
TheSunAlsoRises                 0.022679
AlicesAdventuresInWonderland    0.000000
TheSportOfTheGods               0.000000
HardTimes                       0.000000
TheAgeOfInnocence               0.000000
TheManWhoWasThursday            0.000000
AnneOfGreenGables               0.000000
APassageToIndia                 0.000000
Emma                            0.000000
AHandfulOfDust                  0.000000
Name: query, dtype: float64
Top documents based on Jaccard similarities are:
novel_name
TheSunAlsoRises                 0.000615
AlicesAdventuresInWonderland    0.000000
TheSportOfTheGods               0.000000
HardTimes                       0.000000
TheAgeOfInnocence               0.000000
TheManWhoWasThursday            0.000000
AnneOfGreenGables               0.000000
APassageToIndia                 0.000000
Emma                            0.000000
AHandfulOfDust                  0.00

# Persian Dataset

**import essential libraries**

In [57]:
from hazm import word_tokenize
from hazm import Normalizer
from hazm import Stemmer
from hazm import stopwords_list
import re

Download Dataset

In [58]:
# -nc (no-clobber): skip the download when final_books.xlsx already exists, so
# re-running this cell does not pile up final_books.xlsx.1, .2, ... copies next
# to the file that read_excel opens.
!wget -nc https://github.com/mohamad-dehghani/persian-pdf-books-dataset/raw/master/final_books.xlsx

--2023-12-01 18:19:36--  https://github.com/mohamad-dehghani/persian-pdf-books-dataset/raw/master/final_books.xlsx
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/mohamad-dehghani/persian-pdf-books-dataset/master/final_books.xlsx [following]
--2023-12-01 18:19:36--  https://raw.githubusercontent.com/mohamad-dehghani/persian-pdf-books-dataset/master/final_books.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1380625 (1.3M) [application/octet-stream]
Saving to: ‘final_books.xlsx.3’


2023-12-01 18:19:37 (36.7 MB/s) - ‘final_books.xlsx.3’ saved [1380625/1380625]



Keep only the first 800 records of the dataset

In [59]:
# Load the downloaded workbook into a DataFrame.
farsi_df = pd.read_excel('/content/final_books.xlsx')
# Keep only the first 800 rows (all columns).
farsi_df = farsi_df.iloc[:800, :]

Function to add queries to the end of our dataframe so any processing would be applied to them as well

In [60]:
def adding_query(content, df):
  """Append the query to ``df`` as a new row titled "q".

  The row label is the current number of rows. ``content`` receives the query
  text; ``date``, ``category``, ``author`` and ``comments`` are set to None.
  ``df`` is modified in place.
  """
  query_row = len(df)
  df.loc[query_row, "title"] = "q"
  df.loc[query_row, "content"] = content
  for empty_column in ("date", "category", "author", "comments"):
    df.loc[query_row, empty_column] = None

Function that removes punctuation

In [61]:
def remove_punctuation(text):
    """Delete punctuation from ``text``.

    Every character that is neither a word character nor whitespace is
    removed, which includes the Persian comma and semicolon.

    Parameters
    ----------
    text : object
        Usually a string. Missing values (None / NaN) are returned unchanged.
        Any other non-string value, such as a numeric spreadsheet cell, is
        converted with ``str()`` first, because ``re.sub`` only accepts strings.

    Returns
    -------
    str or the original missing value
    """
    if pd.notna(text):
        return re.sub(r'[^\w\s]|[.،؛]', '', str(text))
    else:
        return text

Function that builds the final tokens step by step and also stores the number of final tokens for each row

In [62]:
def process_all_books(df):
  """Run the Persian preprocessing pipeline on ``df`` in place.

  Steps, in order:
    1. strip punctuation from ``title``, ``content``, ``category``, ``author``;
    2. join those four fields (missing values become '') into ``all_info``;
    3. tokenize ``all_info`` into ``tokens``;
    4. normalize every token into ``normalized_tokens``;
    5. stem every token into ``stemmed_tokens``;
    6. keep non-empty tokens longer than two characters in ``final_tokens``;
    7. store the length of ``final_tokens`` in ``num_tokens``.

  Nothing is returned.
  """
  for column in ('title', 'content', 'category', 'author'):
    df[column] = df[column].apply(remove_punctuation)

  df['all_info'] = (
      df['title'].fillna('') +
      ' ' +
      df['content'].fillna('') +
      ' ' +
      df['category'].fillna('') +
      ' ' +
      df['author'].fillna('')
  )

  # Reach the hazm tools through the package: the bare name `word_tokenize` is
  # also imported from nltk in the English section, so whichever import cell
  # ran last would otherwise decide which tokenizer is used here.
  df['tokens'] = df['all_info'].apply(hazm.word_tokenize)

  normalizer = hazm.Normalizer()
  df['normalized_tokens'] = df['tokens'].apply(
      lambda tokens: [normalizer.normalize(token) for token in tokens]
  )

  stemmer = hazm.Stemmer()
  df['stemmed_tokens'] = df['normalized_tokens'].apply(
      lambda tokens: [stemmer.stem(token) for token in tokens]
  )

  df['final_tokens'] = df['stemmed_tokens'].apply(
      lambda tokens: [token for token in tokens if token and len(token) > 2]
  )

  df['num_tokens'] = df['final_tokens'].apply(len)

Function that takes the inverted index and document dataframe and create the TF dataframe

In [63]:
def tf_making(inverted_index_novels, farsi_df):
    """Build the term-frequency table (terms x documents) for the Persian books.

    Each posting ``(frequency, document_id)`` fills the cell in the term's row
    and column ``document_id`` with ``frequency / num_tokens`` of that
    document; all other cells stay 0. The document id is used directly as the
    column position. Columns are labelled with ``farsi_df['title']``.
    """
    token_counts = farsi_df['num_tokens']
    tf_values = np.zeros((len(inverted_index_novels), len(farsi_df)))

    row = 0
    for postings in inverted_index_novels.values():
        for frequency, document_id in postings:
            tf_values[row, document_id] = frequency / token_counts[document_id]
        row += 1

    return pd.DataFrame(
        tf_values,
        index=inverted_index_novels.keys(),
        columns=farsi_df['title'],
    )

Function that takes the inverted index and document dataframe and create the IDF dataframe

In [64]:
# IDF Making
def idf_making(inverted_index_novels, farsi_df):
    """Build the inverse-document-frequency table, one row per term.

    The single column ``0`` holds ``log10(N / (1 + DF))``, where ``N`` is the
    number of rows of ``farsi_df`` and ``DF`` the number of postings of the term.
    """
    n_documents = len(farsi_df)
    document_frequencies = np.array(
        [len(term_postings) for term_postings in inverted_index_novels.values()]
    )
    smoothed = document_frequencies + 1
    idf_values = np.log10(n_documents / smoothed)

    return pd.DataFrame(idf_values, index=inverted_index_novels.keys(), columns=[0])

Function that takes the inverted index, document dataframe, TF dataframe, IDF dataframe and create the TF-IDF dataframe

In [65]:
# TF-IDF Making
def tf_idf_making(inverted_index_novels, farsi_df, tf_df, idf_df):
    """Turn the TF table into a documents x terms TF-IDF table.

    The TF values are transposed to documents x terms and weighted by a
    TfidfTransformer (``norm=None``, ``smooth_idf=False``) fitted on that same
    matrix. ``idf_df`` is accepted but never read by this function.

    Rows are labelled with ``farsi_df['title']``, columns with the terms of
    the inverted index.
    """
    # terms x documents -> documents x terms, stored sparsely
    documents_by_terms = csr_matrix(tf_df.values).transpose()

    weighting = TfidfTransformer(norm=None, smooth_idf=False)
    weighted = weighting.fit_transform(documents_by_terms)

    # Back to a dense, labelled frame.
    return pd.DataFrame(
        weighted.toarray(),
        index=farsi_df['title'],
        columns=inverted_index_novels.keys(),
    )

Function to get the Cosine similarity for the query and documents then returns the 10 most similar documents

In [66]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
def get_cos_sim(df, tf_idf_df, top_n=10):
  """Rank the books by cosine similarity to the row labelled "q".

  Parameters
  ----------
  df : pandas.DataFrame
      Not used; kept so existing calls keep working.
  tf_idf_df : pandas.DataFrame
      Documents x terms TF-IDF table that contains a row labelled "q".
  top_n : int, optional
      Number of books to return (default 10).

  Returns
  -------
  pandas.Series
      The ``top_n`` highest similarities, indexed by title and named "q".
      The query itself is never part of the result.
  """
  cos_sim_matrix = cosine_similarity(tf_idf_df)
  cos_sim_df = pd.DataFrame(cos_sim_matrix, index=tf_idf_df.index, columns=tf_idf_df.index)
  query_cosine_similarity = cos_sim_df["q"]
  # Remove the query by label instead of assuming it is ranked first: with an
  # all-zero query vector its self-similarity is 0, and a book identical to
  # the query ties with it, so slicing off rank 1 could drop a real book.
  return query_cosine_similarity.drop("q").nlargest(top_n)

Function to calculate the jaccard score

In [67]:
# Calculate Jaccard similarity
def jaccard_similarity(doc1, doc2):
    """Weighted Jaccard similarity of two equal-length weight vectors.

    Computed as ``sum(min(a_i, b_i)) / sum(max(a_i, b_i))``.

    Parameters
    ----------
    doc1, doc2 : sequence of numbers
        Lists, NumPy arrays or pandas Series of the same length. Values are
        compared position by position; Series labels are ignored.

    Returns
    -------
    float
        The similarity, or 0 when the denominator is 0.
    """
    # Converting to arrays avoids integer-position lookups on label-indexed
    # Series and replaces the element-by-element Python loop.
    first = np.asarray(doc1, dtype=float)
    second = np.asarray(doc2, dtype=float)
    union = np.maximum(first, second).sum()
    if union == 0:
        return 0
    return float(np.minimum(first, second).sum() / union)

Function to get the Jaccard similarity for the query and documents then returns the 10 most similar documents

In [68]:
def get_jac_sim(tf_idf_df, top_n=10):
  """Rank the books by Jaccard similarity to the row labelled "q".

  Only the similarities between the query and each book are computed;
  book-to-book pairs are never needed for the ranking.

  Parameters
  ----------
  tf_idf_df : pandas.DataFrame
      Documents x terms TF-IDF table that contains a row labelled "q".
  top_n : int, optional
      Number of books to return (default 10).

  Returns
  -------
  pandas.Series
      The ``top_n`` highest similarities as floats, indexed by title and
      named "q". The query itself is never part of the result.
  """
  query_vector = tf_idf_df.loc["q"]
  # Work with row positions so each book is read exactly once, even when two
  # books share a title.
  document_positions = [
      position for position, label in enumerate(tf_idf_df.index) if label != "q"
  ]
  scores = [
      jaccard_similarity(tf_idf_df.iloc[position], query_vector)
      for position in document_positions
  ]
  # An explicit float dtype keeps nlargest() usable; an object-dtype Series is rejected by it.
  query_jaccard_similarity = pd.Series(
      scores, index=tf_idf_df.index[document_positions], name="q", dtype=float
  )
  return query_jaccard_similarity.nlargest(top_n)

Final cell for the Persian dataset: gets the query from the user and prints the 10 most similar documents based on cosine and Jaccard similarity

In [None]:
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np

# Read a free-text query from the user and append it to the books as row "q".
# NOTE(review): adding_query() and process_all_books() modify farsi_df in place,
# and farsi_df is rebound below without its 'content' column. Running this cell
# twice without reloading the dataset therefore starts from already-modified
# data — confirm whether a fresh copy should be used per query.
user_query = input("Enter the query: ")
adding_query(user_query, farsi_df)
process_all_books(farsi_df)

# Drop the raw and intermediate columns; the remaining columns include
# 'title', 'final_tokens' and 'num_tokens', which the steps below read.
columns_to_remove = ['content', 'date', 'comments', 'category', 'author', 'all_info', 'tokens', 'normalized_tokens', 'stemmed_tokens']
farsi_df = farsi_df.drop(columns=columns_to_remove, axis=1)

# Build the inverted index over the final tokens of every row (query included).
inverted_index_novels = build_inverted_index(farsi_df, 'final_tokens')

# Build the TF, IDF and TF-IDF tables.
tf_df = tf_making(inverted_index_novels, farsi_df)
idf_df = idf_making(inverted_index_novels, farsi_df)
tf_idf_df = tf_idf_making(inverted_index_novels, farsi_df, tf_df, idf_df)

# Rank the books against the query with both similarity measures.
top_cos_similarities = get_cos_sim(farsi_df, tf_idf_df)
print("Top documents based on Cosine similarities are:")
print(top_cos_similarities)
top_jac_similarities = get_jac_sim(tf_idf_df)
print("Top documents based on Jaccard similarities are:")
print(top_jac_similarities)

Top documents based on Cosine similarities are:
title
 یک دو سه بینهایت                                                   0.297382
 از فیزیک تا عمل برای نوجوانان                                      0.201805
 تاریخ زندگی اقتصادی روستاییان و طبقات اجتماعی ایران                0.191052
 خلاقیت                                                             0.169625
 مدیریت آداب و معاشرت برخورد با مشتری                               0.156927
 ضدگلوله  راه کارهای کاملا عملی افزایش سریع درآمد در بازار ایران    0.156857
 کارنامه اسلام                                                      0.143761
 سلسله های اسلامی                                                   0.139323
 آسمان پرستاره                                                      0.138884
 اسرار نوش دارو                                                     0.137131
Name: q, dtype: float64
