<a href="https://colab.research.google.com/github/eklahari/IR-Information-Retrieval/blob/main/term_document_incidence_matrix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Term-Document Incidence Matrix**

import necessary libraries do **tokenising** and **liguistic preprocessing**

In [42]:
import nltk
import string
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

def preprocess_documents(documents):
    refined_tokens = []

    for doc in documents:
        # Step 1: Lowercasing
        doc = doc.lower()

        # Step 2: Tokenization
        tokens = word_tokenize(doc)

        # Step 3: Removing Punctuation
        tokens = [word for word in tokens if word not in string.punctuation]

        # Step 4: Stopword Removal
        stop_words = set(stopwords.words('english'))
        filtered_tokens = [word for word in tokens if word not in stop_words]

        # Step 5: Stemming (using Porter Stemmer)
        stemmer = PorterStemmer()
        stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

        # Step 6: Lemmatization (using WordNet Lemmatizer)
        lemmatizer = WordNetLemmatizer()
        lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

        # Extend the refined_tokens list with the current document's tokens
        refined_tokens.extend(lemmatized_tokens)

    return refined_tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**construct** term-document incidence matrix

In [43]:
def create_term_document_matrix(documents):
    # Preprocess documents
    refined_tokens = preprocess_documents(documents)

    # Create a list of unique terms (words)
    unique_terms = list(set(refined_tokens))

    # Create an empty term-document incidence matrix
    term_document_matrix = []

    # Initialize a row for each term and a column for each document
    for term in unique_terms:
        row = []
        for doc in documents:
            # Count the frequency of the term in the document
            term_count = doc.split().count(term)
            row.append(term_count)
        term_document_matrix.append(row)

    return unique_terms,term_document_matrix
# Example usage:
documents = [
    "The quick brown fox jumps over the lazy dog's tail.",
    "I am learning about text preprocessing.",
    "Natural language processing is fascinating!"
]
unique_terms,term_document_matrix=create_term_document_matrix(documents)
# Print the term-document incidence matrix neatly
print("Term-Document Incidence Matrix:\n")
# Print header row with document names
print("\t", end="")
for i, doc in enumerate(documents):
    print(f"Doc {i+1}\t", end="")
print()

# Print term rows with counts
for i, term_row in enumerate(term_document_matrix):
    print(unique_terms[i], end="\t")
    for count in term_row:
        print(count, end="\t")
    print()

Term-Document Incidence Matrix:

	Doc 1	Doc 2	Doc 3	
's	0	0	0	
text	0	1	0	
preprocessing	0	0	0	
natural	0	0	0	
dog	0	0	0	
learning	0	1	0	
language	0	0	1	
quick	1	0	0	
lazy	1	0	0	
processing	0	0	1	
tail	0	0	0	
fox	1	0	0	
fascinating	0	0	0	
brown	1	0	0	
jump	0	0	0	


process **boolean queries**

In [44]:
def custom_not(operand):
    return [0 if x == 1 else 1 for x in operand]

def apply_operator(operator, operand1, operand2):
    if operator == 'and':
        return [a & b for a, b in zip(operand1, operand2)]
    elif operator == 'or':
        return [a | b for a, b in zip(operand1, operand2)]

def process_boolean_query(query, term_document_matrix, unique_terms):
    query = query.lower()
    query = query.replace("(", " ( ").replace(")", " ) ").split()

    # Define operator precedence
    precedence = {'not': 3, 'and': 2, 'or': 1}

    # Initialize stacks for operators and operands
    operators = []
    operands = []

    for token in query:
        if token in unique_terms:
            term_index = unique_terms.index(token)
            operands.append(term_document_matrix[term_index])
        elif token == '(':
            operators.append(token)
        elif token == ')':
            while operators and operators[-1] != '(':
                operator = operators.pop()
                if operator == 'not':
                    operand = operands.pop()
                    result = custom_not(operand)
                else:
                    operand2 = operands.pop()
                    operand1 = operands.pop()
                    result = apply_operator(operator, operand1, operand2)

                operands.append(result)

            operators.pop()  # Remove the '('
        elif token in precedence:
            while (operators and operators[-1] in precedence and
                    precedence[operators[-1]] >= precedence[token]):
                operator = operators.pop()
                if operator == 'not':
                    operand = operands.pop()
                    result = custom_not(operand)
                else:
                    operand2 = operands.pop()
                    operand1 = operands.pop()
                    result = apply_operator(operator, operand1, operand2)

                operands.append(result)

            operators.append(token)

    # Process remaining operators and operands
    while operators:
        operator = operators.pop()
        if operator == '(' or operator == ')':
            raise ValueError("Invalid query")
        if operator == 'not':
            operand = operands.pop()
            result = custom_not(operand)
        else:
            operand2 = operands.pop()
            operand1 = operands.pop()
            result = apply_operator(operator, operand1, operand2)

        operands.append(result)

    return operands[0]



# Complex boolean query
#sample query :(quick and brown) or not (fox and not tail)
query=input("enter the query:")
result = process_boolean_query(query, term_document_matrix, unique_terms)

# Print the result of the query
print("\nQuery:", query)
print("Result:", result)


enter the query:(quick and lazy) or not(fox)

Query: (quick and lazy) or not(fox)
Result: [1, 1, 1]


In [None]:
from google.colab import drive
drive.mount('/content/drive')