In [1]:
!pip install pymupdf

Collecting pymupdf
  Downloading PyMuPDF-1.24.5-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.3 pymupdf-1.24.5


In [2]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text


In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
from gensim.models import Word2Vec

def train_word2vec_model(tokens):
    model = Word2Vec([tokens], vector_size=100, window=5, min_count=1, workers=4)
    return model


In [5]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def get_sentence_embedding(sentence, model):
    words = sentence.split()
    embeddings = [model.wv[word] for word in words if word in model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

def answer_query(query, text, model):
    sentences = text.split('.')
    query_embedding = get_sentence_embedding(query, model)

    best_score = -1
    best_sentence = ""

    for sentence in sentences:
        sentence_embedding = get_sentence_embedding(sentence, model)
        score = cosine_similarity([query_embedding], [sentence_embedding])[0][0]
        if score > best_score:
            best_score = score
            best_sentence = sentence

    return best_sentence


In [21]:
def main(pdf_path, query):
    text = extract_text_from_pdf(pdf_path)
    tokens = preprocess_text(text)
    model = train_word2vec_model(tokens)
    answer = answer_query(query, text, model)
    return answer

# Example usage

#pdf_path = 'enter pdf file path'
#query = "give your question"
#answer is printed

pdf_path = '/content/Untitled document.pdf'
query = "What is the primary goal of the document?"
answer = main(pdf_path, query)
print("Answer:", answer)

query = "what is architecture used"
answer = main(pdf_path, query)
print("Answer:", answer)

Answer:  The primary goal is to train the 
model on a dataset of noisy and clean images, evaluate its 
performance using metrics such as Mean Squared Error (MSE), Peak 
Signal-to-Noise Ratio (PSNR), , and produce denoised images for a 
test set
Answer: 
Architecture Used
The architecture used in this project is a simple yet effective CNN 
designed specifically for denoising tasks
