In [6]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import os

# Specify the path to your folder in Google Drive containing the PDF files
folder_path = "/content/drive/MyDrive/Data"

# Change the current working directory to the specified folder
os.chdir(folder_path)

# List all files in the current directory (optional)
files = os.listdir()
print("Files in the folder:", files)


Files in the folder: ['paper.pdf', 'bert_embeddings.npy']


In [8]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text()
    return text

# Example usage to load a PDF file from Google Drive
pdf_file = "paper.pdf"  # Replace with your actual PDF file name
pdf_path = os.path.join(folder_path, pdf_file)
pdf_text = extract_text_from_pdf(pdf_path)



In [10]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocesstext(cv_text):
    # Tokenization
    tokens = word_tokenize(cv_text)

    # Lowercasing
    tokens = [token.lower() for token in tokens]

    # Join tokens back into a single string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text



preprocessed_text = preprocesstext(pdf_text)
print("Preprocessed Text:")
print(preprocessed_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessed Text:
attention is all you need ashish vaswani∗ google brain avaswani @ google.com noam shazeer∗ google brain noam @ google.com niki parmar∗ google research nikip @ google.com jakob uszkoreit∗ google research usz @ google.com llion jones∗ google research llion @ google.com aidan n. gomez∗† university of toronto aidan @ cs.toronto.edu łukasz kaiser∗ google brain lukaszkaiser @ google.com illia polosukhin∗‡ illia.polosukhin @ gmail.com abstract the dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder . the best performing models also connect the encoder and decoder through an attention mechanism . we propose a new simple network architecture , the transformer , based solely on attention mechanisms , dispensing with recurrence and convolutions entirely . experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring sig

In [42]:
from transformers import BertModel, BertTokenizer
import torch
import numpy as np

# Load BERT model and tokenizer
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def generate_embeddings(document_text, max_chunk_length=512, overlap_length=50):
    """
    """
    # Initialize list to store embeddings
    embeddings = []
    num_chunks = 0

    # Process document using overlapping chunks
    start_idx = 0
    while start_idx < len(document_text):
        # Determine the end of the current chunk
        end_idx = min(start_idx + max_chunk_length, len(document_text))

        # Extract the current chunk
        chunk = document_text[start_idx:end_idx]

        # Tokenize chunk
        tokenized_chunk = tokenizer.encode(chunk, add_special_tokens=True, return_tensors='pt', max_length=max_chunk_length, truncation=True)

        # Generate embeddings
        with torch.no_grad():
            outputs = model(tokenized_chunk)
            last_hidden_states = outputs.last_hidden_state

        # Store embeddings (example: mean pooling)
        chunk_embedding = torch.mean(last_hidden_states, dim=1).squeeze().numpy()
        embeddings.append(chunk_embedding)

        # Move start_idx forward with overlap
        start_idx += max_chunk_length - overlap_length
        num_chunks += 1

    return embeddings, num_chunks

# Example usage
document_text = preprocessed_text
embeddings, num_chunks = generate_embeddings(document_text)
print(f"Number of chunks: {num_chunks}")
print(embeddings)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
       -7.08479807e-02, -1.39166519e-01,  2.52935380e-01, -4.19135481e-01,
       -9.22159180e-02, -1.92216337e-01,  5.60169332e-02, -2.84891929e-02,
        2.34196663e-01, -1.17715113e-01, -1.94041580e-01,  2.33639441e-02,
        6.26116842e-02,  1.80611074e-01,  4.48516995e-01,  5.24293967e-02,
       -2.21456811e-01,  1.80524975e-01, -5.38786292e-01, -1.59063600e-02,
        6.73554599e-01,  4.07294370e-02, -1.27935708e-01, -7.72086903e-02,
        1.33032471e-01,  2.03399375e-01,  3.24244410e-01,  3.23874205e-01,
       -6.54532760e-02,  3.17710042e-02,  3.12441200e-01, -3.72362882e-01,
       -2.59883761e-01,  2.14526094e-02,  1.58134941e-02, -3.18344414e-01,
        5.10759175e-01,  3.89258832e-01, -2.51468509e-01, -3.47014904e-01,
        7.83918053e-02, -8.75483900e-02,  1.08188137e-01,  3.54037359e-02,
        1.29376411e-01,  9.77662131e-02,  6.99836075e-01, -8.90318155e-02,
       -2.04317510e-01,  5.42522907

In [43]:
import faiss
import numpy as np

In [45]:
def store_embeddings_in_faiss(embeddings):
    embeddings = np.array(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index

In [46]:
index= store_embeddings_in_faiss(embeddings)
faiss.write_index(index, '/content/drive/MyDrive/Data/indices.faiss')

In [52]:
from transformers import BertModel, BertTokenizer
import torch
import numpy as np
import faiss

# Load BERT model and tokenizer
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def generate_embeddings(document_text, max_chunk_length=512, overlap_length=50):
    # Initialize list to store embeddings
    embeddings = []
    num_chunks = 0

    # Process document using overlapping chunks
    start_idx = 0
    while start_idx < len(document_text):
        # Determine the end of the current chunk
        end_idx = min(start_idx + max_chunk_length, len(document_text))

        # Extract the current chunk
        chunk = document_text[start_idx:end_idx]

        # Tokenize chunk
        tokenized_chunk = tokenizer.encode(chunk, add_special_tokens=True, return_tensors='pt', max_length=max_chunk_length, truncation=True)

        # Generate embeddings
        with torch.no_grad():
            outputs = model(tokenized_chunk)
            last_hidden_states = outputs.last_hidden_state

        # Store embeddings (example: mean pooling)
        chunk_embedding = torch.mean(last_hidden_states, dim=1).squeeze().numpy()
        embeddings.append(chunk_embedding)

        # Move start_idx forward with overlap
        start_idx += max_chunk_length - overlap_length
        num_chunks += 1

    return embeddings, num_chunks

# Example usage
document_text = preprocessed_text
embeddings, num_chunks = generate_embeddings(document_text)
print(f"Number of chunks: {num_chunks}")
print(embeddings)

def store_embeddings_in_faiss(embeddings):
    embeddings = np.array(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index

# Store embeddings in FAISS index
index = store_embeddings_in_faiss(embeddings)
faiss.write_index(index, '/content/drive/MyDrive/Data/indices.faiss')




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
       -7.08479807e-02, -1.39166519e-01,  2.52935380e-01, -4.19135481e-01,
       -9.22159180e-02, -1.92216337e-01,  5.60169332e-02, -2.84891929e-02,
        2.34196663e-01, -1.17715113e-01, -1.94041580e-01,  2.33639441e-02,
        6.26116842e-02,  1.80611074e-01,  4.48516995e-01,  5.24293967e-02,
       -2.21456811e-01,  1.80524975e-01, -5.38786292e-01, -1.59063600e-02,
        6.73554599e-01,  4.07294370e-02, -1.27935708e-01, -7.72086903e-02,
        1.33032471e-01,  2.03399375e-01,  3.24244410e-01,  3.23874205e-01,
       -6.54532760e-02,  3.17710042e-02,  3.12441200e-01, -3.72362882e-01,
       -2.59883761e-01,  2.14526094e-02,  1.58134941e-02, -3.18344414e-01,
        5.10759175e-01,  3.89258832e-01, -2.51468509e-01, -3.47014904e-01,
        7.83918053e-02, -8.75483900e-02,  1.08188137e-01,  3.54037359e-02,
        1.29376411e-01,  9.77662131e-02,  6.99836075e-01, -8.90318155e-02,
       -2.04317510e-01,  5.42522907

In [58]:
from transformers import BertModel, BertTokenizer
import torch
import numpy as np
import faiss

# Load BERT model and tokenizer
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def generate_embeddings(document_text, max_chunk_length=512, overlap_length=50):
    # Initialize list to store embeddings and chunks
    embeddings = []
    chunks = []
    num_chunks = 0

    # Process document using overlapping chunks
    start_idx = 0
    while start_idx < len(document_text):
        # Determine the end of the current chunk
        end_idx = min(start_idx + max_chunk_length, len(document_text))

        # Extract the current chunk
        chunk = document_text[start_idx:end_idx]
        chunks.append(chunk)  # Store chunk

        # Tokenize chunk
        tokenized_chunk = tokenizer.encode(chunk, add_special_tokens=True, return_tensors='pt', max_length=max_chunk_length, truncation=True)

        # Generate embeddings
        with torch.no_grad():
            outputs = model(tokenized_chunk)
            last_hidden_states = outputs.last_hidden_state

        # Store embeddings (example: mean pooling)
        chunk_embedding = torch.mean(last_hidden_states, dim=1).squeeze().numpy()
        embeddings.append(chunk_embedding)

        # Move start_idx forward with overlap
        start_idx += max_chunk_length - overlap_length
        num_chunks += 1

    return embeddings, num_chunks, chunks

# Example usage
document_text = preprocessed_text
embeddings, num_chunks, chunks = generate_embeddings(document_text)
print(f"Number of chunks: {num_chunks}")

def store_embeddings_in_faiss(embeddings):
    embeddings = np.array(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index

# Store embeddings in FAISS index
index = store_embeddings_in_faiss(embeddings)
faiss.write_index(index, '/content/drive/MyDrive/Data/indices.faiss')

def query_document(query_text, model, tokenizer, index, chunks):
    # Generate embedding for the query text
    query_embedding, _, _ = generate_embeddings(query_text)
    query_embedding = np.array([query_embedding[0]], dtype=np.float32)

    # Search the FAISS index for similar embeddings
    D, I = index.search(query_embedding, k=5)  # Retrieve top 5 similar chunks

    # Retrieve and print the text of the top K similar chunks
    similar_chunks = [chunks[i] for i in I[0]]
    return similar_chunks




Number of chunks: 74
Top K similar chunks:
Chunk 1: lexity of a separable convolution is equal to the combination of a self-attention layer and a point-wise feed-forward layer , the approach we take in our model . as side beneﬁt , self-attention could yield more interpretable models . we inspect attention distributions from our models and present and discuss examples in the appendix . not only do individual attention heads clearly learn to perform different tasks , many appear to exhibit behavior related to the syntactic and semantic structure of the sentenc
Chunk 2: ble 3 . in table 3 rows ( a ) , we vary the number of attention heads and the attention key and value dimensions , keeping the amount of computation constant , as described in section 3.2.2 . while single-head attention is 0.9 bleu worse than the best setting , quality also drops off with too many heads . 5we used values of 2.8 , 3.7 , 6.0 and 9.5 tflops for k80 , k40 , m40 and p100 , respectively . 8 table 3 : variations 

In [59]:
# Example query
query_text = "What is multi head attention?"
top_k_similar_chunks = query_document(query_text, model, tokenizer, index, chunks)
print("Top K similar chunks:")
for idx, chunk in enumerate(top_k_similar_chunks, start=1):
    print(f"Chunk {idx}: {chunk}")

Top K similar chunks:
Chunk 1: lexity of a separable convolution is equal to the combination of a self-attention layer and a point-wise feed-forward layer , the approach we take in our model . as side beneﬁt , self-attention could yield more interpretable models . we inspect attention distributions from our models and present and discuss examples in the appendix . not only do individual attention heads clearly learn to perform different tasks , many appear to exhibit behavior related to the syntactic and semantic structure of the sentenc
Chunk 2: ble 3 . in table 3 rows ( a ) , we vary the number of attention heads and the attention key and value dimensions , keeping the amount of computation constant , as described in section 3.2.2 . while single-head attention is 0.9 bleu worse than the best setting , quality also drops off with too many heads . 5we used values of 2.8 , 3.7 , 6.0 and 9.5 tflops for k80 , k40 , m40 and p100 , respectively . 8 table 3 : variations on the transformer ar

In [64]:

# Example query
query_text = "What is multi head attention"
top_k_similar_sentences = query_document(query_text, model, tokenizer, index, sentence_list)
print("Top K similar sentences:")
for idx, sentence in enumerate(top_k_similar_sentences, start=1):
    print(f"Sentence {idx}: {sentence}")

Top K similar sentences:
Sentence 1: listing order is random .
Sentence 2: can active memory replace attention ?
Sentence 3: in this work we employ h = 8 parallel attention layers , or heads .
Sentence 4: structured attention networks .
Sentence 5: rethinking the inception architecture for computer vision .


In [113]:
from sentence_transformers import SentenceTransformer, util
import faiss
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize

# Load Sentence Transformers model (BERT-based)
model = SentenceTransformer('bert-base-nli-mean-tokens')

def generate_sentence_embeddings(document_text):
    """
    Generates embeddings for each sentence in the document text using Sentence Transformers.
    Returns a list of embeddings and a corresponding list of sentences.
    """
    # Split document into sentences
    sentences = sent_tokenize(document_text)

    # Generate embeddings for each sentence
    embeddings = model.encode(sentences)

    return embeddings, sentences

# Example usage
document_text = pdf_text
embeddings, sentence_list = generate_sentence_embeddings(document_text)
print("Number of sentences:", len(sentence_list))

def store_embeddings_in_faiss(embeddings):
    """
    Stores the embeddings in a FAISS index.
    """
    embeddings = np.array(embeddings, dtype=np.float32)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    return index

# Store embeddings in FAISS index
index = store_embeddings_in_faiss(embeddings)
faiss.write_index(index, '/content/drive/MyDrive/Data/indices.faiss')

def query_document(query_text, model, index, sentence_list):
    """
    Queries the FAISS index with the query text and returns the top K similar sentences.
    """
    # Generate embedding for the query text
    query_embedding = model.encode(query_text)
    query_embedding = np.array([query_embedding], dtype=np.float32)

    # Search the FAISS index for similar embeddings
    D, I = index.search(query_embedding, k=5)  # Retrieve top 5 similar sentences

    # Retrieve and print the text of the top K similar sentences
    similar_sentences = [sentence_list[i] for i in I[0]]
    return similar_sentences

# Example query





Number of sentences: 285


In [115]:
query_text = "What is multi head attention?"
top_k_similar_sentences = query_document(query_text, model, index, sentence_list)
print("Top K similar sentences:")
for idx, sentence in enumerate(top_k_similar_sentences, start=1):
    print(f"Sentence {idx}: {sentence}")

Top K similar sentences:
Sentence 1: (right) Multi-Head Attention consists of several
attention layers running in parallel.
Sentence 2: Multi-head attention allows the model to jointly attend to information from different representation
subspaces at different positions.
Sentence 3: Structured attention networks.
Sentence 4: A deep reinforced model for abstractive
summarization.
Sentence 5: Can active memory replace attention?


In [110]:
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocesstext(document_text):
    # Split document into sentences
    sentences = sent_tokenize(document_text)

    # Preprocess each sentence individually
    preprocessed_sentences = []
    for sentence in sentences:
        # Tokenization
        tokens = word_tokenize(sentence)

        # Lowercasing
        tokens = [token.lower() for token in tokens]

        # Remove special characters and digits
        tokens = [re.sub(r'\W|\d', '', token) for token in tokens]

        # Remove stop words
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]

        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

        # Join tokens back into a single string
        preprocessed_sentence = ' '.join(tokens)

        # Add preprocessed sentence to list
        preprocessed_sentences.append(preprocessed_sentence)

    return preprocessed_sentences


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [111]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize

# Load Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_sentence_embeddings(document_text):
    """
    Generates embeddings for each sentence in the document text using Sentence-BERT.
    Returns a list of embeddings and a corresponding list of sentences.
    """
    # Split document into sentences
    sentences = sent_tokenize(document_text)

    # Generate embeddings for each sentence
    embeddings = model.encode(sentences)

    return embeddings, sentences

# Example usage
document_text = preprocessed_text
embeddings, sentence_list = generate_sentence_embeddings(document_text)
print("Number of sentences:", len(sentence_list))

def store_embeddings_in_faiss(embeddings):
    """
    Stores the embeddings in a FAISS index.
    """
    embeddings = np.array(embeddings, dtype=np.float32)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    return index

# Store embeddings in FAISS index
index = store_embeddings_in_faiss(embeddings)
faiss.write_index(index, '/content/drive/MyDrive/Data/indices.faiss')

def query_document(query_text, model, index, sentence_list):
    """
    Queries the FAISS index with the query text and returns the top K similar sentences.
    """
    # Generate embedding for the query text
    query_embedding = model.encode(query_text)
    query_embedding = np.array([query_embedding], dtype=np.float32)

    # Search the FAISS index for similar embeddings
    D, I = index.search(query_embedding, k=5)  # Retrieve top 5 similar sentences

    # Retrieve and print the text of the top K similar sentences
    similar_sentences = [sentence_list[i] for i in I[0]]
    i=0
    for sentence in similar_sentences:
        print(i+1, sentence)
        i+=1

    return similar_sentences

# Example query





Number of sentences: 1
