In [1]:

from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
import torch
from langchain.document_loaders import DirectoryLoader




In [2]:
import fitz  # PyMuPDF
from pathlib import Path

# Destination for the extracted text; the later DirectoryLoader cell reads
# everything under the "data" directory.
txt_path = Path('data/cheeeseeeeeeeeeee.txt')

# Open the PDF file and gather the text of every page.
# Joining once avoids rebuilding the string on each page.
with fitz.open('cheeeseeeeeeeeeee.pdf') as doc:
    text = "".join(page.get_text() for page in doc)

# Create the output directory if needed: open(..., 'w') does not create
# missing parent directories and would raise FileNotFoundError on a fresh checkout.
txt_path.parent.mkdir(parents=True, exist_ok=True)

# Write the extracted text to a text file
with open(txt_path, 'w', encoding='utf-8') as f:
    f.write(text)

In [3]:
# Read every file found under the data directory into LangChain documents
Data_path = "data"
loader = DirectoryLoader(path=Data_path)
documents = loader.load()

In [4]:
# Splitter settings, in characters: target chunk length and the overlap
# carried over between consecutive chunks.
CHUNK_SIZE = 750
CHUNK_OVERLAP = 150

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(documents)

In [5]:
# Keep only the raw text of each chunk; the Document wrapper is not needed below
text_contents = [document.page_content for document in chunks]

In [6]:
# Put each chunk on a single line by turning every newline into a space
text_contents = [piece.replace('\n', ' ') for piece in text_contents]
text_contents

['Fermentology • Book  A Brief History of Cheese  [Essay]  Charles Ludington  Published on: Oct 25, 2021  License: Creative Commons Attribution 4.0 International License (CC-BY 4.0)  Fermentology • Book  A Brief History of Cheese [Essay]  2  Let’s begin with a little bit of history. Cheese is one of the oldest foods we humans have  produced, possibly dating from the beginning of sheep and cattle herding êé,ééé years  ago. That said, the discovery of cheese making was probably accidental. It’s likely that  the curdling action of rennet was discovered when a herdsman poured milk into a sack or  pouch made of an animal’s stomach, and this may have happened independently in',
 'pouch made of an animal’s stomach, and this may have happened independently in  Europe, the Middle East, or Central Asia. The Ancient Sumerians and Egyptians certainly  made cheese, and Homer speaks of cheese in both the Illiad and the Odyssey. The oldest  archaelogical example of solid cheese was found in an Ancien

In [7]:
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf

# Checkpoint shared by the tokenizer and the encoder
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# from_pt=True converts the checkpoint's PyTorch weights for the TensorFlow model class
model = TFAutoModel.from_pretrained(model_name, from_pt=True)


def _embed_chunk(raw_chunk):
    """Return a flat NumPy embedding vector for a single chunk of text.

    The chunk is tokenized (truncated to at most 512 tokens), passed through
    ``model``, and pooled: ``pooler_output`` when the model provides one,
    otherwise the mean of ``last_hidden_state`` over the sequence axis.
    """
    # NOTE(review): sentence-transformers checkpoints are usually pooled with an
    # attention-masked mean of last_hidden_state rather than pooler_output —
    # confirm. Any change here must be mirrored where the query is embedded,
    # otherwise chunk and query vectors will not be comparable.
    encoded = tokenizer(str(raw_chunk), padding=True, truncation=True, return_tensors="tf", max_length=512)
    result = model(**encoded)
    if result.pooler_output is not None:
        pooled = result.pooler_output
    else:
        pooled = tf.reduce_mean(result.last_hidden_state, axis=1)
    return pooled.numpy().flatten()


# One embedding per chunk, in the same order as text_contents
chunk_embeddings = [_embed_chunk(chunk) for chunk in text_contents]



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['embeddings.position_ids']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [8]:
from annoy import AnnoyIndex

# Fail with a clear message instead of an opaque IndexError on
# chunk_embeddings[0] when no chunks were embedded (e.g. empty data directory).
if not chunk_embeddings:
    raise ValueError("chunk_embeddings is empty; nothing to index")

# The index dimension is taken from the first embedding; all embeddings are
# assumed to share it.
dimension = chunk_embeddings[0].shape[0]

# Initialize the Annoy index with the specified dimension and metric
index = AnnoyIndex(dimension, 'angular')

# Item ids are the positions in chunk_embeddings, so an id returned by a
# query can be mapped back to the chunk at the same position.
for i, embedding in enumerate(chunk_embeddings):
    index.add_item(i, embedding)

# Build the index with a specified number of trees
index.build(50)

# Persist the index so it can be reloaded without rebuilding
index.save('chunk_embeddings.ann')

# Optional: To verify
print("Number of items in the index:", index.get_n_items())

Number of items in the index: 27


In [9]:
# Map each chunk's position in text_contents to its text, then show the first entry
chunk_index_to_text = dict(enumerate(text_contents))
print(chunk_index_to_text[0])

Fermentology • Book  A Brief History of Cheese  [Essay]  Charles Ludington  Published on: Oct 25, 2021  License: Creative Commons Attribution 4.0 International License (CC-BY 4.0)  Fermentology • Book  A Brief History of Cheese [Essay]  2  Let’s begin with a little bit of history. Cheese is one of the oldest foods we humans have  produced, possibly dating from the beginning of sheep and cattle herding êé,ééé years  ago. That said, the discovery of cheese making was probably accidental. It’s likely that  the curdling action of rennet was discovered when a herdsman poured milk into a sack or  pouch made of an animal’s stomach, and this may have happened independently in


In [10]:
# Question to look up in the indexed chunks
query_text = "what protein does pizza have?"

# Embed the query with the same tokenizer and model, using the same pooling
# rule as the chunk embeddings: pooler_output when present, otherwise the
# mean of last_hidden_state over the sequence axis.
query_input = tokenizer(query_text, padding=True, truncation=True, return_tensors="tf")
query_output = model(**query_input)
if query_output.pooler_output is not None:
    query_embedding = query_output.pooler_output
else:
    query_embedding = tf.reduce_mean(query_output.last_hidden_state, axis=1)

# Flatten the TensorFlow tensor into a 1-D NumPy vector
query_embedding_np = query_embedding.numpy().flatten()

print("Query embedding shape:", query_embedding_np.shape)

# Retrieval settings: how many neighbours to fetch and whether to return distances
n = 3
include_distances = True

# Reopen the index that was saved to disk
index = AnnoyIndex(dimension, 'angular')
index.load('chunk_embeddings.ann')  # must match the path the index was saved to

# Look up the n closest chunks to the query vector
nearest_neighbors = index.get_nns_by_vector(
    query_embedding_np,
    n,
    include_distances=include_distances,
)

# With distances the result is an (indices, distances) pair; without, just the indices
if not include_distances:
    indices = nearest_neighbors
    print("Indices of Nearest Neighbors:", indices)
else:
    indices, distances = nearest_neighbors
    print("Indices of Nearest Neighbors:", indices)
    print("Distances:", distances)

# Show the text of every retrieved chunk
for idx in indices:
    print(f"Chunk {idx} text/reference: {chunk_index_to_text[idx]}")

Query embedding shape: (384,)
Indices of Nearest Neighbors: [25, 26, 24]
Distances: [0.5348873138427734, 0.6018552780151367, 0.6630282998085022]
Chunk 25 text/reference: food that was most often craved was pizza [2]. Certain readers extrapolated from this that  pizza’s allegedly addictive quality came from casomorphins, the tiny protein crystals  that result from the breakdown of casein, the primary protein of milkfat. True,  casomorphins can activate the human opioid system, just as drugs like morphine do, but  the degree is negligible and in most cases of cheese eating doesn’t occur at all. More  importantly, our brain registers with delight when it senses protein, fat, and salt, which  most cheeses contain to some degree. But the real reason for pizza’s position atop the  “craveability” index is the combination of carbohydrates and sugar (in the dough and
Chunk 26 text/reference: “craveability” index is the combination of carbohydrates and sugar (in the dough and  tomato sauce). In 