# Vectorization

## with api key

#### using openai model

In [None]:
from openai import OpenAI

# OpenAI() picks up the key from the OPENAI_API_KEY environment variable and
# raises OpenAIError when neither that variable nor `api_key=` is provided.
client = OpenAI()

# The embeddings resource is `client.embeddings` (plural); `client.embedding`
# does not exist on the v1 client.
response = client.embeddings.create(
    input="What is vector search?",
    model="text-embedding-ada-002"
)

# v1 SDK responses are typed objects, so use attribute access rather than
# dict-style indexing.
print(response.data[0].embedding)


OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

* Can't use it because it requires an API key, and OpenAI API keys are paid.

#### groq

In [None]:
!pip install --upgrade groq




In [None]:
%env GROQ_API_KEY=""


env: GROQ_API_KEY=<REDACTED - a live key was exposed in this cell output; revoke and rotate it>


In [None]:
import os
import groq

# Read the Groq API key from the environment (never paste it into the notebook)
groq_api_key = os.getenv("GROQ_API_KEY")

# Initialize the Groq client
client = groq.Groq(api_key=groq_api_key)

# Try to generate an embedding. `gemma2-9b-it` is a text-generation model, so
# the API is expected to answer with HTTP 400; the error is caught and shown
# so that the failure is documented without aborting "Run All".
try:
    response = client.embeddings.create(
        model="gemma2-9b-it",  # chat model used to demonstrate the lack of embedding support
        input="What is vector search?"
    )
except groq.BadRequestError as err:
    print(f"Groq rejected the embedding request: {err}")
else:
    # SDK responses are typed objects: use attribute access, not dict indexing
    print(response.data[0].embedding)


BadRequestError: Error code: 400 - {'error': {'message': 'The model `gemma2-9b-it` does not support embeddings', 'type': 'invalid_request_error'}}

* Can't use it because **Groq** doesn't offer any embedding models; it only hosts text-generation models.

#### cohere

In [None]:
!pip install cohere




In [None]:
import os

import cohere

# Never hard-code the key in the notebook: read it from the environment and
# fail with a clear message instead of sending an empty key to the API.
cohere_api_key = os.environ.get("COHERE_API_KEY")
if not cohere_api_key:
    raise RuntimeError("Set the COHERE_API_KEY environment variable before running this cell.")

co = cohere.Client(cohere_api_key)

# `input_type="search_query"` tells the v3 embed models that the text is a
# query (as opposed to a document to be indexed).
response = co.embed(
    texts=["What is vector search?"],
    model="embed-english-v3.0",
    input_type="search_query"
)

# One embedding is returned per input text; show the one for our single query
print(response.embeddings[0])


[0.03488159, -0.024658203, 0.06390381, 0.03286743, -0.030563354, -0.012825012, 0.03012085, -0.043273926, 0.03591919, 0.01499939, -0.00036001205, 0.00019717216, -0.05895996, -0.030807495, -0.04272461, -0.014312744, 0.014701843, -0.039398193, -0.031311035, -0.017669678, 0.031173706, -0.0066719055, -0.012207031, 0.0231781, 0.00041913986, 0.021408081, -0.00605011, 0.0039520264, -0.027053833, -0.022232056, -0.013633728, 0.01637268, 0.009857178, -0.022064209, 0.08325195, 0.066101074, 0.021820068, -0.0033359528, 0.07269287, -0.032409668, 0.040496826, -0.02218628, -0.043395996, -0.00042819977, -0.019165039, -0.0211792, 0.013801575, -0.028884888, 0.039245605, 0.017181396, -0.014533997, 0.03982544, -0.0073280334, -0.023010254, -0.010498047, -0.027267456, 0.01083374, -0.015602112, 0.040039062, 0.04434204, -0.019256592, 0.013374329, 0.0037555695, -0.02633667, -0.015701294, 0.008132935, 0.019638062, 0.0016403198, -0.015991211, -0.058044434, -0.009681702, 0.015335083, 0.0029411316, 0.026229858, 0.01

## without api key

#### huggingface -> sentence-transformers/all-MiniLM-L6-v2

In [None]:
pip install sentence-transformers


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained model by its Hugging Face Hub name (no API key is used here)
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode a single string; the result is printed below as a flat numeric vector
embedding = model.encode('What is vector search?')
print(embedding)


[-1.01225495e-01 -1.85940247e-02 -4.19105105e-02 -1.74863674e-02
 -6.04779739e-03 -4.43157069e-02  8.79230574e-02  8.96686688e-02
 -7.53913671e-02 -2.09458619e-02 -1.05445944e-02  8.49106982e-02
  2.18059234e-02  1.09448601e-02  4.79193106e-02  6.22075349e-02
 -5.84316887e-02 -5.29652461e-02 -2.14621164e-02 -1.11409230e-03
 -4.78209183e-02  2.22933870e-02  1.89830400e-02  3.04225367e-02
  3.26103382e-02 -2.36263294e-02 -1.16369324e-02  2.70534661e-02
 -1.21020349e-02 -1.91449393e-02 -6.05077222e-02  8.22210312e-02
  3.17204855e-02  7.41585270e-02 -8.20398554e-02  3.95620540e-02
 -4.68588136e-02  1.57278478e-02  7.71197528e-02 -1.20251207e-02
  1.97246410e-02 -4.98133712e-02  2.19064839e-02 -5.48141785e-02
  8.61734524e-03 -5.77492360e-03 -9.28892866e-02 -4.53039557e-02
  5.77700213e-02  1.09613058e-03  1.70195326e-02 -5.17448075e-02
  4.59005684e-02 -7.27314055e-02 -8.09270516e-02 -1.00676492e-01
 -2.56426670e-02  4.66742478e-02 -1.24654341e-02 -9.09025595e-02
  3.19055915e-02 -4.16801

* it works perfectly to vectorise the input text

### transformers -> AutoModel, AutoTokenizer


#### huggingface -> sentence-transformers/all-mpnet-base-v2

In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

# Load the embedding model and its tokenizer
model_name = "sentence-transformers/all-mpnet-base-v2"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Text to embed
text = "This is a sample text to embed."

# Encode text (truncate anything longer than the model's maximum input length)
encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

# Run the transformer without tracking gradients (inference only).
# model(...)[0] holds one hidden-state vector PER TOKEN, not a sentence vector.
with torch.no_grad():
    token_embeddings = model(**encoded_input)[0]

# Mean-pool the token vectors into one vector per input text, using the
# attention mask so that padding tokens do not contribute to the average.
mask = encoded_input["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
sentence_embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

# L2-normalise so that dot product and cosine similarity coincide
embeddings = F.normalize(sentence_embeddings, p=2, dim=1).numpy()

print(embeddings.shape)  # (number of texts, hidden size)
print(embeddings)


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

[[[ 0.01316915 -0.22301438  0.00281189 ...  0.11696119 -0.098828
   -0.100347  ]
  [-0.03007325 -0.02289394 -0.13221411 ...  0.1366807  -0.11478014
   -0.02674907]
  [-0.07930624 -0.17268611 -0.1157254  ...  0.03090155 -0.1421602
   -0.00160188]
  ...
  [-0.06055696 -0.05994808 -0.11521682 ... -0.01276925 -0.20267352
   -0.12080637]
  [-0.0024899  -0.20914543  0.02022442 ...  0.13068455 -0.06564921
   -0.10124038]
  [-0.00583597 -0.18217182  0.02568251 ...  0.19852434 -0.08623807
   -0.05485805]]]


* it also works fine

### gensim.models -> Word2Vec

In [None]:
from gensim.models import Word2Vec

# Sample sentences (already tokenised: one list of words per sentence)
sentences = [["what", "is", "vector", "search"], ["another", "sentence"]]

# Train a Word2Vec model.
# Training is stochastic: a fixed seed together with a single worker thread
# keeps the vectors stable between runs (multi-threaded training adds
# scheduling jitter). NOTE(review): full cross-process determinism may also
# require setting PYTHONHASHSEED - confirm against the gensim docs.
model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,  # keep every word, even those seen only once
    workers=1,
    seed=42,
)

# Get the embedding for a word that is present in the training vocabulary
embedding = model.wv['vector']

print(embedding)


[-8.2426779e-03  9.2993546e-03 -1.9766092e-04 -1.9672764e-03
  4.6036304e-03 -4.0953159e-03  2.7431143e-03  6.9399667e-03
  6.0654259e-03 -7.5107943e-03  9.3823504e-03  4.6718083e-03
  3.9661205e-03 -6.2435055e-03  8.4599797e-03 -2.1501649e-03
  8.8251876e-03 -5.3620026e-03 -8.1294188e-03  6.8245591e-03
  1.6711927e-03 -2.1985089e-03  9.5136007e-03  9.4938548e-03
 -9.7740470e-03  2.5052286e-03  6.1566923e-03  3.8724565e-03
  2.0227872e-03  4.3050171e-04  6.7363144e-04 -3.8206363e-03
 -7.1402504e-03 -2.0888723e-03  3.9238976e-03  8.8186832e-03
  9.2591504e-03 -5.9759365e-03 -9.4026709e-03  9.7643770e-03
  3.4297847e-03  5.1661171e-03  6.2823449e-03 -2.8042626e-03
  7.3227035e-03  2.8302716e-03  2.8710044e-03 -2.3803699e-03
 -3.1282497e-03 -2.3701417e-03  4.2764368e-03  7.6057913e-05
 -9.5842788e-03 -9.6655441e-03 -6.1481940e-03 -1.2856961e-04
  1.9974159e-03  9.4319675e-03  5.5843508e-03 -4.2906962e-03
  2.7831673e-04  4.9643586e-03  7.6983096e-03 -1.1442233e-03
  4.3234206e-03 -5.81437

* it is working fine

### gensim.models -> FastText

In [None]:
from gensim.models import FastText

# Sample sentences (already tokenised: one list of words per sentence)
sentences = [["what", "is", "vector", "search"], ["another", "sentence"]]

# Train a FastText model.
# Training is stochastic: a fixed seed together with a single worker thread
# keeps the vectors stable between runs (multi-threaded training adds
# scheduling jitter). NOTE(review): full cross-process determinism may also
# require setting PYTHONHASHSEED - confirm against the gensim docs.
model = FastText(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,  # keep every word, even those seen only once
    workers=1,
    seed=42,
)

# Get the embedding for a word
embedding = model.wv['vector']

print(embedding)


[-1.5408301e-03 -4.9328146e-04  1.7393114e-03 -1.3100719e-03
 -3.4143988e-03 -1.0303302e-03 -8.5281295e-04  7.9981465e-04
  2.6819198e-03  4.3569750e-04  1.9210356e-03  6.1423285e-04
 -3.8632530e-05 -7.6028693e-04  1.4215680e-03 -8.0906897e-04
 -3.4251285e-04 -6.3971023e-04 -1.0659266e-03  1.7595564e-03
 -5.8471155e-04 -8.5550535e-04  1.7978337e-03  2.4926551e-03
  4.1390481e-04  1.3032620e-03 -4.2533968e-04  1.0619773e-06
 -8.9843634e-05 -6.2009750e-04  5.0542213e-04 -4.2737718e-04
 -8.1521587e-04 -2.1871706e-03  1.6322605e-03  6.5732392e-04
  3.3991726e-04  1.0092306e-03  2.0038259e-04  4.5933379e-04
  2.9547638e-04 -1.1998692e-03 -5.3077517e-04  8.6982967e-04
 -1.3409401e-04  2.3891375e-04  1.4857424e-03  1.2092101e-03
  2.8555540e-04 -3.3923276e-04 -1.0517765e-03 -2.1772275e-03
 -1.9793510e-03 -1.8537411e-03 -1.8320656e-03 -4.7526302e-04
  2.9657600e-05  1.7344997e-03  3.4186593e-04  5.8577763e-04
 -1.8919497e-04 -1.3176784e-03 -4.5953383e-04 -3.4713111e-04
 -1.1433285e-03  3.49333

* it is working fine

# vectorization and search

In [None]:
! pip install pinecone-client cohere tiktoken


Collecting pinecone-client
  Using cached pinecone_client-6.0.0-py3-none-any.whl.metadata (3.4 kB)
Using cached pinecone_client-6.0.0-py3-none-any.whl (6.7 kB)
Installing collected packages: pinecone-client
Successfully installed pinecone-client-6.0.0


In [None]:
!pip uninstall pinecone-client -y
!pip install pinecone


Found existing installation: pinecone-client 6.0.0
Uninstalling pinecone-client-6.0.0:
  Successfully uninstalled pinecone-client-6.0.0


#### sentence_transformers - embedding a .txt file and exporting into a different file

In [None]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained SentenceTransformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Read text from a file
file_path = "/content/InterviewAI.txt"  # Update this path if needed

with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

# Generate vector embeddings.
# The whole file is passed as ONE string, so a single vector is produced for
# the entire document (see the flat vector printed below).
# NOTE(review): the model presumably truncates input beyond its maximum
# sequence length, in which case only the start of a long file is represented
# - confirm; the chunked variant in the next section avoids this.
embeddings = model.encode(text)

# Print the embeddings
print(embeddings)

# Optionally, save embeddings to a file
import numpy as np
np.save("/content/embeddings.npy", embeddings)  # Saves the embeddings as a .npy file


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


[-1.04820989e-01  1.03785276e-01  7.90972412e-02 -1.00783771e-02
  3.50263491e-02 -8.26185942e-02  5.04296049e-02  2.14332100e-02
 -8.30293298e-02 -5.10687791e-02 -8.93853158e-02 -9.83044580e-02
 -4.49071079e-02  3.34324501e-02 -3.71591747e-02  3.68269868e-02
  1.16976695e-02 -6.40218481e-02 -1.03100225e-01  7.66067579e-02
 -2.11271271e-02 -4.91322316e-02  7.29440898e-02 -4.08595148e-03
 -1.05583873e-02  6.24688901e-02  1.10107407e-01  7.69024491e-02
 -1.71896685e-02 -5.73665611e-02 -6.52296990e-02  2.48742960e-02
  9.34705362e-02 -4.26395424e-02  5.96651733e-02  8.50777701e-02
 -6.64869621e-02 -4.87590991e-02  7.48110935e-04 -1.15142073e-02
  8.38959403e-03 -7.19799697e-02  1.42125068e-02 -9.49594472e-03
 -7.63253961e-03 -1.06776878e-01  2.56259646e-02 -1.51814390e-02
 -6.56835502e-03  4.13667131e-03 -1.14555471e-01 -7.76250586e-02
  1.04549620e-02  2.02952102e-02 -1.38145976e-03  3.39577161e-02
 -2.68809590e-02 -6.97758496e-02 -1.21382074e-02 -4.38325182e-02
 -1.73075963e-02 -1.47618

#### sentence_transformers - creating chunks and then embedding a .txt file and exporting into a different file

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the pre-trained SentenceTransformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Function to split text into smaller chunks
def chunk_text(text, max_length=256):
    """Group the sentences of ``text`` into chunks of at most ``max_length`` characters.

    The text is split on ``". "``. Consecutive sentences are joined with a
    single space for as long as the joined chunk stays within ``max_length``.

    Args:
        text: The text to split.
        max_length: Maximum chunk length in characters (not tokens).

    Returns:
        A list of non-empty chunk strings. Empty or whitespace-only input
        yields ``[]``. A single sentence longer than ``max_length`` is never
        cut; it is returned as its own, oversized chunk.
    """
    pieces = text.split(". ")
    last_index = len(pieces) - 1
    chunks = []
    current_chunk = ""

    for index, piece in enumerate(pieces):
        sentence = piece.strip()
        if not sentence:
            continue  # nothing but whitespace between two separators
        if index != last_index:
            # split() consumed the period of every piece except the last one;
            # the last piece keeps whatever ending the text really had
            sentence += "."

        # Measure the chunk exactly as it would be stored, separator included
        candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
        if len(candidate) <= max_length:
            current_chunk = candidate
        else:
            if current_chunk:  # never emit an empty chunk
                chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)  # Append last chunk

    return chunks

# Read text from file
file_path = "/content/InterviewAI.txt"  # Update this path if needed

with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

# Chunk the text (max_length is measured in characters: chunk_text compares len() of strings)
chunks = chunk_text(text, max_length=256)  # Adjust max_length as needed

# Generate vector embeddings for each chunk (one row per chunk, see the printed shape)
embeddings = model.encode(chunks)

# Save embeddings (same path as the previous section, so that file is overwritten)
np.save("/content/embeddings.npy", embeddings)

# Print number of chunks and shape of embeddings
print(f"Total chunks: {len(chunks)}")
print(f"Embeddings shape: {embeddings.shape}")

# Optional: Download the embeddings (google.colab is only importable inside Colab)
from google.colab import files
files.download("/content/embeddings.npy")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Total chunks: 91
Embeddings shape: (91, 384)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

##### API KEYs

In [None]:
# COHERE_API_KEY = ""
# YOUR_PINECONE_API_KEY = ""
# PINECONE_ENV = ""


In [None]:
# import os
# os.environ["PINECONE_API_KEY"] = ""


In [None]:
# print(repr(os.environ["PINECONE_API_KEY"]))


'<REDACTED - a live Pinecone API key was exposed in this cell output; revoke and rotate it>'


## ChromaDB

In [None]:
! pip install chromadb


Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.19.1-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.30.0-py

In [None]:
!pip install PyPDF2


Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import chromadb
import numpy as np
from sentence_transformers import SentenceTransformer
import PyPDF2 # import the PyPDF2 library

# Load the pre-trained SentenceTransformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Function to split text into smaller chunks
def chunk_text(text, max_length=256):
    """Group the sentences of ``text`` into chunks of at most ``max_length`` characters.

    The text is split on ``". "``. Consecutive sentences are joined with a
    single space for as long as the joined chunk stays within ``max_length``.

    Args:
        text: The text to split.
        max_length: Maximum chunk length in characters (not tokens).

    Returns:
        A list of non-empty chunk strings. Empty or whitespace-only input
        yields ``[]``. A single sentence longer than ``max_length`` is never
        cut; it is returned as its own, oversized chunk.
    """
    pieces = text.split(". ")
    last_index = len(pieces) - 1
    chunks = []
    current_chunk = ""

    for index, piece in enumerate(pieces):
        sentence = piece.strip()
        if not sentence:
            continue  # nothing but whitespace between two separators
        if index != last_index:
            # split() consumed the period of every piece except the last one;
            # the last piece keeps whatever ending the text really had
            sentence += "."

        # Measure the chunk exactly as it would be stored, separator included
        candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
        if len(candidate) <= max_length:
            current_chunk = candidate
        else:
            if current_chunk:  # never emit an empty chunk
                chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)  # Append last chunk

    return chunks

# # Read text from file
# file_path = "/content/InterviewAI.txt"  # Update this path if needed

# with open(file_path, "r", encoding="utf-8") as file:
#     text = file.read()

# Read text from PDF file
file_path = "/content/svm.pdf"  # Update this path if needed

with open(file_path, "rb") as file:  # Open in binary read mode 'rb'
    pdf_reader = PyPDF2.PdfReader(file)
    # Join pages with a newline so the last word of one page is not fused with
    # the first word of the next; `or ""` covers pages that yield no text.
    text = "\n".join((page.extract_text() or "") for page in pdf_reader.pages)

# Chunk the text
chunks = chunk_text(text, max_length=256)  # Adjust max_length as needed
if not chunks:
    # e.g. a scanned PDF without a text layer: there is nothing to embed or store
    raise ValueError(f"No text could be extracted from '{file_path}'")

# Generate vector embeddings for each chunk
embeddings = model.encode(chunks)

# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path="./chroma_db")  # Creates a persistent database

# Create or get a collection
collection = chroma_client.get_or_create_collection(name="text_embeddings")

# Add data to ChromaDB.
# The collection is persistent and the IDs are positional ("0", "1", ...), so a
# re-run - or ingesting a different file - meets IDs that already exist.
# upsert() overwrites those records; add() would leave the old content in place.
# NOTE: records whose index is beyond the new chunk count are NOT removed.
BATCH_SIZE = 100  # write in moderate batches instead of one call per chunk
for start in range(0, len(chunks), BATCH_SIZE):
    batch_chunks = chunks[start:start + BATCH_SIZE]
    collection.upsert(
        ids=[str(i) for i in range(start, start + len(batch_chunks))],  # Unique ID for each chunk
        embeddings=embeddings[start:start + BATCH_SIZE].tolist(),  # Convert NumPy array to list
        metadatas=[{"text": chunk} for chunk in batch_chunks]  # Store original text as metadata
    )

print("Embeddings successfully stored in ChromaDB!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embeddings successfully stored in ChromaDB!


In [None]:
print(f"Successfully stored {len(chunks)} text chunks in ChromaDB!")


Successfully stored 208 text chunks in ChromaDB!


In [None]:
# query_text = "What is the interview process like?"
query_text = "General Requirements."  # Example query
# Embed the query with the same `model` object that produced the stored vectors
query_embedding = model.encode([query_text])

# Search in ChromaDB
results = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=1  # Return only the single closest chunk
)

# results["metadatas"] holds one list per query; we sent one query, hence [0]
print("Search Results:")
for result in results["metadatas"][0]:
    print(result["text"])


Search Results:
Information that will permit the Office to make a
reasonable cost/benefit analysis as to the proposed Qualified
Production, to determine whether such project qualifies for
Incentives, and to determine the maximum amount of Incentives to
be awarded.
8.


In [None]:
# Fetch all data from the collection (no ids/filters given, so every record is returned)
stored_data = collection.get()

print("Stored Data Keys:", stored_data.keys())  # Shows available keys
# Caution: this prints every stored record, which is very long for large collections
print("Stored Data Sample:", stored_data)  # Full data


Stored Data Keys: dict_keys(['ids', 'embeddings', 'documents', 'uris', 'data', 'metadatas', 'included'])
Stored Data Sample: {'ids': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '13

In [None]:
# IDs are the chunk positions stored as strings ("0", "1", ...) by the ingest cell
doc_id = "1"  # Change based on your stored IDs
document = collection.get(ids=[doc_id])

# As the printed result shows, 'embeddings' is None here (only documents and
# metadatas are included); the chunk text lives under metadatas[...]["text"].
print("Retrieved Document:", document)


Retrieved Document: {'ids': ['1'], 'embeddings': None, 'documents': [None], 'uris': None, 'data': None, 'metadatas': [{'text': 'Film Office)\nALABAMA DEPARTMENT OF COMMERCE\nADMINISTRATIVE CODE\nCHAPTER 281-3-1\nALABAMA FILM OFFICE INCENTIVES\nTABLE OF CONTENTS\n281-3-1-.01 Scope Of Rules\n281-3-1-.02 Definitions\n281-3-1-.03 General Requirements\n281-3-1-.04 Selection Of State - Certified\nProductions\n281-3-1-.05 Continuing Requirements\n281-3-1-.06 Final Incentives Audit And The Report\n281-3-1-.07 Payment Of The Rebate And Failure To\nMeet Qualifications\n281-3-1-.08 Production Expenditures\n281-3-1-.09 Chart Of Production Expenditures\n281-3-1-.01 Scope Of Rules.\n(1) Division 281-3-l of the Administrative Code sets\nforth the rules to be used by the Department of Commerce in the\nadministration of Acts of Alabama 2009-144, as codified in Code\nof Ala.'}], 'included': [<IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [None]:
from sentence_transformers import SentenceTransformer

# Load model (same model name that the ingest cells use to embed the chunks)
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Query text
# query_text = "What is the interview process like?"
query_text = "What are the types available in SVM?"
query_embedding = model.encode([query_text])

# Search in ChromaDB (`collection` comes from the ingest cell run earlier in this session)
results = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=3  # Get top 3 similar chunks
)

# Prints the raw result dict: ids, metadatas (chunk text) and distances per query
print("Query Results:", results)


Query Results: {'ids': [['41', '42', '51']], 'embeddings': None, 'documents': [[None, None, None]], 'uris': None, 'data': None, 'metadatas': [[{'text': 'Additionally, since some features had high variance, I used feature selection techniques like mutual information and SHAP-based ranking to retain only the most informative predictors.'}, {'text': 'Model performance was another challenge, as initial models suffered from imbalanced classes. I tackled this using a combination of SMOTE for oversampling and class-weighted loss functions in neural networks.'}, {'text': 'For real-time inference, I optimize deployment using ONNX or TensorRT to reduce latency while maintaining accuracy. Additionally, I use feature stores like Feast to ensure consistency in feature engineering across training and production environments.'}]], 'distances': [[1.467194511098861, 1.5225515229172002, 1.5240218344674767]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <Includ

In [None]:
!pip install pdfplumber


Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import chromadb
import numpy as np
from sentence_transformers import SentenceTransformer
import pdfplumber

# Load the pre-trained SentenceTransformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Function to split text into smaller chunks
# def chunk_text(text, max_length=256):
#     sentences = text.split(". ")  # Split by sentences
#     chunks = []
#     current_chunk = ""

#     for sentence in sentences:
#         if len(current_chunk) + len(sentence) <= max_length:
#             current_chunk += sentence + ". "
#         else:
#             chunks.append(current_chunk.strip())
#             current_chunk = sentence + ". "

#     if current_chunk:
#         chunks.append(current_chunk.strip())  # Append last chunk

#     return chunks

# Read text from PDF file using pdfplumber
file_path = "/content/2022_04_RULES_REGS_281-3-1-1 (1).pdf"  # Update this path if needed

text = ""
try:
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            extracted_text = page.extract_text()
            if extracted_text:
                text += extracted_text + "\n"  # Add newline between pages
            else:
                print(f"Warning: No text extracted from page {page.page_number}")
except FileNotFoundError:
    print(f"Error: File '{file_path}' not found.")
    # Re-raise instead of calling exit(): exit() shuts down the notebook kernel,
    # while a raised exception only stops this cell.
    raise

# Debugging: Print the extracted text to verify
print("Extracted Text (first 500 characters):")
print(text[:500])
print(f"Total length of extracted text: {len(text)}")

# Chunk the text.
# chunk_text() is defined in an earlier cell (its copy in this cell is commented
# out), so that cell must have been run in the current kernel session.
chunks = chunk_text(text, max_length=256)  # Adjust max_length as needed
if not chunks:
    raise ValueError(f"No text could be extracted from '{file_path}'")

# Debugging: Print a few chunks to verify (printing all of them floods the output)
PREVIEW_CHUNKS = 5
print("\nChunks:")
for i, chunk in enumerate(chunks[:PREVIEW_CHUNKS]):
    print(f"Chunk {i}: '{chunk}' (Length: {len(chunk)})")

# Generate vector embeddings for each chunk
embeddings = model.encode(chunks)

# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path="./chroma_db")  # Creates a persistent database

# Create or get a collection
collection = chroma_client.get_or_create_collection(name="text_embeddings")

# Add data to ChromaDB.
# The collection is persistent and the IDs are positional ("0", "1", ...), so a
# re-run - or ingesting a different file - meets IDs that already exist.
# upsert() overwrites those records; add() would leave the old content in place.
# NOTE: records whose index is beyond the new chunk count are NOT removed.
BATCH_SIZE = 100  # write in moderate batches instead of one call per chunk
for start in range(0, len(chunks), BATCH_SIZE):
    batch_chunks = chunks[start:start + BATCH_SIZE]
    collection.upsert(
        ids=[str(i) for i in range(start, start + len(batch_chunks))],  # Unique ID for each chunk
        embeddings=embeddings[start:start + BATCH_SIZE].tolist(),  # Convert NumPy array to list
        metadatas=[{"text": chunk} for chunk in batch_chunks]  # Store original text as metadata
    )

print("Embeddings successfully stored in ChromaDB!")
print(f"Successfully stored {len(chunks)} text chunks in ChromaDB!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Extracted Text (first 500 characters):
Commerce (Formerly Ala. Development Office Chapter 281-3-1
And Ala. Tourism Department/Ala. Film Office)
ALABAMA DEPARTMENT OF COMMERCE
ADMINISTRATIVE CODE
CHAPTER 281-3-1
ALABAMA FILM OFFICE INCENTIVES
TABLE OF CONTENTS
281-3-1-.01 Scope Of Rules
281-3-1-.02 Definitions
281-3-1-.03 General Requirements
281-3-1-.04 Selection Of State - Certified
Productions
281-3-1-.05 Continuing Requirements
281-3-1-.06 Final Incentives Audit And The Report
281-3-1-.07 Payment Of The Rebate And Failure To
Meet 
Total length of extracted text: 71458

Chunks:
Chunk 0: 'Commerce (Formerly Ala. Development Office Chapter 281-3-1
And Ala. Tourism Department/Ala.' (Length: 91)
Chunk 1: 'Film Office)
ALABAMA DEPARTMENT OF COMMERCE
ADMINISTRATIVE CODE
CHAPTER 281-3-1
ALABAMA FILM OFFICE INCENTIVES
TABLE OF CONTENTS
281-3-1-.01 Scope Of Rules
281-3-1-.02 Definitions
281-3-1-.03 General Requirements
281-3-1-.04 Selection Of State - Certified
Productions
281-3-1-.05 Continu



Embeddings successfully stored in ChromaDB!
Successfully stored 208 text chunks in ChromaDB!
