Generate Sentence-Transformer Embeddings and Store Them in ChromaDB (with an Ollama LLaMA3 client)

In [1]:
import pandas as pd

# Example DataFrame: two text columns plus a metadata column and a label column.
data = dict(
    Column1=['Text 1', 'Text 2', 'Text 3'],
    Column2=['Additional text 1', 'Additional text 2', 'Additional text 3'],
    Column3=['Metadata 1', 'Metadata 2', 'Metadata 3'],
    Column4=['Label 1', 'Label 2', 'Label 3'],
)

# Each dict key becomes a column; each list supplies that column's rows.
df = pd.DataFrame.from_dict(data)


In [2]:
# Join the two text columns with a single space; this is the text that gets embedded.
df['combined_text'] = df['Column1'].str.cat(df['Column2'], sep=' ')


In [3]:
from langchain_community.llms import Ollama
from sentence_transformers import SentenceTransformer

# Initialize the Ollama model.
# NOTE(review): `llama3` is not used by the embedding step below -- the vectors come
# from the SentenceTransformer model, not from LLaMA3. It is kept in case other
# cells rely on it.
llama3 = Ollama(model="llama3")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings with a single batched call: encode() accepts a list of
# sentences, so the model is invoked once for the whole column instead of once per row.
combined_texts = df['combined_text'].tolist()
embedding_matrix = embedding_model.encode(combined_texts)

# Store one 1-D vector per row (object column), preserving the per-row layout that
# the rest of the notebook reads through row['embeddings'].
df['embeddings'] = list(embedding_matrix)




In [4]:
# Inspect the embedding column (one vector per row).
df.loc[:, 'embeddings']

0    [-0.008153303, 0.027126355, 0.009897539, 0.039...
1    [0.01855267, 0.04531752, 0.015019432, 0.040201...
2    [-0.020240506, 0.01578366, -0.03580045, 0.0033...
Name: embeddings, dtype: object

In [5]:
import chromadb

# Settings for the on-disk Chroma store. The path is relative to the notebook's
# working directory.
CHROMA_PATH = "my_chroma_db"
collection_name = "my_collection"

# Initialize ChromaDB PersistentClient.
# (An earlier draft of this cell used chromadb.Client() with create_collection();
# it was superseded by the persistent client and get_or_create_collection() below.)
try:
    client = chromadb.PersistentClient(path=CHROMA_PATH)
except Exception as e:
    # Print a short, readable message, then re-raise so the cell still fails loudly.
    print(f"Failed to initialize ChromaDB PersistentClient: {e}")
    raise

# Create or connect to a collection in ChromaDB.
# get_or_create_collection() lets this cell be re-run against an existing store.
try:
    collection = client.get_or_create_collection(name=collection_name)
except Exception as e:
    print(f"Failed to create or connect to collection: {e}")
    raise

In [6]:
# Display the collection handle to confirm it was created or opened.
collection

Collection(id=2ec75675-6a2f-4650-81b1-c0d3186fc855, name=my_collection)

In [7]:
# Inject data into ChromaDB
import numpy as np

# upsert() is used instead of add(): the collection is backed by a persistent store,
# so re-running this cell meets IDs that already exist. upsert() creates missing IDs
# and overwrites existing ones, which keeps the cell safe to re-run and keeps the
# stored vectors in sync with the DataFrame.
for idx, row in df.iterrows():
    embedding = row['embeddings']
    # Chroma is handed a plain Python list; NumPy vectors are converted first.
    embedding_list = embedding.tolist() if isinstance(embedding, np.ndarray) else embedding
    print(f"Processing index: {idx}")  # Progress trace, one line per row
    try:
        collection.upsert(
            ids=[str(idx)],  # Unique ID for each entry (the DataFrame index as a string)
            embeddings=[embedding_list],  # List of embeddings
            metadatas=[{
                "Column3": row['Column3'],  # Add any metadata you want to store
                "Column4": row['Column4']
            }],
        )
    except Exception as e:
        # Report which row failed, then re-raise so the failure is not hidden.
        print(f"Failed to add data to collection at index {idx}: {e}")
        raise

Processing index: 0
Processing index: 1
Processing index: 2


In [11]:
# Embed the query with the same model that produced the stored vectors, then ask
# the collection for its closest matches.
query_text = "Your query text here"
query_vector = embedding_model.encode(query_text)
query_embedding = query_vector.tolist()

query_kwargs = {
    "query_embeddings": [query_embedding],
    "n_results": 2,  # Number of closest matches to return
}
results = collection.query(**query_kwargs)

print(results)


{'ids': [['0', '1']], 'distances': [[1.311300161526672, 1.324933541568284]], 'metadatas': [[{'Column3': 'Metadata 1', 'Column4': 'Label 1'}, {'Column3': 'Metadata 2', 'Column4': 'Label 2'}]], 'embeddings': None, 'documents': [[None, None]], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}
