In [2]:
# Standard library imports
import os
import logging
from typing import List, Dict, Any, Optional

# Data processing imports
import numpy as np
import pandas as pd

# Deep learning imports
import torch
from transformers import (
    AutoTokenizer, 
    AutoModel,
    AutoModelForSequenceClassification
)
from sentence_transformers import SentenceTransformer

# Vector database imports
from qdrant_client import QdrantClient, models
from qdrant_client.http.models import (
    VectorParams, 
    Distance, 
    PointStruct, 
    Filter, 
    FieldCondition, 
    MatchValue
)

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Vectorization

In [3]:
def initialize_model(model_name: str = "havocy28/VetBERT") -> tuple:
    """
    Initialize the VetBERT model and tokenizer.
    
    Args:
        model_name (str): Name of the pretrained model
        
    Returns:
        tuple: (tokenizer, model, device)
        
    Raises:
        RuntimeError: If model initialization fails
    """
    try:
        # Initialize tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
        
        # Set up device
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        
        logger.info(f"Model initialized successfully on {device}")
        return tokenizer, model, device
        
    except Exception as e:
        logger.error(f"Failed to initialize model: {str(e)}")
        raise RuntimeError(f"Model initialization failed: {str(e)}")

# Initialize model and tokenizer
try:
    tokenizer, model, device = initialize_model()
except Exception as e:
    logger.error(f"Model setup failed: {str(e)}")
    raise

2025-09-12 17:00:41,362 - INFO - Model initialized successfully on cpu


In [6]:
def get_vetbert_embeddings(
    texts: List[str],
    tokenizer: AutoTokenizer,
    model: AutoModel,
    device: torch.device,
    max_length: int = 512
) -> np.ndarray:
    """
    Generate VetBERT embeddings for a list of texts.
    
    Args:
        texts (List[str]): List of input texts
        tokenizer: VetBERT tokenizer
        model: VetBERT model
        device: Torch device
        max_length (int): Maximum sequence length
        
    Returns:
        np.ndarray: Text embeddings
        
    Raises:
        ValueError: If input texts are invalid
        RuntimeError: If embedding generation fails
    """
    try:
        if not texts:
            raise ValueError("Empty text list provided")
            
        # Tokenize the input texts
        inputs = tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length
        )
        
        # Move inputs to device
        inputs = {key: val.to(device) for key, val in inputs.items()}
        
        # Generate embeddings
        with torch.no_grad():
            outputs = model(**inputs)
            
        # Use CLS token embedding
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        embeddings = cls_embedding.cpu().numpy()
        
        logger.info(f"Generated embeddings for {len(texts)} texts")
        return embeddings
        
    except Exception as e:
        logger.error(f"Failed to generate embeddings: {str(e)}")
        raise RuntimeError(f"Embedding generation failed: {str(e)}")

In [7]:
def load_and_process_data(file_path: str) -> pd.DataFrame:
    """
    Load and preprocess the dataset.
    
    Args:
        file_path (str): Path to the CSV file
        
    Returns:
        pd.DataFrame: Processed clinical notes data with embeddings
        
    Raises:
        FileNotFoundError: If data file doesn't exist
        ValueError: If data processing fails
    """
    try:
        # Check file existence
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Data file not found: {file_path}")
            
        # Load data
        data = pd.read_csv(file_path)
        if data.empty:
            raise ValueError("Empty dataset")
            
        # Preprocess data
        data = data.applymap(lambda x: x.lower() if isinstance(x, str) else x)
        data_clinic = data[data['record_type'] == 'clinical notes']
        
        if data_clinic.empty:
            raise ValueError("No clinical notes found in dataset")
            
        # Generate embeddings
        texts_clinic = data_clinic['text'].tolist()
        embeddings = get_vetbert_embeddings(texts_clinic, tokenizer, model, device)
        
        # Add embeddings to DataFrame
        data_clinic['vetbert_vector'] = list(embeddings)
        
        logger.info(f"Processed {len(data_clinic)} clinical notes")
        return data_clinic
        
    except Exception as e:
        logger.error(f"Data processing failed: {str(e)}")
        raise

# Load and process data
try:
    data_clinic = load_and_process_data('Raw_Data/pet-health-symptoms-dataset.csv')
    print(f"Dataset shape: {data_clinic.shape}")
except Exception as e:
    logger.error(f"Failed to load and process data: {str(e)}")
    raise

  data = data.applymap(lambda x: x.lower() if isinstance(x, str) else x)
2025-09-12 17:04:33,639 - INFO - Generated embeddings for 1000 texts
2025-09-12 17:04:33,639 - INFO - Generated embeddings for 1000 texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clinic['vetbert_vector'] = list(embeddings)
2025-09-12 17:04:33,970 - INFO - Processed 1000 clinical notes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clinic['vetbert_vector'] = list(embeddings)
2025-09-12 17:04:33,970 - INFO - Processed 1000 clinical notes


Dataset shape: (1000, 4)


In [10]:
data_clinic['Dimension'] = data_clinic['vetbert_vector'].map(len)

In [11]:
# Extract the vector from the Series
vector = data_clinic.iloc[0]['vetbert_vector']

# Print the vector
print(len(vector))

768


In [13]:
data_clinic.groupby('Dimension').size().reset_index().iloc[0]['Dimension']

768

In [14]:
#save the data with the embeding vector
data_clinic.to_pickle('data_Clinic.pkl')

# Vector Data base

In [15]:
#read the vector csv
data_clinic=pd.read_pickle('data_Clinic.pkl')

In [17]:
# Adding special index to each row in a databse
converted_df = [
    {
        "id": idx + 1,
        "vector":row["vetbert_vector"],
        "text": row["text"],
        "condition": row["condition"]
    }
    for idx, row in data_clinic.iterrows()
]

In [18]:
def initialize_qdrant(
    host: str = "localhost",
    port: int = 6333,
    collection_name: str = "vet_notes",
    vector_size: int = None
) -> tuple:
    """
    Initialize Qdrant vector database.
    
    Args:
        host (str): Qdrant server host
        port (int): Qdrant server port
        collection_name (str): Name of the collection
        vector_size (int): Size of vectors
        
    Returns:
        tuple: (QdrantClient, collection_name)
        
    Raises:
        ConnectionError: If connection to Qdrant fails
        ValueError: If vector size is invalid
    """
    try:
        # Connect to Qdrant
        client = QdrantClient(url=f"http://{host}:{port}")
        
        # Validate vector size
        if not vector_size:
            vector_size = data_clinic.groupby('Dimension').size().reset_index().iloc[0]['Dimension']
        
        if vector_size <= 0:
            raise ValueError("Invalid vector size")
            
        # Create or recreate collection
        client.recreate_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(
                size=vector_size,
                distance=models.Distance.COSINE
            )
        )
        
        logger.info(f"Initialized Qdrant collection '{collection_name}' with vector size {vector_size}")
        return client, collection_name
        
    except Exception as e:
        logger.error(f"Failed to initialize Qdrant: {str(e)}")
        raise

# Initialize Qdrant
try:
    client, collection_name = initialize_qdrant()
except Exception as e:
    logger.error(f"Qdrant initialization failed: {str(e)}")
    raise

2025-09-12 17:07:57,374 - INFO - HTTP Request: GET http://localhost:6333 "HTTP/1.1 200 OK"
  client.recreate_collection(
2025-09-12 17:07:57,841 - INFO - HTTP Request: DELETE http://localhost:6333/collections/vet_notes "HTTP/1.1 200 OK"
2025-09-12 17:07:58,699 - INFO - HTTP Request: PUT http://localhost:6333/collections/vet_notes "HTTP/1.1 200 OK"
2025-09-12 17:07:58,701 - INFO - Initialized Qdrant collection 'vet_notes' with vector size 768


In [19]:
def prepare_points(data: List[Dict]) -> List[PointStruct]:
    """
    Prepare data points for Qdrant vector insertion.
    
    Args:
        data: List of dictionaries with keys 'id', 'vector', 'text', 'condition'
        
    Returns:
        List[PointStruct]: Prepared points for Qdrant
        
    Raises:
        ValueError: If input data is invalid
    """
    try:
        if not data:
            raise ValueError("Empty data provided")
            
        points = []
        for record in data:
            # Validate required fields
            required_fields = ['id', 'vector', 'text', 'condition']
            if not all(field in record for field in required_fields):
                raise ValueError(f"Missing required fields. Need {required_fields}")
                
            point = PointStruct(
                id=record['id'],
                vector=record['vector'],
                payload={
                    "text": record['text'],
                    "condition": record['condition']
                }
            )
            points.append(point)
            
        logger.info(f"Prepared {len(points)} points for insertion")
        return points
        
    except Exception as e:
        logger.error(f"Failed to prepare points: {str(e)}")
        raise

def insert_vectors(
    client: QdrantClient,
    collection_name: str,
    data: pd.DataFrame
) -> None:
    """
    Insert vectors into Qdrant collection.
    
    Args:
        client: Qdrant client
        collection_name: Name of the collection
        data: DataFrame with vector data
        
    Raises:
        RuntimeError: If vector insertion fails
    """
    try:
        # Convert DataFrame to list of dictionaries
        converted_df = [
            {
                "id": idx + 1,
                "vector": row["vetbert_vector"],
                "text": row["text"],
                "condition": row["condition"]
            }
            for idx, row in data.iterrows()
        ]
        
        # Prepare and insert points
        points = prepare_points(converted_df)
        client.upsert(collection_name=collection_name, points=points)
        
        logger.info(f"Successfully inserted {len(points)} vectors")
        
    except Exception as e:
        logger.error(f"Failed to insert vectors: {str(e)}")
        raise RuntimeError(f"Vector insertion failed: {str(e)}")

# Insert vectors into Qdrant
try:
    insert_vectors(client, collection_name, data_clinic)
    
    # Verify insertion
    collection_info = client.get_collection(collection_name=collection_name)
    print(f"Number of points in collection: {collection_info.points_count}")
except Exception as e:
    logger.error(f"Vector database population failed: {str(e)}")
    raise

2025-09-12 17:08:01,995 - INFO - Prepared 1000 points for insertion
2025-09-12 17:08:03,969 - INFO - HTTP Request: PUT http://localhost:6333/collections/vet_notes/points?wait=true "HTTP/1.1 200 OK"
2025-09-12 17:08:03,982 - INFO - Successfully inserted 1000 vectors
2025-09-12 17:08:04,039 - INFO - HTTP Request: GET http://localhost:6333/collections/vet_notes "HTTP/1.1 200 OK"


Number of points in collection: 1000


In [20]:
#Check the vectorDatabase created

collection_name = "vet_notes"

collection_info = client.get_collection(collection_name=collection_name)
print(f"Number of points in the collection: {collection_info.points_count}")

2025-09-12 17:09:34,997 - INFO - HTTP Request: GET http://localhost:6333/collections/vet_notes "HTTP/1.1 200 OK"


Number of points in the collection: 1000
