### 1. Import required libraries

In [None]:
import cv2
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
import os
from tqdm import tqdm

### 2. Feature Extraction (Dense SIFT)

In [None]:
def extract_dense_sift_features(image_path, step=8, size=16):
    """
    Compute SIFT descriptors on a regular grid of keypoints over an image.

    The image is loaded as grayscale and one keypoint of diameter `size` is
    placed every `step` pixels, scanning rows top to bottom and, within each
    row, columns left to right.

    Args:
        image_path: Path of the image file to read.
        step: Spacing in pixels between neighbouring grid keypoints.
        size: Keypoint size handed to `cv2.KeyPoint`.

    Returns:
        The descriptors produced by `sift.compute` for the grid keypoints,
        or None when `cv2.imread` could not load the image.
    """
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        # Unreadable / non-image file: signal the caller to skip it.
        return None

    height = gray.shape[0]
    width = gray.shape[1]

    # Build the dense grid in row-major order (y outer, x inner).
    grid_keypoints = []
    for row in range(0, height, step):
        for col in range(0, width, step):
            grid_keypoints.append(cv2.KeyPoint(col, row, size))

    extractor = cv2.SIFT_create()
    # compute() returns (keypoints, descriptors); only descriptors are needed.
    return extractor.compute(gray, grid_keypoints)[1]

### 3. Build the K-means Codebook

In [1]:
def build_codebook(data_path, n_clusters=500, sample_size=100000, random_state=42):
    """
    Build a visual vocabulary using K-means clustering.

    Every immediate sub-directory of `data_path` is treated as a class folder.
    Dense SIFT descriptors are extracted from every readable file inside those
    folders, a random subset of at most `sample_size` descriptors is drawn,
    and K-means is fitted on that subset.

    Args:
        data_path: Root directory containing one sub-directory per class.
        n_clusters: Number of visual words (K-means clusters).
        sample_size: Maximum number of descriptors used to fit K-means; all
            descriptors are used when fewer are available.
        random_state: Integer seed (or None for a non-deterministic run) used
            both for the descriptor sub-sampling and for K-means, so that the
            whole codebook is reproducible.

    Returns:
        The fitted `KMeans` estimator.

    Raises:
        ValueError: If no descriptors could be extracted from `data_path`.
    """
    # os.listdir returns entries in arbitrary order; sort so the traversal
    # (and therefore the sampled subset) is reproducible across runs/machines.
    class_folders = sorted(
        path
        for path in (os.path.join(data_path, entry) for entry in os.listdir(data_path))
        if os.path.isdir(path)
    )

    all_descriptors = []
    for folder in class_folders:
        for image_file in tqdm(sorted(os.listdir(folder))):
            image_path = os.path.join(folder, image_file)
            descriptors = extract_dense_sift_features(image_path)
            # Skip unreadable files and images that produced no descriptors.
            if descriptors is not None and len(descriptors) > 0:
                all_descriptors.append(descriptors)

    # np.vstack([]) fails with an unhelpful message; report the real problem.
    if not all_descriptors:
        raise ValueError(
            f"No descriptors could be extracted from images under {data_path!r}"
        )

    # Stack all descriptors into a single array
    all_descriptors = np.vstack(all_descriptors)
    total = all_descriptors.shape[0]
    print(f"Total descriptors: {total}")

    # Randomly sample up to `sample_size` descriptors (all of them if the
    # dataset is smaller), using a seeded generator to match K-means below.
    rng = np.random.default_rng(random_state)
    sample_idx = rng.choice(total, min(sample_size, total), replace=False)
    sampled_descriptors = all_descriptors[sample_idx]

    # Apply K-means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, verbose=1)
    kmeans.fit(sampled_descriptors)

    return kmeans

### 4. Quantize Features and Build Histograms

In [None]:
def quantize_and_create_histograms(data_path, kmeans):
    """
    Quantize image features using the codebook and create histograms.

    Every immediate sub-directory of `data_path` is treated as a class folder.
    Class folders are visited in sorted order and labelled 0, 1, 2, ... in
    that order, so two dataset roots with the same class-folder names (e.g.
    train and test) receive the same label for the same class.

    Args:
        data_path: Root directory containing one sub-directory per class.
        kmeans: Fitted codebook exposing `n_clusters` and `predict`.

    Returns:
        A tuple `(histograms, labels)`: `histograms` has shape
        `(n_images, kmeans.n_clusters)` and each row is the normalized
        visual-word frequency of one image; `labels` has shape `(n_images,)`
        and holds the integer class index of each image. Files that cannot be
        read, or that yield no descriptors, are skipped.
    """
    n_clusters = kmeans.n_clusters
    # One unit-width bin per visual word: edges 0, 1, ..., n_clusters.
    bin_edges = np.arange(n_clusters + 1)

    # os.listdir order is arbitrary; sorting makes the label assigned to each
    # class deterministic and consistent between different dataset roots.
    class_folders = sorted(
        path
        for path in (os.path.join(data_path, entry) for entry in os.listdir(data_path))
        if os.path.isdir(path)
    )

    histograms = []
    labels = []
    for label, folder in enumerate(class_folders):
        for image_file in tqdm(sorted(os.listdir(folder))):
            image_path = os.path.join(folder, image_file)
            descriptors = extract_dense_sift_features(image_path)
            # A density histogram over zero descriptors would be all-NaN.
            if descriptors is None or len(descriptors) == 0:
                continue

            # Quantize descriptors to nearest cluster centers
            cluster_assignments = kmeans.predict(descriptors)

            # With unit-width bins, density=True yields frequencies summing to 1.
            histogram, _ = np.histogram(cluster_assignments, bins=bin_edges, density=True)
            histograms.append(histogram)
            labels.append(label)

    if not histograms:
        # Keep a well-formed 2-D feature matrix even when nothing was found.
        return np.empty((0, n_clusters), dtype=float), np.empty(0, dtype=int)

    return np.array(histograms), np.array(labels)


### 5. Full implementation example

In [None]:
# End-to-end example: fit the codebook on the training set, then encode both
# the training and the testing set as visual-word histograms.
if __name__ == "__main__":
    # Placeholder dataset roots; replace both with real paths. Each root is
    # scanned for one sub-directory per class by the functions called below.
    train_data_path = "path_to_training_data"
    test_data_path = "path_to_testing_data"

    # Step 1: Build the visual vocabulary (fitted on the training data only)
    print("Building the visual vocabulary...")
    kmeans = build_codebook(train_data_path, n_clusters=500, sample_size=100000)

    # Step 2: Quantize features and create histograms.
    # The same fitted codebook is reused for both splits.
    print("Creating histograms for training data...")
    train_histograms, train_labels = quantize_and_create_histograms(train_data_path, kmeans)

    print("Creating histograms for testing data...")
    test_histograms, test_labels = quantize_and_create_histograms(test_data_path, kmeans)

    # Report the shapes of the resulting histogram arrays.
    print(f"Train histograms shape: {train_histograms.shape}")
    print(f"Test histograms shape: {test_histograms.shape}")
