In [1]:
# Imports and Setup
import numpy as np
import os
import cv2
import pandas as pd
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from skimage.feature import local_binary_pattern
from sklearn.metrics.pairwise import cosine_similarity
import ast
import matplotlib.pyplot as plt
import zipfile

# Specify the path to the manually uploaded zip file
zip_file_path = "C:\Users\CODER\Downloads\archive.zip"  # Replace with the path to your zip file

# Extract the zip file to a specific directory
extracted_folder = "C:\Users\CODER\Downloads\archive_images"

# Unzip the uploaded file to the specified folder
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder)

# Create the image directory
# The following line was changed. It previously assumed images were in a nested 'images_data' folder
image_dir = "/content/images_data/images"  # Now directly points to the extraction folder

# Verify the extraction
print(f"Data extracted to: {image_dir}")
print("Files in the directory:", os.listdir(image_dir))

# Ensure images are available
if not os.path.exists(image_dir):
    raise FileNotFoundError(f"Could not find the image directory: {image_dir}")

FileNotFoundError: [Errno 2] No such file or directory: '/content/images.zip'

In [4]:
# CNN Feature Extraction Setup
def setup_cnn_model():
    # Load the VGG16 model without the top layer
    base_model = VGG16(weights='imagenet', include_top=False, input_shape=(128, 128, 3))
    cnn_model = Model(inputs=base_model.input, outputs=base_model.get_layer("block5_conv3").output)
    return cnn_model

def extract_cnn_features(image, cnn_model):
    # Resize image to 128x128
    image = cv2.resize(image, (128, 128))
    image = np.expand_dims(image, axis=0)
    image = image / 255.0  # Normalize image
    cnn_features = cnn_model.predict(image)
    return cnn_features.flatten()


In [5]:
# Micro-Structure Descriptor (MSD) - LBP Extraction
def extract_msd_features(image):
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    lbp = local_binary_pattern(gray_image, P=8, R=1, method="uniform")  # Apply LBP
    (hist, _) = np.histogram(lbp.ravel(), bins=np.arange(0, 11), range=(0, 10))  # LBP histogram
    hist = hist.astype("float")
    hist /= (hist.sum() + 1e-7)  # Normalize histogram
    return hist


In [6]:
# Hybrid Feature Extraction (Concatenate CNN and MSD Features)
def extract_hybrid_features(image, cnn_model):
    cnn_features = extract_cnn_features(image, cnn_model)
    msd_features = extract_msd_features(image)
    hybrid_features = np.concatenate((cnn_features, msd_features))  # Concatenate both feature sets
    return hybrid_features


In [40]:
from tqdm import tqdm  # Import tqdm for progress tracking

def create_feature_database(cnn_model, image_dir):
    import pandas as pd
    import os
    import numpy as np
    import cv2  # Ensure that OpenCV is imported

    database = []
    image_list = os.listdir(image_dir)

    # Initialize tqdm progress bar
    for i, image_name in enumerate(tqdm(image_list, desc="Processing Images")):
        image_path = os.path.join(image_dir, image_name)
        image = cv2.imread(image_path)

        if image is None:
            print(f"Warning: Unable to load image {image_name}. Skipping.")
            continue

        # Extract features using the hybrid model
        features = extract_hybrid_features(image, cnn_model)

        # Append image name and features to the database list
        database.append([image_name, features])

    # Save entire database to CSV after processing all images
    df = pd.DataFrame(database, columns=['image_name', 'features'])
    df.to_csv("feature_database.csv", index=False)

    # Print confirmation message for the saved file
    print(f"Saved the entire feature database to 'feature_database.csv' (processed {len(image_list)} images)")

In [33]:
import matplotlib.pyplot as plt
import cv2

def retrieve_similar_images(query_image, cnn_model, top_k=5):
    query_features = extract_hybrid_features(query_image, cnn_model).reshape(-1)  # Flatten to 1D
    print(f"Query feature vector size: {query_features.shape}")

    df = pd.read_csv("feature_database.csv")
    similarities = []

    for i, row in df.iterrows():
        db_features = np.fromstring(row['features'].strip('[]'), sep=',').reshape(-1)
        print(f"Database feature vector size for {row['image_name']}: {db_features.shape}")

        if db_features.shape != query_features.shape:
            print(f"Warning: Incompatible feature dimensions for {row['image_name']}. Skipping.")
            continue

        similarity = cosine_similarity([query_features], [db_features])[0][0]
        similarities.append((row['image_name'], similarity))

    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
    return [item[0] for item in similarities[:top_k]]


In [9]:
# Display Top Similar Images
def show_images(images):
    fig, axes = plt.subplots(1, len(images), figsize=(20, 5))
    for ax, image_name in zip(axes, images):
        # Load the image
        image_path = os.path.join(image_dir, image_name)
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert to RGB for displaying with matplotlib
        ax.imshow(image)
        ax.set_title(image_name)
        ax.axis("off")  # Turn off axis

    plt.show()


In [16]:
import pandas as pd
import glob

def combine_chunks_to_csv(output_file="feature_database.csv"):
    # Find all chunk files
    chunk_files = glob.glob("feature_database_chunk_*.csv")
    df_list = [pd.read_csv(chunk) for chunk in chunk_files]

    # Concatenate all chunks into one DataFrame
    combined_df = pd.concat(df_list, ignore_index=True)

    # Save the combined DataFrame as a single CSV file
    combined_df.to_csv(output_file, index=False)
    print(f"Combined all chunks into {output_file}")


In [30]:
def show_images(image_paths):
    if not image_paths:
        print("No images to display.")
        return

    num_images = len(image_paths)
    fig, axes = plt.subplots(1, num_images, figsize=(15, 5))

    if num_images == 1:
        axes = [axes]

    for ax, image_path in zip(axes, image_paths):
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        ax.imshow(image)
        ax.axis('off')

    plt.show()

In [41]:
if __name__ == "__main__":
    cnn_model = setup_cnn_model()
    create_feature_database(cnn_model, image_dir)
    #combine_chunks_to_csv()

    query_image_path = "/content/images_data/images/100030.jpeg"
    query_image = cv2.imread(query_image_path)
    top_5_similar_images = retrieve_similar_images(query_image, cnn_model, top_k=5)
    show_images(top_5_similar_images)

Processing Images:   0%|          | 0/5362 [00:00<?, ?it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 519ms/step


Processing Images:   0%|          | 1/5362 [00:00<53:52,  1.66it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 338ms/step


Processing Images:   0%|          | 2/5362 [00:01<44:52,  1.99it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 306ms/step


Processing Images:   0%|          | 3/5362 [00:01<41:52,  2.13it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step


Processing Images:   0%|          | 4/5362 [00:01<34:09,  2.61it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 190ms/step


Processing Images:   0%|          | 5/5362 [00:01<29:46,  3.00it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 182ms/step


Processing Images:   0%|          | 6/5362 [00:02<27:04,  3.30it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step


Processing Images:   0%|          | 7/5362 [00:02<24:54,  3.58it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step


Processing Images:   0%|          | 8/5362 [00:02<23:52,  3.74it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 188ms/step


Processing Images:   0%|          | 9/5362 [00:02<23:16,  3.83it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step


Processing Images:   0%|          | 10/5362 [00:03<22:50,  3.90it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step


Processing Images:   0%|          | 11/5362 [00:03<22:05,  4.04it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step


Processing Images:   0%|          | 12/5362 [00:03<21:53,  4.07it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step


Processing Images:   0%|          | 13/5362 [00:03<21:29,  4.15it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 191ms/step


Processing Images:   0%|          | 14/5362 [00:04<22:15,  4.00it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step


Processing Images:   0%|          | 15/5362 [00:04<21:44,  4.10it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step


Processing Images:   0%|          | 16/5362 [00:04<21:31,  4.14it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step


Processing Images:   0%|          | 17/5362 [00:04<21:16,  4.19it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step


Processing Images:   0%|          | 18/5362 [00:05<21:42,  4.10it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step


Processing Images:   0%|          | 19/5362 [00:05<21:34,  4.13it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step


Processing Images:   0%|          | 20/5362 [00:05<21:16,  4.19it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step


Processing Images:   0%|          | 21/5362 [00:05<21:07,  4.21it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step


Processing Images:   0%|          | 22/5362 [00:06<21:29,  4.14it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step


Processing Images:   0%|          | 23/5362 [00:06<21:22,  4.16it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step


Processing Images:   0%|          | 24/5362 [00:06<21:17,  4.18it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 182ms/step


Processing Images:   0%|          | 25/5362 [00:06<21:21,  4.17it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step


Processing Images:   0%|          | 26/5362 [00:07<21:53,  4.06it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step


Processing Images:   1%|          | 27/5362 [00:07<22:02,  4.03it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step


Processing Images:   1%|          | 28/5362 [00:07<21:40,  4.10it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step


Processing Images:   1%|          | 29/5362 [00:07<21:24,  4.15it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step


Processing Images:   1%|          | 30/5362 [00:07<21:25,  4.15it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 191ms/step


Processing Images:   1%|          | 31/5362 [00:08<22:03,  4.03it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step


Processing Images:   1%|          | 32/5362 [00:08<21:40,  4.10it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step


Processing Images:   1%|          | 33/5362 [00:08<21:29,  4.13it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step


Processing Images:   1%|          | 34/5362 [00:08<21:24,  4.15it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 198ms/step


Processing Images:   1%|          | 35/5362 [00:09<21:50,  4.06it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step


Processing Images:   1%|          | 36/5362 [00:09<21:32,  4.12it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step


Processing Images:   1%|          | 37/5362 [00:09<21:19,  4.16it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step


Processing Images:   1%|          | 38/5362 [00:09<21:16,  4.17it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 198ms/step


Processing Images:   1%|          | 39/5362 [00:10<21:48,  4.07it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step


Processing Images:   1%|          | 40/5362 [00:10<23:18,  3.81it/s]


KeyboardInterrupt: 