## Classification Code

## Imports

In [5]:
# Standard library
import os

# Open Image
from PIL import Image

# For text extraction
import pytesseract
import easyocr

# For EfficientNet
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input, decode_predictions
import cv2

# For Caption Generation
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch

from torchvision import transforms, models
import numpy as np

## Open Image and OCR

In [12]:
image_path = 'watch.png'

# Load the image as RGB; it is reused by the MobileNetV2 and BLIP cells below.
image = Image.open(image_path).convert("RGB")

# Locate the Tesseract binary. The TESSERACT_CMD environment variable takes
# precedence; otherwise the original Windows install location is tried.
# pytesseract's own default is only overridden when the candidate file really
# exists, so machines with Tesseract on PATH keep working unchanged.
tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r'C:\Users\djord\AppData\Local\Programs\Tesseract-OCR\tesseract.exe',
)
if os.path.isfile(tesseract_cmd):
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

# Text extraction with Tesseract (from the in-memory image) and EasyOCR (from the file path)
text_tesseract = pytesseract.image_to_string(image)
reader = easyocr.Reader(['en'])
results = reader.readtext(image_path)

# Each EasyOCR result carries the recognised text at index 1; join the
# fragments into a single space-separated string.
text_easyocr = ' '.join(result[1] for result in results)

## EfficientNet pre-trained on ImageNet

In [13]:
# Image classification with pretrained EfficientNet on ImageNet data
model = EfficientNetB0(weights='imagenet')

# cv2.imread returns None instead of raising when the file is missing or
# cannot be decoded, so fail here with a clear message rather than inside
# cvtColor.
img = cv2.imread(image_path)
if img is None:
    raise FileNotFoundError(f"Could not read image file: {image_path!r}")

# EfficientNet preprocessing: OpenCV loads BGR, so convert to RGB first.
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = cv2.resize(img, (224, 224))  # Resize to the input size of EfficientNetB0
img = preprocess_input(img)  # Preprocess according to EfficientNet requirements

# Expand dimensions to create a batch (expects batch input)
img = tf.expand_dims(img, axis=0)

# Make predictions on the image
predictions = model.predict(img)

# Decode the top-5 predicted classes and format each one as
# "label: probability" for the final report.
decoded_predictions = decode_predictions(predictions, top=5)[0]
efficientNet = [
    f'{class_label}: {probability:.2%}'
    for _, class_label, probability in decoded_predictions
]

## MobileNetV2 custom trained on "Retail Products Classification"

In [14]:
# Custom image classification
# Class labels, indexed by the classifier's output position.
# NOTE(review): assumed to be in the same order used during training — confirm.
class_labels = [
    'Arts, Crafts & Sewing', 'Cell Phones & Accessories', 'Clothing, Shoes & Jewelry',
    'Tools & Home Improvement', 'Health & Personal Care', 'Baby Products', 'Baby',
    'Patio, Lawn & Garden', 'Beauty', 'Sports & Outdoors', 'Electronics', 'All Electronics',
    'Automotive', 'Toys & Games', 'All Beauty', 'Office Products', 'Appliances',
    'Musical Instruments', 'Industrial & Scientific', 'Grocery & Gourmet Food', 'Pet Supplies'
]

# Build the MobileNetV2 architecture without pretrained weights; the custom
# checkpoint loaded below supplies them. `weights=None` is the supported way
# to request this — a boolean is only accepted through torchvision's
# deprecated legacy-argument handling.
model = torch.hub.load('pytorch/vision', 'mobilenet_v2', weights=None)  # Load MobileNetV2
num_features = model.classifier[1].in_features  # Get the number of input features for the classifier
model.classifier[1] = torch.nn.Linear(num_features, len(class_labels))  # One output per class label

# Load the custom-trained weights onto the CPU.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(torch.load('categorization.pth', map_location=torch.device('cpu')))
model.eval()  # Set the model to evaluation mode

# Preprocessing applied to the PIL image before inference.
# NOTE(review): the 100x100 size presumably matches the training resolution — confirm.
transform = transforms.Compose([
    transforms.Resize((100, 100)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
imageMobileNet = transform(image)
imageMobileNet = imageMobileNet.unsqueeze(0)  # Add batch dimension

# Make predictions
with torch.no_grad():
    logits = model(imageMobileNet)

# Convert logits to class probabilities for the single image in the batch
probs = torch.softmax(logits, dim=1).numpy()[0]

# Indices of the three most probable classes, highest first
top_classes = np.argsort(probs)[::-1][:3]
predicted_category = [class_labels[class_idx] for class_idx in top_classes]
predicted_probabilities = [probs[class_idx] for class_idx in top_classes]



Using cache found in C:\Users\djord/.cache\torch\hub\pytorch_vision_main


## Image captioning with BLIP-2 (OPT-2.7b checkpoint)
Smaller checkpoints can also be used; specify a different Hugging Face checkpoint name to load one.

In [15]:
# Image captioning with BLIP-2

# Can use saved model or download checkpoint from Salesforce.
#processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
#model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

local_model_dir = "./models"  # Local directory of model.

# Load the model and processor from the local directory
processor = Blip2Processor.from_pretrained(local_model_dir)
model = Blip2ForConditionalGeneration.from_pretrained(local_model_dir)

# NOTE(review): this prompt is defined but never passed to the processor, so
# the caption below is generated from the image alone.
text = "an image of"

# Use the GPU when one is available; otherwise fall back to the CPU instead of
# crashing on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Model and inputs must live on the same device.
inputs = processor(images=image, return_tensors="pt").to(device)
generated_ids = model.generate(**inputs)

# Decode the first (only) generated sequence into a plain-text caption.
caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



## Final Output

In [16]:

# Format the output: OCR text from both engines, the top-3 retail categories,
# the top-3 ImageNet classes, and the generated caption.
results = [
    f"Tesseract: {text_tesseract}",
    f"EasyOCR: {text_easyocr}",
    *(
        f"MobileNetV2(Categories): {category} ({probability:.2%})"
        for category, probability in zip(predicted_category[:3], predicted_probabilities[:3])
    ),
    # Entries of efficientNet are already formatted as "label: probability",
    # so no closing parenthesis is appended here.
    *(f"EfficientNet(ImageNet): {entry}" for entry in efficientNet[:3]),
    f"BLIP: {caption}",
]

print(results)

['Tesseract: ', 'EasyOCR: ', 'MobileNetV2(Categories): All Beauty (68.95%)', 'MobileNetV2(Categories): Toys & Games (12.68%)', 'MobileNetV2(Categories): Sports & Outdoors (9.19%)', 'EfficientNet(ImageNet): stopwatch: 55.24%)', 'EfficientNet(ImageNet): magnetic_compass: 9.21%)', 'EfficientNet(ImageNet): analog_clock: 6.13%)', 'BLIP: a black and white watch sitting on a wooden table']
