In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import cv2
import numpy as np
from tqdm import tqdm

**LOADING DATASET**

In [3]:
DATASET_PATH = '/content/drive/My Drive/Dataset'

In [4]:
class_folders = os.listdir(DATASET_PATH)
print(f"Found folders: {class_folders}")

for folder in class_folders:
  folder_path = os.path.join(DATASET_PATH, folder)

  if os.path.isdir(folder_path):
    count = len(os.listdir(folder_path))
    print(f"{folder}: {count} images")

Found folders: ['INDIA50OLD', 'INDIA200', 'INDIA2000', 'INDIA500', 'INDIA50NEW', 'INDIA10OLD', 'INDIA10NEW', 'INDIA100NEW', 'INDIA20', 'INDIA100OLD']
INDIA50OLD: 200 images
INDIA200: 200 images
INDIA2000: 200 images
INDIA500: 200 images
INDIA50NEW: 200 images
INDIA10OLD: 200 images
INDIA10NEW: 200 images
INDIA100NEW: 200 images
INDIA20: 200 images
INDIA100OLD: 200 images


**PREPROCESSING**

In [5]:
original_image_paths = []
original_labels = []

for folder in class_folders:
    folder_path = os.path.join(DATASET_PATH, folder)
    if not os.path.isdir(folder_path):
        continue

    label = folder
    folder_images = os.listdir(folder_path)

    for filename in folder_images:
        img_path = os.path.join(folder_path, filename)
        original_image_paths.append(img_path)
        original_labels.append(label)

print(f"Total original images: {len(original_image_paths)}")

Total original images: 2000


**FEATURE EXTRACTION**

In [6]:
from sklearn.model_selection import train_test_split

X_paths_train_orig, X_paths_test_orig, y_train, y_test = train_test_split(
    original_image_paths,
    original_labels,
    test_size=0.2,
    random_state=42,
    stratify=original_labels)

X_paths_train_final = [(path, 'original') for path in X_paths_train_orig]
X_paths_test_final = [(path, 'original') for path in X_paths_test_orig]

In [9]:
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans

sift = cv2.SIFT_create()
orb = cv2.ORB_create(nfeatures=200)

VOCAB_SIZE_SIFT = 25000
VOCAB_SIZE_ORB = 25000
IMG_SIZE = 150

In [10]:
def improved_preprocess_image(image_path, img_size=150):
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    resized = cv2.resize(image, (img_size, img_size))
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
    return clahe.apply(resized)

In [11]:
def extract_descriptors(image_paths, sift_detector, orb_detector, max_desc=100):

    sift_descriptors = []
    orb_descriptors = []

    for img_path, aug_type in tqdm(image_paths, desc='Extract descriptors'):
        try:
            image = improved_preprocess_image(img_path[0], IMG_SIZE) if isinstance(img_path, tuple) else improved_preprocess_image(img_path, IMG_SIZE)
            if image is None:
                continue

            # SIFT descriptors
            kp_sift, des_sift = sift_detector.detectAndCompute(image, None)
            if des_sift is not None and len(des_sift) > max_desc:
                des_sift = des_sift[:max_desc]
            if des_sift is not None:
                sift_descriptors.extend(des_sift)

            # ORB descriptors
            kp_orb, des_orb = orb_detector.detectAndCompute(image, None)
            if des_orb is not None and len(des_orb) > max_desc:
                des_orb = des_orb[:max_desc]
            if des_orb is not None:
                orb_descriptors.extend(des_orb)

        except Exception as e:
            print(f"Error extracting from {img_path}: {e}")

    return np.array(sift_descriptors, dtype=np.float32), np.array(orb_descriptors, dtype=np.uint8)

In [12]:
sift_desc, orb_desc = extract_descriptors(X_paths_train_final, sift, orb)

print(f"SIFT descriptors shape: {sift_desc.shape}")
print(f"ORB descriptors shape: {orb_desc.shape}")

sift_kmeans = MiniBatchKMeans(n_clusters=VOCAB_SIZE_SIFT, random_state=42, batch_size=2048)
sift_kmeans.fit(sift_desc)

orb_kmeans = MiniBatchKMeans(n_clusters=VOCAB_SIZE_ORB, random_state=42, batch_size=2048)
orb_kmeans.fit(orb_desc)

Extract descriptors: 100%|██████████| 1600/1600 [09:25<00:00,  2.83it/s]


SIFT descriptors shape: (159983, 128)
ORB descriptors shape: (159736, 32)


In [13]:
def create_combined_histogram(image, sift_vocab, orb_vocab, sift_detector, orb_detector):

    hist_sift = np.zeros(len(sift_vocab.cluster_centers_))
    hist_orb = np.zeros(len(orb_vocab.cluster_centers_))
    kp_sift, des_sift = sift_detector.detectAndCompute(image, None)
    if des_sift is not None and len(des_sift) > 0:
        des_sift = des_sift.astype(np.float32)
        words = sift_vocab.predict(des_sift)
        for w in words:
            hist_sift[w] += 1
    kp_orb, des_orb = orb_detector.detectAndCompute(image, None)
    if des_orb is not None and len(des_orb) > 0:
        words = orb_vocab.predict(des_orb)
        for w in words:
            hist_orb[w] += 1
    return np.concatenate([hist_sift, hist_orb])

In [14]:
X_train_features = []

for img_path, aug_type in tqdm(X_paths_train_final, desc="Creating training histograms"):
    img = improved_preprocess_image(img_path, IMG_SIZE)
    h = create_combined_histogram(img, sift_kmeans, orb_kmeans, sift, orb)
    X_train_features.append(h)

X_test_features = []

for img_path, aug_type in tqdm(X_paths_test_final, desc="Creating test histograms"):
    img = improved_preprocess_image(img_path, IMG_SIZE)
    h = create_combined_histogram(img, sift_kmeans, orb_kmeans, sift, orb)
    X_test_features.append(h)

X_train = np.array(X_train_features)
X_test = np.array(X_test_features)

print(f"Training features shape (SIFT+ORB fusion): {X_train.shape}")
print(f"Test features shape (SIFT+ORB fusion): {X_test.shape}")

pca = PCA(n_components=0.95, svd_solver='full', random_state=42)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

print(f"Feature shapes after PCA: Train {X_train.shape}, Test {X_test.shape}")

Creating training histograms: 100%|██████████| 1600/1600 [03:22<00:00,  7.90it/s]
Creating test histograms: 100%|██████████| 400/400 [02:46<00:00,  2.40it/s]


Training features shape (SIFT+ORB fusion): (1600, 50000)
Test features shape (SIFT+ORB fusion): (400, 50000)
Feature shapes after PCA: Train (1600, 1444), Test (400, 1444)


**ML MODELLING**

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

param_grid = {
    'C': [1, 10, 50, 100],
    'kernel': ['linear', 'rbf'],
    }

svc = SVC(probability=True, random_state=42)

grid_search = GridSearchCV(svc, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)

print("Best hyperparameters found:", grid_search.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best hyperparameters found: {'C': 1, 'kernel': 'linear'}


In [17]:
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\n Accuracy: {accuracy * 100:.2f}%")
print(f"\nF1 Score: {f1 * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))


 Accuracy: 90.25%

F1 Score: 90.26%

Classification Report:
              precision    recall  f1-score   support

 INDIA100NEW       0.90      0.90      0.90        40
 INDIA100OLD       0.86      0.93      0.89        40
  INDIA10NEW       0.92      0.90      0.91        40
  INDIA10OLD       0.85      0.88      0.86        40
     INDIA20       0.84      0.93      0.88        40
    INDIA200       0.88      0.93      0.90        40
   INDIA2000       0.97      0.85      0.91        40
    INDIA500       0.92      0.90      0.91        40
  INDIA50NEW       0.97      0.85      0.91        40
  INDIA50OLD       0.93      0.97      0.95        40

    accuracy                           0.90       400
   macro avg       0.91      0.90      0.90       400
weighted avg       0.91      0.90      0.90       400



**SAVING THE MODEL**

In [18]:
import joblib
import pickle

In [19]:
# Saving SVM Model
joblib.dump(best_model, 'sift_orb_svm_model.pkl')


# Saving Vocabularies
with open('sift_vocab.pkl', 'wb') as f:
    pickle.dump(sift_kmeans, f)

with open('orb_vocab.pkl', 'wb') as f:
    pickle.dump(orb_kmeans, f)


# Saving PCA & Scaler Objects
joblib.dump(pca, 'pca_model.pkl')
joblib.dump(scaler, 'scaler_model.pkl')

print("✅ All files saved successfully.")

✅ All files saved successfully.
