In [None]:
# Course: CSC 2611-131 AI Tools
# Fall 2024
# Final Project – Dog Classifier
# Name: Theresa Kettner
# Created: 12/05/2024

import os
import numpy as np
from PIL import Image
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

def _extract_color_features(img_array, bins=32):
    """
    Build the color-statistics feature vector for a single RGB image array.

    Args:
        img_array: numpy array indexed as [row, column, channel] with 3 channels
        bins: Number of histogram bins per color channel

    Returns:
        1-D numpy array of length 6 + 3 * bins: per-channel means, per-channel
        standard deviations, then the R, G and B histograms (raw pixel counts)
    """
    # Average color and color spread per channel (3 values each)
    color_means = img_array.mean(axis=(0, 1))
    color_stds = img_array.std(axis=(0, 1))
    # One histogram per channel over the 0..255 intensity range
    histograms = [
        np.histogram(img_array[:, :, channel], bins=bins, range=(0, 256))[0]
        for channel in range(3)
    ]
    return np.concatenate([color_means, color_stds, *histograms])


def load_image_dataset(data_path, subset='train', image_size=(224, 224), hist_bins=32):
    """
    Load images from the specified directory and convert to feature vectors.

    Every sub-directory of ``data_path/subset`` is treated as one class (dog
    breed); its name is used as the label for every image inside it. Each image
    is converted to RGB, resized to ``image_size`` and summarised by per-channel
    mean, per-channel standard deviation and a per-channel color histogram.
    Files that cannot be read as images are reported on stdout and skipped.

    Args:
        data_path: Base path to data directory
        subset: One of 'train', 'test', or 'valid'
        image_size: (width, height) every image is resized to before the
            features are computed
        hist_bins: Number of histogram bins per color channel

    Returns:
        features: numpy array with one row of 6 + 3 * hist_bins color features
            per successfully loaded image
        labels: numpy array of class labels (breed directory names), aligned
            with the rows of ``features``

    Raises:
        FileNotFoundError: if ``data_path/subset`` does not exist
    """
    subset_path = os.path.join(data_path, subset)
    features = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore k-NN tie-breaking downstream) reproducible.
    breeds = sorted(os.listdir(subset_path))

    # Load images for each breed
    for breed in tqdm(breeds, desc=f'Loading {subset} data'):
        breed_path = os.path.join(subset_path, breed)
        if not os.path.isdir(breed_path):
            continue

        for img_name in sorted(os.listdir(breed_path)):
            img_path = os.path.join(breed_path, img_name)
            try:
                # The context manager closes the underlying file handle
                # promptly instead of leaving it to garbage collection.
                with Image.open(img_path) as img:
                    rgb_image = img.convert('RGB').resize(image_size)
                feature_vector = _extract_color_features(
                    np.array(rgb_image), bins=hist_bins
                )
            except Exception as e:
                # Deliberately best-effort: one unreadable file must not abort
                # the whole load, so report it and move on.
                print(f"Error loading {img_path}: {e}")
                continue

            features.append(feature_vector)
            labels.append(breed)

    return np.array(features), np.array(labels)

# Load datasets
# NOTE: machine-specific absolute path; adjust to where the dataset lives.
data_path = 'C:\\00\\70-dog-breedsimage-data-set-updated'
X_train, y_train = load_image_dataset(data_path, 'train')
X_valid, y_valid = load_image_dataset(data_path, 'valid')
X_test, y_test = load_image_dataset(data_path, 'test')

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_valid.shape}")
print(f"Test set shape: {X_test.shape}")

# Scale the features
# The scaler is fitted on the training split only, so validation/test
# statistics never leak into the model.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Try different k values and keep the one with the best validation accuracy
k_values = [1, 3, 5, 7, 11]
best_k = None
best_accuracy = 0

for k in k_values:
    print(f"\nTraining with k={k}")
    knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    knn.fit(X_train_scaled, y_train)

    # Evaluate on validation set
    valid_accuracy = knn.score(X_valid_scaled, y_valid)
    print(f"Validation accuracy: {valid_accuracy:.3f}")

    # Always accept the first candidate: otherwise a run where every k scores
    # 0.0 would leave best_k as None and the final model could not be built.
    # Ties keep the earlier (smaller) k, as before.
    if best_k is None or valid_accuracy > best_accuracy:
        best_accuracy = valid_accuracy
        best_k = k

print(f"\nBest k value: {best_k}")

# Train final model with best k
final_knn = KNeighborsClassifier(n_neighbors=best_k, n_jobs=-1)
final_knn.fit(X_train_scaled, y_train)

# Evaluate on test set (held out from both fitting and k selection)
test_predictions = final_knn.predict(X_test_scaled)
print("\nTest Set Performance:")
print(classification_report(y_test, test_predictions))

Loading train data: 100%|██████████| 40/40 [00:24<00:00,  1.65it/s]
Loading valid data: 100%|██████████| 40/40 [00:02<00:00, 19.61it/s]
Loading test data: 100%|██████████| 40/40 [00:02<00:00, 16.07it/s]


Training set shape: (4600, 102)
Validation set shape: (400, 102)
Test set shape: (400, 102)

Training with k=1
Validation accuracy: 0.210

Training with k=3
Validation accuracy: 0.163

Training with k=5
Validation accuracy: 0.145

Training with k=7
Validation accuracy: 0.128

Training with k=11
Validation accuracy: 0.110

Best k value: 1

Test Set Performance:
                   precision    recall  f1-score   support

           Afghan       0.29      0.20      0.24        10
American Hairless       0.50      0.30      0.37        10
 American Spaniel       0.20      0.20      0.20        10
           Basset       0.08      0.10      0.09        10
           Beagle       0.50      0.60      0.55        10
       Bloodhound       0.00      0.00      0.00        10
    Border Collie       0.44      0.70      0.54        10
   Boston Terrier       0.50      0.20      0.29        10
            Boxer       0.40      0.20      0.27        10
     Bull Terrier       0.60      0.30      0.