In [1]:
import os
import cv2
import numpy as np
from skimage.feature import hog
from skimage import exposure
from skimage.transform import rotate
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from skimage.feature import hog
from skimage import exposure
from skimage.color import rgb2gray
from skimage.transform import rotate
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer

# "class": algorithms.Blowfish,   <- stray fragment, not valid Python in this cell; commented out so the imports cell can run


In [2]:
def extract_hog_features(img):
    """Return a flat HOG descriptor for ``img``.

    Parameters
    ----------
    img : ndarray
        Image array. Arrays with more than two dimensions are converted to
        grayscale with ``cv2.COLOR_RGB2GRAY`` before the descriptor is computed.

    Returns
    -------
    ndarray
        1-D array holding the HOG descriptor after it has been passed through
        ``exposure.rescale_intensity`` with ``in_range=(0, 10)``.
    """
    # Convert the image to grayscale if it's not already
    if img.ndim > 2:
        img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Only the descriptor is used downstream, so the HOG visualisation image
    # (previously requested with visualize=True and thrown away) is no longer
    # rendered. The descriptor itself is the same.
    hog_features = hog(img, orientations=8, pixels_per_cell=(8, 8),
                       cells_per_block=(2, 2), block_norm='L2-Hys')

    # Rescaling is kept exactly as before so the feature values do not change.
    hog_features_rescaled = exposure.rescale_intensity(hog_features, in_range=(0, 10))

    return hog_features_rescaled.flatten()

def extract_color_features(img):
    """Average ``img`` over its first two axes.

    For an array of shape (rows, cols, channels) this yields one mean value
    per channel.
    """
    spatial_axes = (0, 1)
    return np.mean(img, axis=spatial_axes)

def augment_image(img, max_angle=10, rng=None):
    """Return a copy of ``img`` rotated by a random angle.

    Parameters
    ----------
    img : ndarray
        Image to rotate.
    max_angle : float, optional
        The angle is drawn uniformly from ``[-max_angle, max_angle)`` degrees.
        Defaults to 10, the value that used to be hard-coded.
    rng : optional
        Object exposing ``uniform(low, high)`` (for example a
        ``numpy.random.Generator``). Pass a seeded generator for reproducible
        augmentation. When omitted, the global ``np.random`` state is used,
        as before.

    Returns
    -------
    ndarray
        Rotated image cast to ``np.uint8``.
    """
    if rng is None:
        rng = np.random  # legacy global RNG keeps the default behaviour unchanged

    angle = rng.uniform(-max_angle, max_angle)
    # preserve_range=True keeps the input value range instead of rescaling to [0, 1]
    augmented_img = rotate(img, angle, mode='reflect', preserve_range=True).astype(np.uint8)
    return augmented_img


In [3]:
# Path to the dataset
data_dir = r"Images"  # Replace with the actual path

# Lists to store features and labels
features = []
labels = []

# NOTE(review): every augmented sample is stored next to the image it was
# derived from, and the train/validation/test split later works on individual
# rows. Near-duplicates of one image can therefore land on both sides of the
# split and inflate the reported accuracies.

# Loop through each class (breed) in the dataset; sorted() makes the order
# independent of the file system.
for breed_folder in sorted(os.listdir(data_dir)):
    breed_path = os.path.join(data_dir, breed_folder)

    # Stray files next to the breed folders would make os.listdir() fail below
    if not os.path.isdir(breed_path):
        continue

    # Loop through each image in the breed folder
    for img_name in sorted(os.listdir(breed_path)):
        img_path = os.path.join(breed_path, img_name)

        # cv2.imread returns None (it does not raise) when a file cannot be decoded
        img = cv2.imread(img_path)
        if img is None:
            print(f"Skipping unreadable image: {img_path}")
            continue

        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert to RGB

        # Resize so every image yields a feature vector of the same length
        img = cv2.resize(img, (128, 128))

        # Features of the original image
        hog_features = extract_hog_features(img)
        color_features = extract_color_features(img)
        features.append(np.concatenate((hog_features, color_features)))
        labels.append(breed_folder)

        # Features of one randomly rotated copy, stored under the same label
        augmented_img = augment_image(img)
        augmented_hog_features = extract_hog_features(augmented_img)
        augmented_color_features = extract_color_features(augmented_img)
        features.append(np.concatenate((augmented_hog_features, augmented_color_features)))
        labels.append(breed_folder)

# Convert lists to NumPy arrays
features = np.array(features)
labels = np.array(labels)

In [4]:
# Shuffle the dataset. A dedicated, seeded generator makes the order identical
# on every fresh "Restart & Run All" instead of depending on the global
# np.random state.
shuffle_rng = np.random.default_rng(42)
indices = shuffle_rng.permutation(features.shape[0])

# The same permutation is applied to both arrays so rows stay aligned
features = features[indices]
labels = labels[indices]

In [5]:
# Map each breed name to an integer id; the fitted encoder is kept so that
# predictions can be turned back into breed names later.
label_encoder = LabelEncoder().fit(labels)
encoded_labels = label_encoder.transform(labels)

In [6]:
# 60 % of the rows go to training; the held-out 40 % is then halved into
# validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    features, encoded_labels, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Standardize with statistics learned from the training rows only, then apply
# that same transform to all three splits.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (scaler.transform(part) for part in (X_train, X_val, X_test))

In [None]:
# Logistic regression tuned with a 3-fold grid search over C and max_iter.
# `multi_class` is left at its default: 'auto' was the default anyway, and
# recent scikit-learn releases deprecate passing the argument explicitly.
param_grid = {'C': [0.1, 1, 10], 'max_iter': [500, 1000, 2000]}
logreg_model = GridSearchCV(LogisticRegression(solver='lbfgs'), param_grid, cv=3)
logreg_model.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = logreg_model.predict(X_val)

# Decode the predicted labels back to breed names
decoded_val_predictions = label_encoder.inverse_transform(y_val_pred)

# Make predictions on the test set
y_test_pred = logreg_model.predict(X_test)
# Decode the predicted labels back to breed names
decoded_test_predictions = label_encoder.inverse_transform(y_test_pred)
# Evaluate the accuracy on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}")

# Evaluate the accuracy on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}")

print("Best parameters found by grid search:", logreg_model.best_params_)

In [None]:
# Train a logistic regression model on the full training set using the
# hyperparameters selected by the grid search. `multi_class` is left at its
# default ('auto' was the default; the argument is deprecated in recent
# scikit-learn releases).
final_logreg_model = LogisticRegression(solver='lbfgs', **logreg_model.best_params_)
final_logreg_model.fit(X_train, y_train)

In [None]:
# Evaluate the final model on the test set. The predictions are computed here
# from final_logreg_model rather than reusing y_test_pred, which came from the
# grid-search object; y_test_pred itself is left untouched.
final_test_pred = final_logreg_model.predict(X_test)
accuracy = accuracy_score(y_test, final_test_pred)
print(f"Accuracy: {accuracy * 100:.2f}")
print("Best parameters found by grid search:", logreg_model.best_params_)

In [None]:
# X_train and X_val were already standardized when the data was split, so they
# are reused as-is. Fitting a second StandardScaler here would overwrite
# `scaler` with one fitted on already-scaled data, which could no longer
# transform raw features correctly.
X_train_scaled = X_train
X_val_scaled = X_val

# Lists to store accuracy and the iteration cap (max_iter) used for each fit
accuracies = []
iterations = []

# Vary the max_iter parameter. Small caps are expected to stop the solver
# before convergence; that is the point of the curve.
for max_iter in range(1, 101, 5):
    # `multi_class` is left at its default ('auto'); passing it explicitly is
    # deprecated in recent scikit-learn releases.
    model = LogisticRegression(max_iter=max_iter, solver='lbfgs', random_state=42)
    model.fit(X_train_scaled, y_train)

    # Loop-local names keep y_val_pred / accuracy from earlier cells intact
    curve_val_pred = model.predict(X_val_scaled)
    curve_accuracy = accuracy_score(y_val, curve_val_pred)

    accuracies.append(curve_accuracy)
    iterations.append(max_iter)

# Plot the learning curve
plt.plot(iterations, accuracies, marker='o')
plt.title('Learning Curve for Logistic Regression')
plt.xlabel('Number of Iterations')
plt.ylabel('Validation Accuracy')
plt.grid(True)
plt.show()

In [None]:
# Create confusion matrix. Passing every encoded class id explicitly keeps the
# matrix the same size as the tick-label list even when a breed is absent from
# both y_test and y_test_pred; otherwise rows/columns would shift against the
# labels.
class_ids = np.arange(len(label_encoder.classes_))
conf_matrix = confusion_matrix(y_test, y_test_pred, labels=class_ids)

# Plot the confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

In [None]:
# Calculate a one-vs-rest ROC curve for each class on the validation set
num_classes = len(np.unique(encoded_labels))
fpr_val = dict()
tpr_val = dict()
roc_auc_val = dict()

classifier = final_logreg_model

# The probabilities do not depend on the class being scored, so compute them
# once instead of once per loop iteration.
val_proba = classifier.predict_proba(X_val)

# predict_proba columns follow classifier.classes_; look the column up instead
# of assuming column i belongs to class i (that breaks if a class never
# appeared in y_train).
proba_column = {int(label): col for col, label in enumerate(classifier.classes_)}

for i in range(num_classes):
    y_true = (y_val == i)
    col = proba_column.get(i)
    # A class the classifier never saw gets a constant score of zero
    y_score = val_proba[:, col] if col is not None else np.zeros(len(y_val))
    fpr_val[i], tpr_val[i], _ = roc_curve(y_true, y_score)
    roc_auc_val[i] = auc(fpr_val[i], tpr_val[i])

In [None]:
# Draw one ROC curve per class (validation set) plus the chance diagonal
fig, ax = plt.subplots(figsize=(10, 8))
for class_idx in range(num_classes):
    curve_label = f'Class {class_idx} (AUC = {roc_auc_val[class_idx]:.2f})'
    ax.plot(fpr_val[class_idx], tpr_val[class_idx], label=curve_label)

ax.plot([0, 1], [0, 1], 'k--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve for Each Class (Validation Set)')
ax.legend()
plt.show()

In [None]:
# Number of clusters (k); adjust as needed
k = 5

# Stack the training and test rows into one matrix for clustering
all_features = np.concatenate((X_train, X_test), axis=0)

# Fit K-Means on the combined rows
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(all_features)

# Assign every training and test row to its nearest centroid
train_cluster_labels, test_cluster_labels = (
    kmeans.predict(part) for part in (X_train, X_test)
)

# Show the assignments of the first ten rows of each split
print("Training Set Cluster Labels:", train_cluster_labels[:10])
print("Testing Set Cluster Labels:", test_cluster_labels[:10])


In [None]:
# Stack training and test rows in the same order used for the cluster labels
all_features = np.vstack([X_train, X_test])

# Project the combined rows onto their first two principal components
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(all_features)

# One row per sample: its 2-D projection and its K-Means cluster id
cluster_ids = np.concatenate([train_cluster_labels, test_cluster_labels])
pca_df = pd.DataFrame(pca_result[:, :2], columns=['PCA1', 'PCA2']).assign(Cluster=cluster_ids)

# Scatter plot of the projection, coloured by cluster
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=pca_df, palette='viridis', alpha=0.7, ax=ax)
ax.set_title('K-Means Clustering Visualization')
plt.show()