In [None]:
# Data Preparation and Understanding - Assignment 1
# [Your Name]
# [Date]

# Import necessary libraries
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import xml.etree.ElementTree as ET
from skimage import io, color, filters, feature, exposure
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import json

# Set up paths
# NOTE: these are absolute, machine-specific Windows locations. Later cells
# re-assign the same constants to the same values before using them.
IMAGE_DIR = r'C:\Users\David\programmingAssignment1\Images'
ANNOTATION_DIR = r'C:\Users\David\programmingAssignment1\Annotation'
OUTPUT_DIR = r'C:\Users\David\programmingAssignment1\output'

# Function to get bounding boxes from XML annotation
def get_bounding_boxes(annot):
    """Return the bounding boxes listed in an XML annotation file.

    Parameters
    ----------
    annot : str or file object
        Anything accepted by ``ET.parse``.

    Returns
    -------
    list of tuple
        One ``(xmin, ymin, xmax, ymax)`` tuple of ints per ``<object>``
        element found directly under the document root, in document order.
    """
    corner_tags = ('xmin', 'ymin', 'xmax', 'ymax')
    boxes = []
    for obj in ET.parse(annot).getroot().findall('object'):
        box_node = obj.find('bndbox')
        # Each corner is stored as the text of a child element of <bndbox>.
        boxes.append(tuple(int(box_node.find(tag).text) for tag in corner_tags))
    return boxes

# Confirmation message for the setup cell; it does not check that the
# configured directories exist.
print("Setup complete. Ready to start the assignment tasks.")

In [None]:
import os

def add_xml_extension(annotation_dir):
    """Append ``.xml`` to every extension-less file below ``annotation_dir``.

    The tree is walked recursively with ``os.walk``. A file is treated as
    extension-less when its name contains no ``.`` character at all; each
    such file is renamed in place. Files whose name already contains a dot
    are left untouched.
    """
    for folder, _subdirs, names in os.walk(annotation_dir):
        bare_names = [name for name in names if '.' not in name]
        for name in bare_names:
            source = os.path.join(folder, name)
            os.rename(source, source + '.xml')

# Specify the path to your annotation directory
# (same value as the ANNOTATION_DIR assigned in the setup cell)
ANNOTATION_DIR = r'C:\Users\David\programmingAssignment1\Annotation'

# Add the .xml extension to all annotation files
# NOTE: this renames files on disk. Only names without any '.' are renamed,
# so running the cell again after a successful run renames nothing further.
add_xml_extension(ANNOTATION_DIR)


In [None]:
import os
from PIL import Image
import xml.etree.ElementTree as ET

def get_bounding_boxes_and_filename(annot):
    """Read one XML annotation and return its bounding boxes and image name.

    Parameters
    ----------
    annot : str or file object
        Anything accepted by ``ET.parse``.

    Returns
    -------
    tuple
        ``(boxes, filename)`` where ``boxes`` is a list of
        ``(xmin, ymin, xmax, ymax)`` int tuples (one per ``<object>``) and
        ``filename`` is the text of the ``<filename>`` element. If anything
        goes wrong while reading the file, a message is printed and
        ``([], None)`` is returned instead of raising.
    """
    corner_tags = ('xmin', 'ymin', 'xmax', 'ymax')
    try:
        annotation_root = ET.parse(annot).getroot()

        # Text of the <filename> element (e.g., n02085620_7)
        image_name = annotation_root.find('filename').text

        # One tuple of corner coordinates per <object> element
        boxes = []
        for obj in annotation_root.findall('object'):
            box_node = obj.find('bndbox')
            corners = tuple(int(box_node.find(tag).text) for tag in corner_tags)
            boxes.append(corners)
    except Exception as e:
        print(f"Error parsing annotation file {annot}: {str(e)}")
        return [], None
    return boxes, image_name

def crop_and_resize_images(image_dir, annotation_dir, output_dir, target_size=(128, 128)):
    """Crop each annotated bounding box out of its image and save a resized copy.

    For every sub-folder that exists in both ``annotation_dir`` and
    ``image_dir``, each ``.xml`` annotation is parsed with
    ``get_bounding_boxes_and_filename``. The image named by the annotation's
    ``<filename>`` value (plus ``.jpg``) is opened, every bounding box is
    cropped, resized to ``target_size`` with LANCZOS resampling, and written to
    ``output_dir/<sub-folder>/<image name>_<box index>.jpg``.

    Annotations that cannot be parsed (or carry no filename), missing images,
    unreadable images and failing boxes are reported with ``print`` and
    skipped; they do not stop the run.

    Parameters
    ----------
    image_dir : str
        Directory containing one sub-folder of images per breed.
    annotation_dir : str
        Directory containing one sub-folder of XML annotations per breed.
    output_dir : str
        Destination directory; created if it does not exist.
    target_size : tuple, optional
        Size passed to ``Image.resize``. Defaults to ``(128, 128)``.

    Returns
    -------
    None
    """
    print("Starting image processing...")
    print(f"Image directory: {image_dir}")
    print(f"Annotation directory: {annotation_dir}")
    print(f"Output directory: {output_dir}")

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")

    processed_count = 0
    xml_count = 0

    # Traverse subdirectories in the annotations folder
    for breed_dir in os.listdir(annotation_dir):
        breed_annotation_dir = os.path.join(annotation_dir, breed_dir)
        breed_image_dir = os.path.join(image_dir, breed_dir)

        # Check if both image and annotation subfolders exist
        if not os.path.isdir(breed_annotation_dir) or not os.path.isdir(breed_image_dir):
            continue

        print(f"\nProcessing breed: {breed_dir}")
        print(f"Annotation subfolder: {breed_annotation_dir}")
        print(f"Image subfolder: {breed_image_dir}")

        # Output keeps the breed sub-folder structure of the input.
        output_subdir = os.path.join(output_dir, breed_dir)

        for filename in os.listdir(breed_annotation_dir):
            if not filename.lower().endswith('.xml'):
                continue

            xml_count += 1
            annot_path = os.path.join(breed_annotation_dir, filename)
            print(f"Found annotation: {annot_path}")

            # Get bounding boxes and the filename from the XML file
            bbox, img_filename = get_bounding_boxes_and_filename(annot_path)

            # The parser returns ([], None) on failure, and an empty
            # <filename> element also yields None; skip instead of crashing
            # on `None + '.jpg'`.
            if not img_filename:
                print(f"Skipping annotation without a filename: {annot_path}")
                continue

            # Append the .jpg extension to the filename extracted from XML
            img_filename += '.jpg'
            img_path = os.path.join(breed_image_dir, img_filename)

            # Check if the image file exists
            if not os.path.exists(img_path):
                print(f"Image not found for annotation: {img_filename}")
                continue

            print(f"Using image file: {img_path}")

            try:
                # The context manager closes the underlying file once all
                # boxes of this image have been handled.
                with Image.open(img_path) as img:
                    for i, (xmin, ymin, xmax, ymax) in enumerate(bbox):
                        try:
                            cropped_img = img.crop((xmin, ymin, xmax, ymax))
                            resized_img = cropped_img.resize(target_size, Image.LANCZOS)

                            # Created lazily so breeds without any saved
                            # crop do not leave an empty folder behind.
                            os.makedirs(output_subdir, exist_ok=True)

                            output_filename = f"{os.path.splitext(img_filename)[0]}_{i}.jpg"
                            output_path = os.path.join(output_subdir, output_filename)
                            resized_img.save(output_path)
                            processed_count += 1
                            print(f"Saved: {output_path}")
                        except Exception as e:
                            print(f"Error processing bounding box {i} for {img_filename}: {str(e)}")
            except OSError as e:
                # Unreadable / unidentifiable image file: report and move on.
                print(f"Error opening image {img_path}: {str(e)}")

    print(f"\nProcessing completed. {processed_count} images saved to: {output_dir}")
    print(f"Total XML files found: {xml_count}")

# Use the function with the correct paths
# (re-assigns the same absolute, machine-specific paths as the setup cell)
IMAGE_DIR = r'C:\Users\David\programmingAssignment1\Images'
ANNOTATION_DIR = r'C:\Users\David\programmingAssignment1\Annotation'
OUTPUT_DIR = r'C:\Users\David\programmingAssignment1\output'

# Writes one 128x128 crop per bounding box (the function's default target_size).
crop_and_resize_images(IMAGE_DIR, ANNOTATION_DIR, OUTPUT_DIR)


In [None]:
import os
from PIL import Image
import xml.etree.ElementTree as ET

def get_bounding_boxes_and_filename(annot):
    """Parse an XML annotation into ``(bounding_boxes, image_filename)``.

    ``bounding_boxes`` is a list with one ``(xmin, ymin, xmax, ymax)`` tuple
    of ints per ``<object>`` element; ``image_filename`` is the text of the
    ``<filename>`` element. Any exception raised while reading the file is
    reported with ``print`` and ``([], None)`` is returned.
    """
    def _read_int(node, tag):
        # Coordinates are stored as the text of child elements of <bndbox>.
        return int(node.find(tag).text)

    try:
        root = ET.parse(annot).getroot()

        # Text of the <filename> element (e.g., n02085782_2)
        filename = root.find('filename').text

        box_nodes = (obj.find('bndbox') for obj in root.findall('object'))
        bbox = [
            (
                _read_int(node, 'xmin'),
                _read_int(node, 'ymin'),
                _read_int(node, 'xmax'),
                _read_int(node, 'ymax'),
            )
            for node in box_nodes
        ]

        return bbox, filename
    except Exception as e:
        print(f"Error parsing annotation file {annot}: {str(e)}")
        return [], None

def crop_and_resize_images(image_dir, annotation_dir, output_dir, target_size=(128, 128)):
    """Crop each annotated bounding box out of its image and save a resized copy.

    NOTE: this cell repeats the definition from the previous cell; whichever
    cell runs last is the one in effect.

    For every sub-folder that exists in both ``annotation_dir`` and
    ``image_dir``, each ``.xml`` annotation is parsed with
    ``get_bounding_boxes_and_filename``. The image named by the annotation's
    ``<filename>`` value (plus ``.jpg``) is opened, every bounding box is
    cropped, resized to ``target_size`` with LANCZOS resampling, and written to
    ``output_dir/<sub-folder>/<image name>_<box index>.jpg``.

    Annotations that cannot be parsed (or carry no filename), missing images,
    unreadable images and failing boxes are reported with ``print`` and
    skipped; they do not stop the run.

    Parameters
    ----------
    image_dir : str
        Directory containing one sub-folder of images per breed.
    annotation_dir : str
        Directory containing one sub-folder of XML annotations per breed.
    output_dir : str
        Destination directory; created if it does not exist.
    target_size : tuple, optional
        Size passed to ``Image.resize``. Defaults to ``(128, 128)``.

    Returns
    -------
    None
    """
    print("Starting image processing...")
    print(f"Image directory: {image_dir}")
    print(f"Annotation directory: {annotation_dir}")
    print(f"Output directory: {output_dir}")

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")

    processed_count = 0
    xml_count = 0

    # Traverse subdirectories in the annotations folder
    for breed_dir in os.listdir(annotation_dir):
        breed_annotation_dir = os.path.join(annotation_dir, breed_dir)
        breed_image_dir = os.path.join(image_dir, breed_dir)

        # Check if both image and annotation subfolders exist
        if not os.path.isdir(breed_annotation_dir) or not os.path.isdir(breed_image_dir):
            continue

        print(f"\nProcessing breed: {breed_dir}")
        print(f"Annotation subfolder: {breed_annotation_dir}")
        print(f"Image subfolder: {breed_image_dir}")

        # Output keeps the breed sub-folder structure of the input.
        output_subdir = os.path.join(output_dir, breed_dir)

        for filename in os.listdir(breed_annotation_dir):
            # Detect annotation files (assuming they have .xml extension)
            if not filename.lower().endswith('.xml'):
                continue

            xml_count += 1
            annot_path = os.path.join(breed_annotation_dir, filename)
            print(f"Found annotation: {annot_path}")

            # Get bounding boxes and the filename from the XML file
            bbox, img_filename = get_bounding_boxes_and_filename(annot_path)

            # The parser returns ([], None) on failure, and an empty
            # <filename> element also yields None; skip instead of crashing
            # on `None + '.jpg'`.
            if not img_filename:
                print(f"Skipping annotation without a filename: {annot_path}")
                continue

            # Append the .jpg extension to the filename extracted from XML
            img_filename += '.jpg'
            img_path = os.path.join(breed_image_dir, img_filename)

            # Check if the image file exists
            if not os.path.exists(img_path):
                print(f"Image not found for annotation: {img_filename}")
                continue

            print(f"Using image file: {img_path}")

            try:
                # The context manager closes the underlying file once all
                # boxes of this image have been handled.
                with Image.open(img_path) as img:
                    for i, (xmin, ymin, xmax, ymax) in enumerate(bbox):
                        try:
                            cropped_img = img.crop((xmin, ymin, xmax, ymax))
                            resized_img = cropped_img.resize(target_size, Image.LANCZOS)

                            # Created lazily so breeds without any saved
                            # crop do not leave an empty folder behind.
                            os.makedirs(output_subdir, exist_ok=True)

                            output_filename = f"{os.path.splitext(img_filename)[0]}_{i}.jpg"
                            output_path = os.path.join(output_subdir, output_filename)
                            resized_img.save(output_path)
                            processed_count += 1
                            print(f"Saved: {output_path}")
                        except Exception as e:
                            print(f"Error processing bounding box {i} for {img_filename}: {str(e)}")
            except OSError as e:
                # Unreadable / unidentifiable image file: report and move on.
                print(f"Error opening image {img_path}: {str(e)}")

    print(f"\nProcessing completed. {processed_count} images saved to: {output_dir}")
    print(f"Total XML files found: {xml_count}")

# Use the function with the correct paths
# NOTE(review): this repeats the call made in the previous cell with the same
# three paths, so it produces the same output file names again.
IMAGE_DIR = r'C:\Users\David\programmingAssignment1\Images'
ANNOTATION_DIR = r'C:\Users\David\programmingAssignment1\Annotation'
OUTPUT_DIR = r'C:\Users\David\programmingAssignment1\output'

crop_and_resize_images(IMAGE_DIR, ANNOTATION_DIR, OUTPUT_DIR)


In [None]:
def create_edge_histogram(image_path, num_bins=36):
    """Build a histogram of edge orientations for one image.

    The image is read as grayscale, the vertical and horizontal Sobel
    responses are combined with ``arctan2`` into a per-pixel angle, and the
    angles are folded with ``mod pi`` into ``[0, pi)`` before being binned.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.
    num_bins : int, optional
        Number of histogram bins requested from ``exposure.histogram``.

    Returns
    -------
    The first element returned by ``exposure.histogram`` (the bin counts).

    NOTE(review): no fixed value range is passed to ``exposure.histogram``,
    so the bin edges presumably follow each image's observed angle range --
    confirm this is acceptable before comparing histograms across images.
    """
    img = io.imread(image_path, as_gray=True)
    # Each Sobel response is computed exactly once (the original also stored
    # an extra sobel_v result that was immediately overwritten).
    vertical_response = filters.sobel_v(img)
    horizontal_response = filters.sobel_h(img)
    edge_angles = np.mod(np.arctan2(vertical_response, horizontal_response), np.pi)
    hist, _ = exposure.histogram(edge_angles, nbins=num_bins)
    return hist

def plot_image_with_histogram(image_path, hist):
    """Show an image side by side with a bar chart of ``hist``.

    Parameters
    ----------
    image_path : str
        Path of the image to display in the left panel.
    hist : sequence
        Bar heights for the right panel; bar ``k`` is drawn at x position ``k``.
    """
    figure, (image_ax, hist_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: the picture itself, without axes
    picture = io.imread(image_path)
    image_ax.imshow(picture)
    image_ax.set_title('Original Image')
    image_ax.axis('off')

    # Right panel: one bar per histogram bin
    bin_positions = range(len(hist))
    hist_ax.bar(bin_positions, hist)
    hist_ax.set_xlabel('Bins')
    hist_ax.set_ylabel('Pixel Count')
    hist_ax.set_title('Edge Histogram')

    plt.tight_layout()
    plt.show()

def calculate_distances(hist1, hist2):
    """Return ``(euclidean, manhattan, cosine)`` distances between two histograms.

    Each histogram is wrapped in a one-element list so the pairwise functions
    see a single sample each; the only entry of the resulting matrix is
    taken as the distance.
    """
    first_sample, second_sample = [hist1], [hist2]
    metrics = (euclidean_distances, manhattan_distances, cosine_distances)
    return tuple(metric(first_sample, second_sample)[0][0] for metric in metrics)

# Example usage:
# The cropping cell saves its results under OUTPUT_DIR/<breed>/, so the images
# are collected recursively from there instead of from a flat
# 'processed_images' folder that no cell in this notebook creates.
processed_dir = OUTPUT_DIR
image_paths = sorted(
    os.path.join(dirpath, name)
    for dirpath, _, names in os.walk(processed_dir)
    for name in names
    if name.endswith('.jpg')
)[:4]  # sorted for a reproducible selection; first four images only

histograms = [create_edge_histogram(img_path) for img_path in image_paths]

for i, (img_path, hist) in enumerate(zip(image_paths, histograms)):
    print(f"Image {i+1}")
    plot_image_with_histogram(img_path, hist)

# Compare two histograms
if len(histograms) >= 2:
    hist1, hist2 = histograms[0], histograms[1]
    euclidean, manhattan, cosine = calculate_distances(hist1, hist2)
    print(f"Euclidean distance: {euclidean}")
    print(f"Manhattan distance: {manhattan}")
    print(f"Cosine distance: {cosine}")
else:
    # Avoid an IndexError when fewer than two images were found.
    print(f"Need at least two images in {processed_dir} to compare histograms.")

In [None]:
import os
import matplotlib.pyplot as plt
from skimage import io, feature

def visualize_hog(image_path):
    """Display a grayscale image next to its HOG visualisation.

    The image is read as grayscale and passed to ``feature.hog`` with
    8 orientations, 16x16-pixel cells and 1x1-cell blocks; the visualisation
    image returned by ``hog`` is shown in the right panel.
    """
    gray_image = io.imread(image_path, as_gray=True)
    hog_options = dict(
        orientations=8,
        pixels_per_cell=(16, 16),
        cells_per_block=(1, 1),
        visualize=True,
    )
    _descriptor, hog_rendering = feature.hog(gray_image, **hog_options)

    _figure, axes = plt.subplots(1, 2, figsize=(12, 6), sharex=True, sharey=True)

    panels = ((gray_image, 'Input image'), (hog_rendering, 'HOG Visualization'))
    for ax, (pixels, title) in zip(axes, panels):
        ax.axis('off')
        ax.imshow(pixels, cmap=plt.cm.gray)
        ax.set_title(title)

    plt.tight_layout()
    plt.show()

# Directory that holds the cropped images written earlier
OUTPUT_DIR = r'C:\Users\David\programmingAssignment1\output'

# Gather the path of every saved .jpg anywhere below the output directory
image_paths = []
for root, dirs, files in os.walk(OUTPUT_DIR):
    image_paths.extend(os.path.join(root, file) for file in files if file.endswith('.jpg'))

# Show the first five collected paths as a quick sanity check
print("Sample image paths:", image_paths[:5])

# HOG is visualised for the first image only; nothing to show if none were found
if not image_paths:
    print("No images found in image_paths.")
else:
    visualize_hog(image_paths[0])


In [None]:
def perform_pca_and_visualize(histograms, labels):
    """Project histograms onto two principal components and scatter-plot them.

    Parameters
    ----------
    histograms : array-like
        One feature vector per sample, passed to ``PCA.fit_transform``.
    labels : sequence
        One value per sample, used as the scatter colour (``c=``).
    """
    projected = PCA(n_components=2).fit_transform(histograms)
    first_component = projected[:, 0]
    second_component = projected[:, 1]

    plt.figure(figsize=(10, 8))
    points = plt.scatter(first_component, second_component, c=labels, cmap='viridis')
    plt.colorbar(points)
    plt.title('PCA of Edge Histograms')
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.show()

# Build one edge histogram and one class label per processed image.
# Paths are collected recursively and joined with their directory, because a
# bare file name from os.listdir cannot be opened from another working
# directory and the cropped images may sit in per-breed sub-folders.
pca_image_paths = sorted(
    os.path.join(dirpath, name)
    for dirpath, _, names in os.walk(processed_dir)
    for name in names
    if name.endswith('.jpg')
)
all_histograms = [create_edge_histogram(img_path) for img_path in pca_image_paths]

# The class key is the part of the file name before the first underscore.
class_keys = [os.path.basename(img_path).split('_')[0] for img_path in pca_image_paths]
try:
    # Purely numeric prefixes are used as labels directly.
    labels = [int(key) for key in class_keys]
except ValueError:
    # Non-numeric prefixes (e.g. 'n02085620') are mapped to integer codes so
    # they can be used as scatter colours.
    class_codes = {key: code for code, key in enumerate(sorted(set(class_keys)))}
    labels = [class_codes[key] for key in class_keys]

if len(all_histograms) >= 2:
    perform_pca_and_visualize(all_histograms, labels)
else:
    # A 2-component PCA needs at least two samples.
    print(f"Need at least two images in {processed_dir} for PCA; found {len(all_histograms)}.")

In [None]:
def load_and_process_text_data(file_path):
    """Load the training texts and labels from a JSON file.

    The file must contain an object with a ``'train'`` entry holding a list
    of items, each with a ``'text'`` and a ``'label'`` key.

    Parameters
    ----------
    file_path : str
        Path of the JSON file.

    Returns
    -------
    tuple of (list, list)
        ``(texts, labels)`` in the order the items appear under ``'train'``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    KeyError
        If ``'train'``, ``'text'`` or ``'label'`` is missing.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is not guaranteed to be UTF-8 (notably on Windows).
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    train_items = data['train']
    texts = [item['text'] for item in train_items]
    labels = [item['label'] for item in train_items]
    return texts, labels

def vectorize_and_reduce(texts, vectorizer):
    """Vectorise ``texts`` and reduce the result to two principal components.

    ``vectorizer`` is fitted on ``texts`` as a side effect (via
    ``fit_transform``); its output is converted with ``.toarray()`` before
    being handed to a 2-component PCA. Returns the PCA-transformed data.
    """
    dense_matrix = vectorizer.fit_transform(texts).toarray()
    return PCA(n_components=2).fit_transform(dense_matrix)

def plot_reduced_data(reduced_data, labels, title):
    """Scatter-plot the first two columns of ``reduced_data`` coloured by ``labels``.

    Parameters
    ----------
    reduced_data : array
        Two-dimensional array; column 0 is drawn on x, column 1 on y.
    labels : sequence
        One value per row, used as the scatter colour (``c=``).
    title : str
        Figure title.
    """
    x_values = reduced_data[:, 0]
    y_values = reduced_data[:, 1]

    plt.figure(figsize=(10, 8))
    points = plt.scatter(x_values, y_values, c=labels, cmap='viridis')
    plt.colorbar(points)
    plt.title(title)
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.show()

# Load and process text data
# NOTE(review): TEXT_DATA_PATH is not assigned in any of the cells shown in
# this notebook; it must be defined before this cell runs or this line raises
# NameError. `labels` also rebinds the name used for the image labels in the
# PCA cell above.
texts, labels = load_and_process_text_data(TEXT_DATA_PATH)

# CountVectorizer
# vectorize_and_reduce fits the vectorizer, so the feature names queried
# below are only available after this call.
count_vectorizer = CountVectorizer()
count_reduced = vectorize_and_reduce(texts, count_vectorizer)
plot_reduced_data(count_reduced, labels, 'PCA of Count Vectors')

# Number of feature names reported by the fitted vectorizer
print(f"Dimensionality of Count Vectors: {count_vectorizer.get_feature_names_out().shape[0]}")

# TfidfVectorizer
# Same pipeline as above with a TF-IDF vectorizer.
tfidf_vectorizer = TfidfVectorizer()
tfidf_reduced = vectorize_and_reduce(texts, tfidf_vectorizer)
plot_reduced_data(tfidf_reduced, labels, 'PCA of TF-IDF Vectors')

# Number of feature names reported by the fitted vectorizer
print(f"Dimensionality of TF-IDF Vectors: {tfidf_vectorizer.get_feature_names_out().shape[0]}")

# Analysis of separability
def analyze_separability(reduced_data, labels):
    """Count class pairs whose axis-aligned bounding boxes do not overlap.

    For every unordered pair of distinct labels, the two classes are counted
    as separable when, on the first or the second coordinate, the value range
    of one class lies strictly above or strictly below the range of the other.

    Parameters
    ----------
    reduced_data : array-like, shape (n_samples, >=2)
        Sample coordinates; only the first two columns are used. May be a
        NumPy array or a nested list.
    labels : array-like, shape (n_samples,)
        Class label of each sample.

    Returns
    -------
    int
        Number of separable class pairs (0 when there are fewer than two
        distinct labels).
    """
    points = np.asarray(reduced_data)
    label_array = np.asarray(labels)
    unique_labels = np.unique(label_array)

    # Compute each class's (min, max) over the first two coordinates once,
    # instead of rebuilding the masks for every pair.
    bounds = []
    for label in unique_labels:
        members = points[label_array == label][:, :2]
        bounds.append((members.min(axis=0), members.max(axis=0)))

    separable_count = 0
    for i in range(len(bounds)):
        low_i, high_i = bounds[i]
        for j in range(i + 1, len(bounds)):
            low_j, high_j = bounds[j]
            # Separated if, on either axis, one range ends before the other starts.
            if np.any(low_i > high_j) or np.any(high_i < low_j):
                separable_count += 1
    return separable_count

# Report, for each vectorizer's 2-D projection, how many class pairs
# analyze_separability counts as separable.
print(f"Number of separable class pairs (Count Vectors): {analyze_separability(count_reduced, labels)}")
print(f"Number of separable class pairs (TF-IDF Vectors): {analyze_separability(tfidf_reduced, labels)}")