Welcome to assignment 1.                                                       

We are using pathology images for our first assignment please download data from this link https://drive.google.com/drive/folders/10dUOzcPR-PQwfFYcHk5gsLjIjSorQ32Q?usp=sharing



# Task 1: Feature Generation (15%)
# Use and run the following code (a deep network) to generate features from a set of training images. For this assignment, you do not need to know how the deep network is working here to extract features.
# This code extracts the features of image T4.tif (in the T folder of dataset). Modify the code so that it iterates over all images of the dataset and extracts their features.
# Allocate 10% of the data for validation.

# Insert your code here for Task 1





In [1]:
import os
import random
import torch
import torchvision.transforms as transforms
from torchvision.models import densenet121
from torch.autograd import Variable
from PIL import Image
from sklearn.model_selection import train_test_split

# Set the path to the dataset folder — modify to match the local dowload location of the dataset on your machine.
dataset_path = "train"

# List to store image paths and labels
all_image_paths = []
all_labels = []

# Iterate over labeled folders (A to T)
for label in os.listdir(dataset_path):
    label_folder = os.path.join(dataset_path, label)
    
    # Iterate over images in each labeled folder
    for image_name in os.listdir(label_folder):
        image_path = os.path.join(label_folder, image_name)
        all_image_paths.append(image_path)
        all_labels.append(label)

# Split the data into training and validation sets (90% training, 10% validation)
train_image_paths, val_image_paths, train_labels, val_labels = train_test_split(
    all_image_paths, all_labels, test_size=0.1, random_state=42
)

# Load pre-trained DenseNet model
model = densenet121(pretrained=True)

# Remove the classification layer (last fully connected layer)
model = torch.nn.Sequential(*list(model.children())[:-1])

# Add a global average pooling layer
model.add_module('global_avg_pool', torch.nn.AdaptiveAvgPool2d(1))

# Set the model to evaluation mode
model.eval()

# Define the image preprocessing pipeline
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Function to extract features for a given image path
def extract_features(image_path):
    image = Image.open(image_path)
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)
    input_var = Variable(input_batch)
    features = model(input_var)
    feature_vector = features.squeeze().detach().numpy()
    return feature_vector


# Extract features for training set
train_features = [extract_features(image_path) for image_path in train_image_paths]

# Extract features for validation set
val_features = [extract_features(image_path) for image_path in val_image_paths]

# Now 'train_features' and 'val_features' contain the features from the last fully connected layer of DenseNet
print("Training set feature vectors shape:", len(train_features))
print("Validation set feature vectors shape:", len(val_features))

# Print the first few feature vectors and labels for training set
print("Training set feature vectors:")
for i in range(min(5, len(train_features))):
    print(f"Instance {i+1}: {train_features[i]} - Label: {train_labels[i]}")

# Print the first few feature vectors and labels for validation set
print("\nValidation set feature vectors:")
for i in range(min(5, len(val_features))):
    print(f"Instance {i+1}: {val_features[i]} - Label: {val_labels[i]}")



Training set feature vectors shape: 702
Validation set feature vectors shape: 78
Training set feature vectors:
Instance 1: [ 5.6815566e-04  5.7970881e-03  2.6177713e-03 ...  5.1698887e-01
  4.0688884e-01 -5.7975239e-01] - Label: D
Instance 2: [ 4.7536640e-04  1.0388358e-02  2.8466193e-03 ...  1.4883754e+00
 -1.6424663e-02 -6.4177150e-01] - Label: A
Instance 3: [ 5.2169157e-04  1.2616204e-02  1.1426058e-04 ... -2.1618327e-01
 -2.1554305e-01 -2.0022346e-01] - Label: I
Instance 4: [ 9.1784765e-05  7.4622752e-03  1.3906002e-03 ...  8.3241981e-01
 -6.6252053e-01 -1.7127241e-01] - Label: L
Instance 5: [ 8.1051076e-05  1.0962947e-03  5.3642306e-04 ... -4.1521978e-01
  9.8362891e-03  1.8082334e-02] - Label: E

Validation set feature vectors:
Instance 1: [ 2.95502279e-04  2.00712564e-03 -1.01288955e-04 ... -2.95696110e-01
  4.95025903e-01  1.83987677e-01] - Label: E
Instance 2: [0.00052962 0.00701606 0.00043407 ... 0.20975111 0.01167713 0.289521  ] - Label: E
Instance 3: [ 4.0282198e-04  4.8080

# Task 2: High Bias Classification Method (5%)
# Choose a classification method and let is have a high bias.
# Train it on the generated features and discuss why it is underfitting.

# Insert your code here for Task 2




In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Assuming 'train_features' and 'train_labels' are the features and labels for training set
# Assuming 'val_features' and 'val_labels' are the features and labels for validation set

# Convert labels to integers (assuming labels are strings)
label_to_int = {label: idx for idx, label in enumerate(set(train_labels))}
train_labels_int = [label_to_int[label] for label in train_labels]
val_labels_int = [label_to_int[label] for label in val_labels]

# Use K-Means for classification
kmeans = KMeans(n_clusters=len(set(train_labels)), random_state=42)
kmeans.fit(train_features)

# Predict cluster assignments for validation set
val_predictions = kmeans.predict(val_features)

# Convert cluster assignments to labels
cluster_to_label = {cluster: label for label, cluster in label_to_int.items()}
val_labels_pred = [cluster_to_label[cluster] for cluster in val_predictions]

#print(val_labels_int, val_predictions)

# Calculate accuracy
accuracy = accuracy_score(val_labels_int, val_predictions)

# Discuss why it might be underfitting
print(f"Accuracy: {accuracy:.2%}")
print("K-Means is a simple algorithm that assumes spherical clusters with equal variance.")
print("It may underfit when the underlying data distribution is non-linear or has varying cluster shapes.")


Accuracy: 0.00%
K-Means is a simple algorithm that assumes spherical clusters with equal variance.
It may underfit when the underlying data distribution is non-linear or has varying cluster shapes.


# Task 3: High Variance Classification Method (5%)
# Use the chosen classification method and let it have a high variance.
# Train it on the generated features and discuss why it is overfitting.

# Insert your code here for Task 3




# Task 4: Balanced Classification Method (15%)
# Use the chosen classification method and let it balance the bias and variance.
# Train it on the generated features, possibly adjusting parameters.
# Discuss insights into achieving balance.

# Insert your code here for Task 4




# Task 5: K-Means Clustering (20%)
# Apply K-Means clustering on the generated features.
# Test with available labels and report accuracy.
# Experiment with automated K and compare with manually set 20 clusters.

# Insert your code here for Task 5




In [3]:
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import homogeneity_completeness_v_measure

letter_to_int = {
    "A": 0,
    "B": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5,
    "G": 6,
    "H": 7,
    "I": 8,
    "J": 9,
    "K": 10,
    "L": 11,
    "M": 12,
    "N": 13,
    "O": 14,
    "P": 15,
    "Q": 16,
    "R": 17,
    "S": 18,
    "T": 19
}

def print_cluster_results(homogeneity_completeness_v_score):
    print("The homogeneity is " + str(round(homogeneity_completeness_v_score[0], 2)))
    print("The completeness is " + str(round(homogeneity_completeness_v_score[1], 2)))
    print("The v_score is " + str(round(homogeneity_completeness_v_score[2], 2)))

# Run KMeans algorithm with 20 clusters as we are have 20 classes
kmeans = KMeans(n_clusters=20, random_state=0, n_init="auto").fit(train_features)
predicitions = kmeans.predict(train_features)

int_train_labels = [letter_to_int[label] for label in train_labels]

homogeneity_completeness_v_score = homogeneity_completeness_v_measure(predicitions, int_train_labels)

print("Using KMeans where k=20:")
print_cluster_results(homogeneity_completeness_v_score)
print()


# Allow KMeans algorithm to automatically determine the number of classes
kmeans = KMeans(random_state=0, n_init="auto").fit(train_features)
predicitions = kmeans.predict(train_features)

int_train_labels = [letter_to_int[label] for label in train_labels]

homogeneity_completeness_v_score = homogeneity_completeness_v_measure(predicitions, int_train_labels)

print("Using automated KMeans where k=" + str(max(predicitions)) + ":")
print_cluster_results(homogeneity_completeness_v_score)


Using KMeans where k=20:
The homogeneity is 0.84
The completeness is 0.79
The v_score is 0.82

Using automated KMeans where k=7:
The homogeneity is 0.88
The completeness is 0.56
The v_score is 0.69


# Task 6: Additional Clustering Algorithm (10%)
# Choose another clustering algorithm and apply it on the features.
# Test accuracy with available labels.

# Insert your code here for Task 6




In [4]:
from sklearn.cluster import DBSCAN
from sklearn.metrics.cluster import homogeneity_completeness_v_measure

letter_to_int = {
    "A": 0,
    "B": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5,
    "G": 6,
    "H": 7,
    "I": 8,
    "J": 9,
    "K": 10,
    "L": 11,
    "M": 12,
    "N": 13,
    "O": 14,
    "P": 15,
    "Q": 16,
    "R": 17,
    "S": 18,
    "T": 19
}

def print_cluster_results(homogeneity_completeness_v_score):
    print("The homogeneity is " + str(round(homogeneity_completeness_v_score[0], 2)))
    print("The completeness is " + str(round(homogeneity_completeness_v_score[1], 2)))
    print("The v_score is " + str(round(homogeneity_completeness_v_score[2], 2)))

MAXIMUM_RADIUS = 12
MINIMUM_SAMPLES = 2

# Run DBSCAN algorithm
clusters = DBSCAN(eps=12, min_samples=2).fit(train_features)
predicitions = clusters.labels_

int_train_labels = [letter_to_int[label] for label in train_labels]

homogeneity_completeness_v_score = homogeneity_completeness_v_measure(predicitions, int_train_labels)

print("Using automated DBSCAN with maxmum radius of " + str(MAXIMUM_RADIUS) + \
    " and minimum samples per cluster of " + str(MINIMUM_SAMPLES) + " images")
print_cluster_results(homogeneity_completeness_v_score)


Using automated DBSCAN with maxmum radius of 12 and minimum samples per cluster of 2 images
The homogeneity is 0.65
The completeness is 0.32
The v_score is 0.43


# Task 7: PCA for Classification Improvement (20%)
# Apply PCA on the features and then feed them to the best classification method in the above tasks.
# Assess if PCA improves outcomes and discuss the results.

# Insert your code here for Task 7




# Task 8: Visualization and Analysis (10%)
# Plot the features in a lower dimension using dimentinality reduction techniques.
# Analyze the visual representation, identifying patterns or insights.

# Insert your code here for Task 8