In [1]:
import codecs
import numpy
import os
from tqdm import tqdm

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
path_to_data = os.environ.get("HW1_QUESTION3_DATASET_PATH")
mnist_files = os.listdir(path_to_data)
mnist_files = [x for x in mnist_files if x.endswith("ubyte")]


def convert_to_int(byte):
    integer = int(codecs.encode(byte, 'hex'), 16)
    return integer


dataset = {}
for file in mnist_files:
    print("Reading", file)
    with open(path_to_data + file, "rb") as f:
        data = f.read()
        type_of_data = convert_to_int(data[:4])
        length = convert_to_int(data[4:8])
        if type_of_data == 2051:
            category = "images"
            number_of_rows = convert_to_int(data[8:12])
            number_of_columns = convert_to_int(data[12:16])
            parsed = numpy.frombuffer(data, dtype=numpy.uint8, offset=16)
            parsed = parsed.reshape(length, number_of_rows, number_of_columns)
        if type_of_data == 2049:
            category = "labels"
            parsed = numpy.frombuffer(data, dtype=numpy.uint8, offset=8)
            parsed = parsed.reshape(length)
        if length == 60000:
            set = "train"
        if length == 10000:
            set = "test"
        dataset[set + '_' + category] = parsed

print(dataset["train_images"][0, :, :])

Reading t10k-images-idx3-ubyte
Reading t10k-labels-idx1-ubyte
Reading train-images-idx3-ubyte
Reading train-labels-idx1-ubyte
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   3  18  18  18 126 136
  175  26 166 255 247 127   0   0   0   0]
 [  0   0   0   0   0   0   0   0  30  36  94 154 170 253 253 253 253 253
  225 172 253 242 195  64   0   0   0   0]
 [  0   0   0   0   0   0   0  49 238 253 253 253 253 2

In [4]:
train_images = numpy.array(dataset['train_images'])
print(type(train_images))

<class 'numpy.ndarray'>


In [5]:
min_max_normalization_train_images = train_images/255
print(numpy.min(min_max_normalization_train_images))
print(numpy.max(min_max_normalization_train_images))

0.0
1.0


In [6]:
overall_mean = numpy.mean(train_images)
overall_std_dev = numpy.std(train_images)
z_normalization_train_images = (train_images - overall_mean) / overall_std_dev
print("Min:", numpy.min(z_normalization_train_images))
print("Max:", numpy.max(z_normalization_train_images))
print("Standard Deviation:", numpy.mean(z_normalization_train_images))
print("Variance", numpy.var(z_normalization_train_images))

Min: -0.424073894391566
Max: 2.821543345689335
Standard Deviation: -3.064638490070051e-17
Variance 0.9999999999999997


In [None]:
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

flattened_images = min_max_normalization_train_images.reshape(60000, -1)
n_samples = flattened_images.shape[0]
batch_size = 1000
pairwise_euclidean_matrix_mnist = numpy.zeros((n_samples, n_samples), dtype=numpy.float32)
pairwise_cosine_similarity_matrix_mnist = numpy.zeros((n_samples, n_samples), dtype=numpy.float32)

for i in range(0, n_samples, batch_size):
    end_i = min(i + batch_size, n_samples)

    current_batch = flattened_images[i:end_i]

    distances = pairwise_distances(current_batch, flattened_images, metric="euclidean")
    similarities = cosine_similarity(current_batch, flattened_images)

    pairwise_euclidean_matrix_mnist[i:end_i, :] = distances
    pairwise_cosine_similarity_matrix_mnist[i:end_i] = distances

print("First 5x5 of euclidean distance matrix:")
print(pairwise_euclidean_matrix_mnist[:5, :5])

print("First 5x5 of cosine similarity matrix:")
print(pairwise_cosine_similarity_matrix_mnist[:5, :5])

In [15]:
def self_implemented_pairwise_euclidean_dist_mnist(vector_data, batch_size=1000):

    self_implemented_pairwise_euclidean_dist_matrix_mnist = numpy.zeros((n_samples, n_samples), dtype=numpy.float32)

    squared_norm = numpy.sum(vector_data**2, axis=1, keepdims=True)

    for i in range(0, n_samples, batch_size):
        end_i = min(i+batch_size, n_samples)

        current_batch = vector_data[i:end_i]

        dot_prod = numpy.dot(current_batch, vector_data.T)

        dists = numpy.sqrt(squared_norm[i:end_i] + squared_norm.T - 2 * dot_prod)
        dists[dists < 0.0000001] = 0.0

        self_implemented_pairwise_euclidean_dist_matrix_mnist[i:end_i, :] = dists
    
    self_implemented_pairwise_euclidean_dist_matrix_mnist = numpy.nan_to_num(self_implemented_pairwise_euclidean_dist_matrix_mnist, num=0)
    return self_implemented_pairwise_euclidean_dist_matrix_mnist

self_impl_euc_dist_mat_mnist = self_implemented_pairwise_euclidean_dist_mnist(flattened_images)
print("First 5x5 of self implemented euclidean distance matrix:")
print(self_impl_euc_dist_mat_mnist[:5, :5])

  dists = numpy.sqrt(squared_norm[i:end_i] + squared_norm.T - 2 * dot_prod)


First 5x5 of self implemented euclidean distance matrix:
[[ 0.         9.361222  10.875094  10.018901  10.480264 ]
 [ 9.361222   0.        11.368364  10.849123  10.367248 ]
 [10.875094  11.368364         nan 10.084713  10.053022 ]
 [10.018901  10.849123  10.084713   0.         9.3679085]
 [10.480264  10.367248  10.053022   9.3679085  0.       ]]


<div class="alert alert-block alert-success">
20ng
</div>

In [23]:
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint

categories = ["alt.atheism", "sci.med", "sci.electronics", "comp.graphics", "talk.politics.guns", "sci.crypt"]
newsgroups_train = fetch_20newsgroups(subset="train", categories=categories)
print("Labels in the 20NG dataset.")
pprint(list(newsgroups_train.target_names))
print("\n\nNumber of datapoints and there labels obtained.")
print("\nNumber of files:")
pprint(newsgroups_train.filenames.shape)
print("\nNumber of target labels:")
pprint(newsgroups_train.target.shape)

print("\n\nExample file:")
pprint(newsgroups_train.filenames[0])
print("\nFirst 10 target labels:")
print(newsgroups_train.target[:10])

newsgroups_test = fetch_20newsgroups(subset="test", categories=categories)
print("\nTest SET")
print("Number of files in test dataset:", newsgroups_test.filenames.shape)
print("Number of target labels in test dataset:", newsgroups_test.target.shape)

# newsgroups = fetch_20newsgroups(subset="all", categories=categories)
# print("Total number of target labels in dataset:", newsgroups.target.shape)


Labels in the 20NG dataset.
['alt.atheism',
 'comp.graphics',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'talk.politics.guns']


Number of datapoints and there labels obtained.

Number of files:
(3390,)

Number of target labels:
(3390,)


Example file:
'/Users/ajeyk/scikit_learn_data/20news_home/20news-bydate-train/sci.electronics/53773'

First 10 target labels:
[3 1 1 4 1 1 1 1 4 3]

Test SET
Number of files in test dataset: (2257,)
Number of target labels in test dataset: (2257,)
Total number of target labels in dataset: (5647,)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectors_train = vectorizer.fit_transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)
# vectors = vectorizer.transform(newsgroups.data)
# print(vectors.shape)

(5647, 49878)


In [10]:
from sklearn.metrics import pairwise_distances

distance_matrix = pairwise_distances(vectors_train, metric="euclidean")
print("Shape of the pairwise Euclidean distance matrix:", distance_matrix.shape)
print("Top left 5x5 portion of the distance matrix:")
print(distance_matrix[:5, :5])

Shape of the pairwise Euclidean distance matrix: (5647, 5647)
Top left 5x5 portion of the distance matrix:
[[0.         1.38509798 1.40314855 1.40214168 1.39008564]
 [1.38509798 0.         1.3927232  1.40101979 1.33107971]
 [1.40314855 1.3927232  0.         1.3332657  1.37424215]
 [1.40214168 1.40101979 1.3332657  0.         1.39333289]
 [1.39008564 1.33107971 1.37424215 1.39333289 0.        ]]


In [10]:
def pairwise_euclidean_distance_matrix(matrix, batch_size=500):
    if hasattr(matrix, "toarray"):
        matrix = matrix.toarray()
    n_samples = matrix.shape[0]
    row_norms = numpy.sum(matrix**2, axis=1).reshape(-1, 1)
    distance_matrix_self_implemented = numpy.zeros((n_samples, n_samples), dtype=numpy.float64)
    for i in range(0, n_samples, batch_size):
        end_i = min(i + batch_size, n_samples)
        for j in (0, n_samples, batch_size):
            end_j = min(i + batch_size, n_samples)

            batch_i = matrix[i:end_i]
            batch_j = matrix[j:end_j]

            dot_product = numpy.dot(batch_i, batch_j.T)

            dq_sqr = row_norms[i:end_i] + row_norms[j:end_j].T - 2 * dot_product
            dq_sqr[dq_sqr < 0.000001] = 0.0
            dists = numpy.sqrt(dq_sqr)
            distance_matrix_self_implemented[i:end_i, j:end_j] = dists
    return distance_matrix_self_implemented

dense_vectors = vectors_train.toarray()
self_implemented_pairwise_euclidean_matrix = pairwise_euclidean_distance_matrix(dense_vectors, 1000)

print("Shape of pairwise Euclidean Matrix:", self_implemented_pairwise_euclidean_matrix.shape)
print("Top left 5x5 corner of self implemented pairwise Euclidean matrix")
print(self_implemented_pairwise_euclidean_matrix[:5, :5])

Shape of pairwise Euclidean Matrix: (5647, 5647)
Top left 5x5 corner of self implemented pairwise Euclidean matrix
[[0.         1.38509798 1.40314855 1.40214168 1.39008564]
 [1.38509798 0.         1.3927232  1.40101979 1.33107971]
 [1.40314855 1.3927232  0.         1.3332657  1.37424215]
 [1.40214168 1.40101979 1.3332657  0.         1.39333289]
 [1.39008564 1.33107971 1.37424215 1.39333289 0.        ]]


In [37]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity_matrix = cosine_similarity(vectors_train)
print("Cosine Similarity Matrix:")
print(cosine_similarity_matrix[:5, :5])

Cosine Similarity Matrix:
[[1.         0.02216726 0.01163281 0.01600166 0.01296295]
 [0.02216726 1.         0.03118734 0.04183591 0.01886625]
 [0.01163281 0.03118734 1.         0.05034892 0.00717534]
 [0.01600166 0.04183591 0.05034892 1.         0.0095818 ]
 [0.01296295 0.01886625 0.00717534 0.0095818  1.        ]]


In [None]:
def compute_cosine_similarity(data):
    dot_product = numpy.dot(data, data.T)

    norms = numpy.sqrt(numpy.sum(data**2, axis=1)).reshape(-1, 1)
    
    cosine_sim = dot_product / (norms @ norms.T)
    
    # cosine_sim[numpy.isnan(cosine_sim)] = 0.0
    
    return cosine_sim

self_implemented_pairwise_cosine_similarity_matrix = compute_cosine_similarity(vectors_train.toarray())
print("First 5x5 part of self implemented cosine similarity matrix:")
print(self_implemented_pairwise_cosine_similarity_matrix[:5, :5])

First 5x5 part of self implemented cosine similarity matrix:
[[1.         0.04075179 0.01558707 ... 0.01926414 0.02609016 0.04637655]
 [0.04075179 1.         0.03016105 ... 0.0859899  0.1117154  0.15201679]
 [0.01558707 0.03016105 1.         ... 0.02889556 0.06818502 0.14793447]
 ...
 [0.01926414 0.0859899  0.02889556 ... 1.         0.05342945 0.10152586]
 [0.02609016 0.1117154  0.06818502 ... 0.05342945 1.         0.09173395]
 [0.04637655 0.15201679 0.14793447 ... 0.10152586 0.09173395 1.        ]]


In [7]:
test_images = dataset['test_images']
print(type(test_images))
test_images = test_images/255
test_images = test_images.reshape(10000, -1)

test_n_samples = test_images.shape[0]
train_n_samples = flattened_images.shape[0]

def test_train_knn_mnist_euc_dist(test_dim, train_dim, batch_size=1000):
    mnist_knn_pairwise_euclidean_distance_matrix = numpy.zeros((test_dim, train_dim), dtype=numpy.float32)

    for i in range(0, test_dim, batch_size):
        end_i = min(i+batch_size, test_dim)
        for j in range(0, train_n_samples, batch_size):
            end_j = min(j+batch_size, train_n_samples)

            current_batch_test = test_images[i:end_i]
            current_batch_train = flattened_images[j:end_j]

            dists = pairwise_distances(current_batch_test, current_batch_train, metric="euclidean")

            mnist_knn_pairwise_euclidean_distance_matrix[i:end_i, j:end_j] = dists

    return mnist_knn_pairwise_euclidean_distance_matrix

mnist_knn_dist_matrix = test_train_knn_mnist_euc_dist(test_n_samples, train_n_samples)
print(mnist_knn_dist_matrix[:10, :5])

<class 'numpy.ndarray'>
[[ 9.395277  10.394629   9.440425  10.02592    9.196898 ]
 [11.182994  11.451037  11.656922  10.847431  11.536655 ]
 [ 9.223766  10.0762825  9.153183   6.558626   9.001019 ]
 [10.031458   7.7673883 11.85379   12.15051   10.628685 ]
 [10.592882  10.964605   7.7298265  9.96961    9.697421 ]
 [ 9.403014  10.470196   9.846221   6.1866245  8.939653 ]
 [10.391592  11.739411   9.568376  10.289397   8.41528  ]
 [10.007462  11.242873   9.632654   9.458699   8.789304 ]
 [ 9.866912  10.861721  10.731546  10.592133  10.35108  ]
 [10.525787  10.245898  10.632269  10.113032   8.297613 ]]


In [22]:
total = test_images.shape[0]
correct = 0
k = 6
train_labels = dataset['train_labels']
test_labels = dataset['test_labels']

for i, each_test_image in enumerate(mnist_knn_dist_matrix):
    closest_indexes = numpy.argpartition(each_test_image, k-1)[:k]
    closest_values = numpy.array([train_labels[x] for x in closest_indexes])
    val, counts = numpy.unique(closest_values, return_counts=True)
    prediction = val[numpy.argmax(counts)]
    if prediction == test_labels[i]:
        correct += 1

acc = correct / total * 100
print(f"Accuracy: {acc: .2f}")

Accuracy:  96.77


In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

train_labels = dataset['train_labels']
test_labels = dataset['test_labels']
knn = KNeighborsClassifier(n_neighbors=6, metric='euclidean')

knn.fit(flattened_images, train_labels)

# Step 3: Predict on the test set
predicted_labels = knn.predict(test_images)

# Step 4: Calculate accuracy
accuracy = accuracy_score(test_labels, predicted_labels)

print(f"KNN Accuracy: {accuracy * 100:.2f}%")

KNN Accuracy: 96.77%


In [35]:
test_train_knn_20ng_cosine_similarity_matrix = cosine_similarity(vectors_test, vectors_train)
print("Cosine similarity matrix between test and train dataset:")
print(test_train_knn_20ng_cosine_similarity_matrix[:5, :5])

Cosine similarity matrix between test and train dataset:
[[0.03978613 0.0739227  0.06952419 0.07653471 0.01990531]
 [0.03286828 0.0810762  0.09243659 0.0823853  0.01531062]
 [0.0124257  0.02517339 0.01418638 0.01043689 0.0092545 ]
 [0.01810955 0.03236013 0.03049451 0.04241682 0.00600721]
 [0.10972774 0.053372   0.05042398 0.0522294  0.02154505]]


In [45]:
test_train_knn_20ng_cosine_DISTANCE_matrix = numpy.clip(1 - test_train_knn_20ng_cosine_similarity_matrix, a_min=0, a_max=None)

# Step 3: Initialize the KNN classifier
knn_20ng = KNeighborsClassifier(n_neighbors=6, metric='precomputed')

# Step 4: Train KNN on precomputed distance matrix
# Use cosine distances between the training set and itself for fitting
train_self_distances = numpy.clip(1 - cosine_similarity_matrix, a_min=0, a_max=None)
knn_20ng.fit(train_self_distances, newsgroups_train.target)

# Step 5: Predict using precomputed test-train distances
predictions = knn.predict(test_train_knn_20ng_cosine_DISTANCE_matrix)

# Step 6: Calculate accuracy
accuracy = accuracy_score(newsgroups_test.target, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

NotFittedError: This KNeighborsClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [42]:
knn_20ng_1 = KNeighborsClassifier(n_neighbors=6, metric="cosine")

knn_20ng_1.fit(vectors_train, newsgroups_train.target)

predicted_labels1 = knn_20ng_1.predict(vectors_test)

accuracy1 = accuracy_score(newsgroups_test.target, predicted_labels1)

print(f"KNN Accuracy: {accuracy1 * 100:.2f}%")



KNN Accuracy: 76.12%


In [48]:
k = 6
total = vectors_test.shape[0]
correct = 0

for i, row in enumerate(test_train_knn_20ng_cosine_similarity_matrix):
    closest_indices = numpy.argpartition(-row, k-1)[:k]  # Negative for descending order
    closest_labels = numpy.array([newsgroups_train.target[x] for x in closest_indices])
    val, counts = numpy.unique(closest_labels, return_counts=True)
    prediction = val[numpy.argmax(counts)]
    if prediction == newsgroups_test.target[i]:
        correct += 1
    
accuracy_20ng = correct / total * 100
print(f"KNN accuracy for 20NG dataset using cosine similarity: {accuracy_20ng:.2f}%")


KNN accuracy for 20NG dataset using cosine similarity: 76.12%
