# Homework 1 P3 and P4 for 20Ng dataset
### The below code imports the 20NG dataset, normalizes it and then splits into train, validation and test.

In [30]:
import numpy as np
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

vectorizer = TfidfVectorizer(stop_words='english')

train_data_vector = vectorizer.fit_transform(newsgroups_train.data)
test_data_vector = vectorizer.fit_transform(newsgroups_test.data)

train_labels = newsgroups_train.target
test_labels = newsgroups_test.target

random_sample_indices = random.sample(range(train_data_vector.shape[0]), 3000)
train_data_20 = train_data_vector[random_sample_indices]
train_labels_20 = train_labels[random_sample_indices]

train_data_final_80, validation_data_final_10, train_labels_final_80,validation_labels_final_10 = train_test_split(train_data_20, train_labels_20, test_size=0.1, random_state=42)

### The below code calculates the euclidian distance between every newsgroup vector in the training set

In [31]:
from sklearn.metrics.pairwise import euclidean_distances
euclidian_distances_train = euclidean_distances(train_data_final_80)

### The below code calculates the euclidian distance between every newsgroup vector in the validation set

In [32]:
euclidian_distances_validation = euclidean_distances(validation_data_final_10)

### The below code calculates the euclidian distance between every newsgroup vector in the test set

In [33]:
euclidian_distances_test = euclidean_distances(test_data_vector)

### The below code calculates the cosine similarity between every newsgroup vector in the training set

In [34]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity_mat = cosine_similarity(train_data_final_80)

### The below code calculates the cosine similarity between every newsgroup vector in the validation set

In [35]:
cosine_similarity_mat = cosine_similarity(validation_data_final_10)

### The below code calculates the cosine similarity between every newsgroup vector in the test set

In [36]:
cosine_similarity_mat = cosine_similarity(test_data_vector)

In [37]:
print(train_data_final_80[4].shape[1])

129796


In [38]:
num_vectors = train_data_final_80.shape[0]
num_features = train_data_final_80[0].shape[1]

def calculate_edit_distance(image1, image2):
    differing_pixels = np.sum(np.abs(image1 - image2) >= 0.5)
    edit_distance = differing_pixels / num_features
    return edit_distance

edit_distance_mat = np.zeros((num_vectors, num_vectors))
for i in range(num_vectors):
  for j in range(num_vectors):
    distance = calculate_edit_distance(train_data_final_80[i], train_data_final_80[j])
    edit_distance_mat[i][j] = distance

In [43]:
print(edit_distance_mat)

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  7.70439767e-06 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  7.70439767e-06 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  7.70439767e-06 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  7.70439767e-06 0.00000000e+00]
 [7.70439767e-06 7.70439767e-06 7.70439767e-06 ... 7.70439767e-06
  0.00000000e+00 7.70439767e-06]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  7.70439767e-06 0.00000000e+00]]


### The below code is an implementation of KNN. It uses the precomputed distance matrix to predict labels.

In [39]:
from collections import Counter

class KNN:
  def __init__(self, k=20):
    self.k = k

  def fit(self, X, y): #X=news_data, y=news_label
    self.X_train = X
    self.y_train = y


  def predict(self, euclidian_dist_mat):
    predictions_list = []

    for i in range(len(euclidian_dist_mat)):
      k_indices = np.argsort(euclidian_dist_mat[i])[:self.k]
      k_nearest_labels = [self.y_train[i] for i in k_indices]
      mode = Counter(k_nearest_labels).most_common()
      predictions_list.append(mode[0][0])
    return predictions_list

### Training data on the implemented KNN and measuring accuracy

In [40]:
from sklearn.metrics import accuracy_score
clf = KNN(k=20)
clf.fit(train_data_final_80, train_labels_final_80)
prediction = clf.predict(euclidian_distances_train)
train_accuracy = accuracy_score(train_labels_final_80, prediction)
print("Train accuracy: ", train_accuracy)

Train accuracy:  0.8151851851851852


### Running KNN on the validation, and performing hyper-parameter tuning to find optimum k value.

In [41]:
#Performing hyper parameter tuning here to get an optimum k value..
optimum_k = -1
max_accuracy = 0
for i in range(10, 50):
    clf2 = KNN(k=10)
    clf2.fit(validation_data_final_10, validation_labels_final_10)
    prediction = clf2.predict(euclidian_distances_validation)
    validation_accuracy = accuracy_score(validation_labels_final_10, prediction)
    if validation_accuracy > max_accuracy:
            optimum_k = i
            max_accuracy = validation_accuracy

print("Optimum K: ", optimum_k)
print("validation accuracy: ", max_accuracy)

Optimum K:  10
validation accuracy:  0.7833333333333333


### Running KNN on test data and measuring accuracy

In [42]:
clf3 = KNN(k=10)
clf3.fit(test_data_vector, test_labels)
prediction3 = clf3.predict(euclidian_distances_test)
test_accuracy = accuracy_score(test_labels, prediction3)
print("Test accuracy: ", test_accuracy)

Test accuracy:  0.8648433351035582
