In [1]:
import sklearn.datasets as skd
import pandas as pd
import numpy as np

In [2]:
dataset = skd.fetch_20newsgroups()

In [3]:
idx = np.random.randint(0, len(dataset.target), 2000)
idx_test = idx[0:1000]
idx_train = idx[1000:2000]
train, test = [dataset.data[idx] for idx in idx_train], [dataset.data[idx] for idx in idx_test]
train_target, test_target = [dataset.target[idx] for idx in idx_train], [dataset.target[idx] for idx in idx_test]

In [4]:
import sklearn.feature_extraction.text as txt

hashing = txt.HashingVectorizer()
vec_train = hashing.fit_transform(train)
vec_test = hashing.transform(test)

In [5]:
tfid = txt.TfidfTransformer()
tfid_train = tfid.fit_transform(vec_train)
tfid_test = tfid.transform(vec_test)

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

def nontfid_acc_for_metric(metric_to_test):
    neigh = KNeighborsClassifier(metric=metric_to_test)
    neigh.fit(vec_train, train_target)

    test_predict = neigh.predict(vec_test)
    print("nontfid acc: %lf"%(accuracy_score(test_target, test_predict)))

In [7]:
def tfid_acc_for_metric(metric_to_test):
    neigh = KNeighborsClassifier(metric=metric_to_test)
    neigh.fit(tfid_train, train_target)

    test_predict = neigh.predict(tfid_test)
    print("tfid acc: %lf"%(accuracy_score(test_target, test_predict)))

In [8]:
metrics = ['euclidean', 'manhattan', 'minkowski']

for metric in metrics:
    print(metric)
    nontfid_acc_for_metric(metric)
    tfid_acc_for_metric(metric)

euclidean
nontfid acc: 0.315000
tfid acc: 0.561000
manhattan
nontfid acc: 0.272000
tfid acc: 0.170000
minkowski
nontfid acc: 0.315000
tfid acc: 0.561000
