In [3]:
import numpy as np

from compressor.data import ACLIMDB
from compressor.classifier import TextClassifier
from compressor.evaluate import ClassifierEvaluator

# --- experiment settings ---
k = 2             # number of nearest neighbours handed to the classifier
n_train = 25_000  # upper bound on the number of training samples kept
n_test = 1_000    # upper bound on the number of test samples kept

# --- data ---
# Create the dataset handle, load it, then pull out the train/test split.
aclimdb = ACLIMDB(shuffle=True, download=False)
aclimdb.load_data()
X_train, X_test, y_train, y_test = aclimdb.get_train_test_data()

# Keep only the leading n_train / n_test entries of each split.
X_train, y_train = X_train[:n_train], y_train[:n_train]
X_test, y_test = X_test[:n_test], y_test[:n_test]

print("Data shapes:")
print(X_train.shape, X_test.shape)

print("Class counts:")
print("Train:", np.unique(y_train, return_counts=True))

# --- model ---
classifier = TextClassifier(n_jobs=4, k=k)
classifier.fit(X_train, y_train)

# Turn the predictions into an int64 array before they reach the evaluator.
predictions = np.array(classifier.predict(X_test)).astype('int64')

# --- evaluation ---
# Compare the predictions against the held-out labels and produce the report.
evaluator = ClassifierEvaluator(y_test, predictions)
evaluator.get_report()

Data loading complete!
(25000,) (1000,)


Predicting: 100%|██████████| 1000/1000 [57:36<00:00,  3.46s/it] 

Accuracy: 0.662
Precision: 0.7214840918350609
Recall: 0.6675150772974119
F1 score: 0.642064419932564



