# "Low-Resource" Text Classification: A Parameter-Free Classification Method with Compressors

Jiang et. al, Association for Computational Linguistics: ACL 2023, pages 6810-6828

Normalized Compression Distance (NCD) utilizing compressed length $C(x)$ to approximate Kolmogorov complexity $K(x)$

$NCD(x, y) = \cfrac{C(xy) - \min{\left( C(x), C(y) \right)}}{\max{\left( C(x), C(y) \right)}}$

Where in this case, $C(x)$ means the length of $x$ after being compressed by gzip. $C(xy)$ is the compressed length of concatenation of $x$ and $y$. With the distance matrix NCD provides, we can then use $k$-nearest-neighbor to perform classification.

In [136]:
%config Completer.use_jedi = False

In [4]:
%%capture
!pip install datasets

In [133]:
import numpy as np
import pandas as pd
import datasets
import gzip

from sklearn import metrics

In [195]:
def gzip_compress(x):
    return len(gzip.compress(x.encode()))

def ncd(Cx1, Cx2, Cx1x2):
    return (Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2)

def ncd_gzip(training_set, test_set, content, k):
    res = []

    for x1 in test_set:
        
        Cx1 = compress(x1[content])

        distance_from_x1 = []

        for x2 in training_set:

            Cx2 = compress(x2[content])

            x1x2 = " ".join([x1[content], x2[content]])
            Cx1x2 = compress(x1x2)

            distance = ncd(Cx1, Cx2, Cx1x2)
            distance_from_x1.append(distance)

        sorted_idx = np.argsort(np.array(distance_from_x1))
        top_k_class = training_set[sorted_idx[:k]]['label']

        predict_class = max(set(top_k_class), key=top_k_class.count)

        res.append([predict_class, x1['label']])
    
    return pd.DataFrame(res, columns=['predict', 'true'])

### Testing on the 20News dataset

Dataset with highest number of words by document (avg. of 406 words), written in English

In [189]:
from datasets import load_dataset

dataset = load_dataset("SetFit/20_newsgroups")

Found cached dataset json (/home/fausto/.cache/huggingface/datasets/SetFit___json/SetFit--20_newsgroups-f05bfc706e284479/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/2 [00:00<?, ?it/s]

In [204]:
training_set = dataset['train'].shard(num_shards=2, index=0)
test_set = dataset['test'].shard(num_shards=200, index=0)

print('Training set size:', training_set.shape)
print('Test set size:', test_set.shape)

Training set size: (5657, 3)
Test set size: (38, 3)


In [205]:
res = ncd_gzip(training_set, test_set, content='text', k=5)

In [207]:
print(metrics.classification_report(res['predict'], res['true']))

              precision    recall  f1-score   support

           0       0.67      0.25      0.36         8
           1       0.00      0.00      0.00         3
           2       0.33      0.25      0.29         4
           3       0.50      0.14      0.22         7
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         1
           6       1.00      0.67      0.80         3
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         1
           9       0.33      1.00      0.50         1
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         0
          14       0.20      1.00      0.33         1
          15       0.00      0.00      0.00         0
          16       1.00      1.00      1.00         2
          18       0.00      0.00      0.00         0
          19       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [208]:
metrics.confusion_matrix(res['predict'], res['true'])

array([[2, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 1, 1, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,

### Testing on the SogouNews dataset

Dataset with highest number of words by document (avg. of 589 words), written in foreign language Sogou

In [209]:
from datasets import load_dataset

dataset = load_dataset("sogou_news")

Found cached dataset sogou_news (/home/fausto/.cache/huggingface/datasets/sogou_news/default/0.0.0/dd1f148239e73c4200e6965abe37873b6bff9f511d3a7b290338d3750e780cf1)


  0%|          | 0/2 [00:00<?, ?it/s]

In [210]:
training_set = dataset['train'].shard(num_shards=500, index=0)
test_set = dataset['test'].shard(num_shards=1000, index=0)

print('Training set size:', training_set.shape)
print('Test set size:', test_set.shape)

Training set size: (900, 3)
Test set size: (60, 3)


In [211]:
res = ncd_gzip(training_set, test_set, content='content', k=5)

In [212]:
df_res = pd.DataFrame(res, columns=['predict', 'true'])

print(metrics.classification_report(df_res['predict'], df_res['true']))

              precision    recall  f1-score   support

           0       0.86      0.71      0.77        17
           1       1.00      0.71      0.83         7
           2       0.53      0.80      0.64        10
           3       0.94      0.89      0.91        18
           4       0.89      1.00      0.94         8

    accuracy                           0.82        60
   macro avg       0.84      0.82      0.82        60
weighted avg       0.85      0.82      0.82        60



In [213]:
metrics.confusion_matrix(df_res['predict'], df_res['true'])

array([[12,  0,  4,  0,  1],
       [ 0,  5,  2,  0,  0],
       [ 1,  0,  8,  1,  0],
       [ 1,  0,  1, 16,  0],
       [ 0,  0,  0,  0,  8]])