# Gzip as a Machine Learning Classifier

In [1]:
from __future__ import annotations
import gzip
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.estimator_checks import check_estimator
from sklearn import datasets

In [3]:
# Preview the first rows of foo.csv (head() shows 5 rows by default).
pd.read_csv("foo.csv").head()

Unnamed: 0,a,b,c,d,e,y
0,1,2,3,4,5,0
1,1,2,3,4,5,0
2,1,2,3,4,5,0
3,1,2,3,4,5,0
4,1,2,3,4,5,0


In [4]:
def gzip_size(filename: str, append: str | None = None) -> int:
    """Return the gzip-compressed size, in bytes, of a file's contents.

    The file is read in binary mode. When ``append`` is given, its UTF-8
    encoding is concatenated directly after the file's bytes before
    compressing; no newline or other separator is inserted.

    Args:
        filename: Path of the file to read.
        append: Optional text to tack onto the end of the file's contents.

    Returns:
        Length of the gzip-compressed payload.
    """
    with open(filename, "rb") as source:
        payload = source.read()

    # An absent suffix contributes nothing to the payload.
    suffix = b"" if append is None else bytes(append, "utf-8")
    compressed = gzip.compress(payload + suffix)
    return len(compressed)


# Print the compressed size of the file alone, then with a candidate row
# appended once per trailing label value.
for extra_row in (None, "1,2,3,4,5,0", "1,2,3,4,5,1"):
    print(gzip_size("foo.csv", extra_row))

45
45
46


In [5]:
class GzipClassifier(BaseEstimator, ClassifierMixin):
    """Classifier that predicts the label yielding the smallest gzip size.

    For each sample, every candidate label is appended (as ``",<label>"``)
    to the sample text, that text is appended to the reference file, and
    the result is gzip-compressed. The label producing the smallest
    compressed size is the prediction.

    Args:
        label_set: Candidate labels to try for every sample. On a tie in
            compressed size, the label appearing first in this list wins.
    """

    def __init__(self, label_set: list):
        self.label_set = label_set

    def fit(self, X: str, y=None) -> "GzipClassifier":
        """Record the reference file used as the compression context.

        Args:
            X: Path to the file whose contents each sample is appended to.
            y: Unused; accepted for scikit-learn API compatibility.

        Returns:
            The estimator itself, so calls can be chained.
        """
        self.filename = X
        # Compressed size of the file alone; stored for inspection but not
        # consulted by predict().
        self.base_size = gzip_size(X)
        # scikit-learn estimators return self from fit to allow chaining.
        return self

    def predict(self, X: list[str]) -> list:
        """Predict one label per sample.

        Args:
            X: Samples, each a string of comma-separated feature values
                without the trailing label field.

        Returns:
            A list with one entry from ``label_set`` per sample, in order.

        Raises:
            ValueError: If ``X`` is non-empty and ``label_set`` is empty
                (there is nothing to take the minimum over).
        """
        predictions = []
        for x in X:
            # Compressed size of "reference file + sample + ,label" for
            # every candidate label.
            sizes = {
                label: gzip_size(self.filename, x + f",{label}")
                for label in self.label_set
            }
            # min() keeps the first key on ties, i.e. label_set order.
            predictions.append(min(sizes, key=sizes.get))

        return predictions


In [7]:
# Binary demo: the candidate labels are 0 and 1.
clf = GzipClassifier([0, 1])
# "Fitting" takes the path of the CSV that serves as the compression context.
clf.fit("foo.csv")
# Classify one row given as comma-separated feature text without a label field.
clf.predict(["1,2,3,4,5"])

[0]

In [8]:
# Load iris as pandas objects: X holds the feature columns, y the target.
X, y = datasets.load_iris(return_X_y=True, as_frame=True)
# With no path argument, to_csv returns the CSV text as a string;
# index=False leaves out the row-index column.
X.to_csv(index=False)


'sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)\n5.1,3.5,1.4,0.2\n4.9,3.0,1.4,0.2\n4.7,3.2,1.3,0.2\n4.6,3.1,1.5,0.2\n5.0,3.6,1.4,0.2\n5.4,3.9,1.7,0.4\n4.6,3.4,1.4,0.3\n5.0,3.4,1.5,0.2\n4.4,2.9,1.4,0.2\n4.9,3.1,1.5,0.1\n5.4,3.7,1.5,0.2\n4.8,3.4,1.6,0.2\n4.8,3.0,1.4,0.1\n4.3,3.0,1.1,0.1\n5.8,4.0,1.2,0.2\n5.7,4.4,1.5,0.4\n5.4,3.9,1.3,0.4\n5.1,3.5,1.4,0.3\n5.7,3.8,1.7,0.3\n5.1,3.8,1.5,0.3\n5.4,3.4,1.7,0.2\n5.1,3.7,1.5,0.4\n4.6,3.6,1.0,0.2\n5.1,3.3,1.7,0.5\n4.8,3.4,1.9,0.2\n5.0,3.0,1.6,0.2\n5.0,3.4,1.6,0.4\n5.2,3.5,1.5,0.2\n5.2,3.4,1.4,0.2\n4.7,3.2,1.6,0.2\n4.8,3.1,1.6,0.2\n5.4,3.4,1.5,0.4\n5.2,4.1,1.5,0.1\n5.5,4.2,1.4,0.2\n4.9,3.1,1.5,0.2\n5.0,3.2,1.2,0.2\n5.5,3.5,1.3,0.2\n4.9,3.6,1.4,0.1\n4.4,3.0,1.3,0.2\n5.1,3.4,1.5,0.2\n5.0,3.5,1.3,0.3\n4.5,2.3,1.3,0.3\n4.4,3.2,1.3,0.2\n5.0,3.5,1.6,0.6\n5.1,3.8,1.9,0.4\n4.8,3.0,1.4,0.3\n5.1,3.8,1.6,0.2\n4.6,3.2,1.4,0.2\n5.3,3.7,1.5,0.2\n5.0,3.3,1.4,0.2\n7.0,3.2,4.7,1.4\n6.4,3.2,4.5,1.5\n6.9,3.1,4.9,1.5\n5.5,2.3,4.0,1.3\n6.5,2.8,4.