# Multi-class Zest

In [1]:
import numpy as np
import pandas as pd
import re
import sys
import time
import zstandard as zstd

# adding Zest classes to the system path
sys.path.insert(0, '../zest')

from preprocess_utils import normalize, tokenize, StringNormalizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from typing import Union, Dict, List
from zest import ZstdMulticlassClassifier

In [2]:
news = pd.read_csv("uci-news-aggregator.csv")

In [3]:
news_text = [normalize(s) for s in news['TITLE']]
zest_news_text = [s for s in news['TITLE']]

In [4]:
encoder = LabelEncoder()
y = encoder.fit_transform(news['CATEGORY'])

# split into train and test sets
x_train, x_test, y_train, y_test, text_train, text_test = train_test_split(news_text, y, zest_news_text, test_size=0.2)

# pull the data into vectors
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

# take a look at the shape of each of these
print(x_train.shape)
print(y_train.shape)
# print(text_train.shape)
print(x_test.shape)
print(y_test.shape)
# print(text_test.shape)

(337935, 65886)
(337935,)
(84484, 65886)
(84484,)


In [5]:
nb = MultinomialNB()
start_time = time.time()
nb.fit(x_train[:,:-1], y_train)
end_time = time.time()
print("NB train time: ", end_time - start_time)

NB train time:  0.10689401626586914


In [6]:
start_time = time.time()
print("NB score: ", nb.score(x_test[:,:-1], y_test))
end_time = time.time()
print("NB test time: ", end_time - start_time)

NB score:  0.9268974006912551
NB test time:  0.023633956909179688


In [7]:
lr = LogisticRegression(class_weight="balanced")
start_time = time.time()
lr.fit(x_train[:,:-1], y_train)
end_time = time.time()
print("LR train time: ", end_time - start_time)

LR train time:  24.45448660850525


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [8]:
start_time = time.time()
print("LR score: ", lr.score(x_test[:,:-1], y_test))
end_time = time.time()
print("LR test time: ", end_time - start_time)

LR score:  0.9447114246484541
LR test time:  0.05406594276428223


In [9]:
class_map = {}
text = list(text_train)
for i in range(y_train.shape[0]):
    curr_class = y_train[i]
    if curr_class not in class_map: 
        class_map[curr_class] = [text[i]]
    else:
        class_map[curr_class].append(text[i])

In [10]:
start_time = time.time()
cls = ZstdMulticlassClassifier(class_map, 4) # 4 dicts, level 22, min 256
end_time = time.time()
print("Zest train time: ", end_time - start_time)

4 classes;  dictionary sizes: [16384, 422344, 828304, 1234264]
Zest train time:  140.05222296714783


In [11]:
tests = list(text_test)

predicted_class = []

start_time = time.time()

for i in range(len(tests)):
    zest_scores = cls.getClassAffinities(tests[i], sort=False)
    zest_scores.sort(key=lambda x:x[1])
    lr_scores = lr.predict_proba(x_test[i, :-1])[0]
    
    # average of lr score and zest score
    pred_values = [(zest_scores[i][0] + lr_scores[i]) / 2.0 for i in range(len(zest_scores))]
    value = max(pred_values)
    pred_class = pred_values.index(value)
    predicted_class.append(pred_class)

end_time = time.time()
print("Zest + LR test time: ", end_time - start_time)

Zest + LR test time:  183.90430116653442


In [12]:
np.mean(np.array(predicted_class) == list(y_test)) #combined

0.9494342123952464

In [13]:
tests = list(text_test)

predicted_class = []
start_time = time.time()

for t in tests:
    predicted_class.append(cls.getClassAffinities(t)[0][1])

end_time = time.time()
print("Zest test time: ", end_time - start_time)

Zest test time:  82.06365728378296


In [14]:
np.mean(np.array(predicted_class) == list(y_test)) # zest only

0.9221746129444629

In [15]:
start_time = time.time()
cls = ZstdMulticlassClassifier(class_map, 2) # 2 dicts, level 22, min 256
end_time = time.time()
print("Zest train time: ", end_time - start_time)

4 classes;  dictionary sizes: [16384, 828304]
Zest train time:  67.7407591342926


In [16]:
tests = list(text_test)

predicted_class = []

start_time = time.time()

for i in range(len(tests)):
    zest_scores = cls.getClassAffinities(tests[i], sort=False)
    zest_scores.sort(key=lambda x:x[1])
    lr_scores = lr.predict_proba(x_test[i, :-1])[0]

    pred_values = [(zest_scores[i][0] + lr_scores[i]) / 2.0 for i in range(len(zest_scores))]
    value = max(pred_values)
    pred_class = pred_values.index(value)
    predicted_class.append(pred_class)

end_time = time.time()
print("Zest + LR test time: ", end_time - start_time)

Zest + LR test time:  142.45083379745483


In [17]:
np.mean(np.array(predicted_class) == list(y_test)) #combined

0.9466289474930164

In [18]:
tests = list(text_test)

predicted_class = []
start_time = time.time()

for t in tests:
    predicted_class.append(cls.getClassAffinities(t)[0][1])

end_time = time.time()
print("Zest test time: ", end_time - start_time)

Zest test time:  31.444836854934692


In [19]:
np.mean(np.array(predicted_class) == list(y_test)) # zest only

0.8867241134415984

In [20]:
start_time = time.time()
cls = ZstdMulticlassClassifier(class_map, 1) # 1 dict, level 22, min 256
end_time = time.time()
print("Zest train time: ", end_time - start_time)

4 classes;  dictionary sizes: [16384]
Zest train time:  32.15986680984497


In [21]:
tests = list(text_test)

predicted_class = []

start_time = time.time()

for i in range(len(tests)):
    zest_scores = cls.getClassAffinities(tests[i], sort=False)
    zest_scores.sort(key=lambda x:x[1])
    lr_scores = lr.predict_proba(x_test[i, :-1])[0]

    pred_values = [(zest_scores[i][0] + lr_scores[i]) / 2.0 for i in range(len(zest_scores))]
    value = max(pred_values)
    pred_class = pred_values.index(value)
    predicted_class.append(pred_class)

end_time = time.time()
print("Zest + LR test time: ", end_time - start_time)

Zest + LR test time:  132.20576310157776


In [22]:
np.mean(np.array(predicted_class) == list(y_test)) #combined

0.937278064485583

In [23]:
tests = list(text_test)

predicted_class = []
start_time = time.time()

for t in tests:
    predicted_class.append(cls.getClassAffinities(t)[0][1])

end_time = time.time()
print("Zest test time: ", end_time - start_time)

Zest test time:  14.321686029434204


In [24]:
np.mean(np.array(predicted_class) == list(y_test)) # zest only

0.7407556460394867