# Multi-class Zest on news articles

In [1]:
import numpy as np
import pandas as pd
import re
import sys
import time
import zstandard as zstd

# adding Zest classes to the system path
sys.path.insert(0, '../zest')

from preprocess_utils import StringNormalizer, normalize, tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from typing import Union, Dict, List
from zest import ZstdMulticlassClassifier

In [2]:
news = pd.read_csv("news-article-categories.csv")

In [3]:
news = news[news.isnull().any(axis=1) != True]

In [4]:
news_title = [normalize(s) for s in news['title']]
zest_news_title = [s for s in news['title']]
news_body = [normalize(s) for s in news['body']]
zest_news_body = [s for s in news['title']]
news_category = [s for s in news['category']]

In [5]:
news_data = [news_title[i] + ' ' + news_body[i] for i in range(len(news_title))]
zest_news_data = [zest_news_title[i] + ' ' + zest_news_body[i] for i in range(len(news_title))]

In [6]:
encoder = LabelEncoder()
y = encoder.fit_transform(news_category)

# split into train and test sets
x_train, x_test, y_train, y_test, text_train, text_test = train_test_split(news_body, y, zest_news_body, test_size=0.2)

# pull the data into vectors
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

# take a look at the shape of each of these
print(x_train.shape)
print(y_train.shape)
# print(text_train.shape)
print(x_test.shape)
print(y_test.shape)
# print(text_test.shape)

(5497, 87400)
(5497,)
(1375, 87400)
(1375,)


In [7]:
nb = MultinomialNB()
start_time = time.time()
nb.fit(x_train[:,:-1], y_train)
end_time = time.time()
print("NB train time: ", end_time - start_time)

NB train time:  0.06722784042358398


In [8]:
start_time = time.time()
print("NB score: ", nb.score(x_test[:,:-1], y_test))
end_time = time.time()
print("NB test time: ", end_time - start_time)

NB score:  0.7098181818181818
NB test time:  0.01630377769470215


In [9]:
lr = LogisticRegression(class_weight="balanced")
start_time = time.time()
lr.fit(x_train[:,:-1], y_train)
end_time = time.time()
print("LR train time: ", end_time - start_time)

LR train time:  30.197779893875122


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [10]:
start_time = time.time()
print("LR score: ", lr.score(x_test[:,:-1], y_test))
end_time = time.time()
print("LR test time: ", end_time - start_time)

LR score:  0.7752727272727272
LR test time:  0.03171896934509277


In [11]:
class_map = {}
text = list(text_train)
for i in range(y_train.shape[0]):
    curr_class = y_train[i]
    if curr_class not in class_map: 
        class_map[curr_class] = [text[i]]
    else:
        class_map[curr_class].append(text[i])

In [12]:
start_time = time.time()
cls = ZstdMulticlassClassifier(class_map, 4) # 4 dicts, level 22, min 256
end_time = time.time()
print("Zest train time: ", end_time - start_time)

14 classes;  dictionary sizes: [3393, 7490, 11588, 15686]
Zest train time:  2.2762749195098877


In [13]:
tests = list(text_test)

predicted_class = []

start_time = time.time()

for i in range(len(tests)):
    zest_scores = cls.getClassAffinities(tests[i], sort=False)
    zest_scores.sort(key=lambda x:x[1])
    lr_scores = lr.predict_proba(x_test[i, :-1])[0]
    
    # average of lr score and zest score
    pred_values = [(zest_scores[i][0] + lr_scores[i]) / 2.0 for i in range(len(zest_scores))]
    value = max(pred_values)
    pred_class = pred_values.index(value)
    predicted_class.append(pred_class)

end_time = time.time()
print("Zest + LR test time: ", end_time - start_time)

Zest + LR test time:  11.675457000732422


In [14]:
np.mean(np.array(predicted_class) == list(y_test)) #combined

0.7818181818181819

In [15]:
tests = list(text_test)

predicted_class = []
start_time = time.time()

for t in tests:
    predicted_class.append(cls.getClassAffinities(t)[0][1])

end_time = time.time()
print("Zest test time: ", end_time - start_time)

Zest test time:  4.469358682632446


In [16]:
np.mean(np.array(predicted_class) == list(y_test)) # zest only

0.46836363636363637

In [18]:
tests = list(text_test)

predicted_class = []
start_time = time.time()

for t in tests:
    predicted_class.append(cls.getClassAffinities(t, sort=True, double=False)[0][1])

end_time = time.time()
print("Zest test time: ", end_time - start_time)

Zest test time:  1.2235162258148193


In [19]:
np.mean(np.array(predicted_class) == list(y_test)) # zest only

0.448