In [43]:
import os
import re
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import vowpalwabbit

**Loading https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html dataset for 20 class classifcation problem**

In [44]:
!mkdir -p RL-Transformer

newsgroups= fetch_20newsgroups()

all_documents = newsgroups["data"]
topic_encoder = LabelEncoder()
all_targets_mult = topic_encoder.fit_transform(newsgroups["target"]) + 1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [45]:
#!pip install sentence-transformers
from sentence_transformers import SentenceTransformer

In [46]:
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [47]:
query = "I had pizza and pasta"
query_vec = sbert_model.encode([query])[0]
#print(query_vec)
print(len(query_vec))

768


**Converting Text to VW format**

In [48]:
def to_vw_format_vector(text, label=None):
    vector = sbert_model.encode([text])[0]
    data = str(label or "") + " | "
    for i in vector:
        data+=str(i)+" "
    return data+"\n"

def to_vw_format(document, label=None):
    return (
        str(label or "")
        + " |text "
        + " ".join(re.findall("\w{3,}", document.lower()))
        + "\n"
    )

In [49]:
PATH_TO_WRITE_DATA="./RL-Transformer"

**Make Test and Train split and divide convert all the data to vw format**

In [51]:
train_documents, test_documents, train_labels, test_labels = train_test_split(
    all_documents[:10000], all_targets_mult[:10000], random_state=7
)

with open(os.path.join(PATH_TO_WRITE_DATA, "20news_train_mult.vw"), "w") as vw_train_data:
    for text, target in zip(train_documents, train_labels):
        vw_train_data.write(to_vw_format_vector(text, target))
with open(os.path.join(PATH_TO_WRITE_DATA, "20news_test_mult.vw"), "w") as vw_test_data:
    for text in test_documents:
        vw_test_data.write(to_vw_format_vector(text))

**Training**

In [52]:
#!vw -d RL/20news_train.vw  --loss_function hinge -f RL/20news_model.vw
!vw --oaa 20 ./RL-Transformer/20news_train_mult.vw -f ./RL-Transformer/20news_model_mult.vw --loss_function=hinge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
final_regressor = ./RL-Transformer/20news_model_mult.vw
using no cache
Reading datafile = ./RL-Transformer/20news_train_mult.vw
num sources = 1
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
Enabled learners: gd, scorer-identity, oaa
Input label = MULTICLASS
Output pred = MULTICLASS
average  since         example        example        current        current  current
loss     last          counter         weight          label        predict features
[32m[info][m label 20 found -- labels are now considered 1-indexed.
1.000000 1.000000            1            1.0             20              1      769
1.000000 1.000000            2            2.0             13             20      769
1

**Inference**

In [53]:
!vw -i RL-Transformer/20news_model_mult.vw -t -d RL-Transformer/20news_test_mult.vw -p RL-Transformer/20news_test_predictions_mult.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
only testing
predictions = RL-Transformer/20news_test_predictions_mult.txt
using no cache
Reading datafile = RL-Transformer/20news_test_mult.vw
num sources = 1
Num weight bits = 18
learning rate = 0.5
initial_t = 7500
power_t = 0.5
Enabled learners: gd, scorer-identity, oaa
Input label = MULTICLASS
Output pred = MULTICLASS
average  since         example        example        current        current  current
loss     last          counter         weight          label        predict features
n.a.     n.a.                1            1.0        unknown             12      769
n.a.     n.a.                2            2.0        unknown             19      769
n.a.     n.a.                4            4.0       

**Results**

In [54]:
with open(os.path.join(PATH_TO_WRITE_DATA, "20news_test_predictions_mult.txt")) as pred_file:
    test_prediction_mult = [float(label) for label in pred_file.readlines()]

In [55]:
print(accuracy_score(test_labels, test_prediction_mult))

0.0492


In [56]:
M = confusion_matrix(test_labels, test_prediction_mult)
for i in np.where(M[0, :] > 0)[0][1:]:
    print(newsgroups["target_names"][i], M[0, i])

comp.graphics 7
comp.os.ms-windows.misc 5
comp.sys.ibm.pc.hardware 2
comp.sys.mac.hardware 4
comp.windows.x 9
misc.forsale 5
rec.autos 1
rec.motorcycles 5
rec.sport.baseball 3
rec.sport.hockey 3
sci.crypt 7
sci.electronics 17
sci.med 2
sci.space 2
soc.religion.christian 8
talk.politics.guns 2
talk.politics.mideast 10
talk.politics.misc 3
talk.religion.misc 1
