In [1]:
import tensorflow
from tensorflow import keras

print("Keras Version : {}".format(keras.__version__))

Keras Version : 2.11.0


In [2]:
import numpy as np
from sklearn import datasets
import gc

all_categories = ['alt.atheism','comp.graphics','comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware',
                  'comp.sys.mac.hardware','comp.windows.x', 'misc.forsale','rec.autos','rec.motorcycles',
                  'rec.sport.baseball','rec.sport.hockey','sci.crypt','sci.electronics','sci.med',
                  'sci.space','soc.religion.christian','talk.politics.guns','talk.politics.mideast',
                  'talk.politics.misc','talk.religion.misc']

selected_categories = ['alt.atheism','comp.graphics','rec.sport.hockey','sci.space','talk.politics.misc']

X_train, Y_train = datasets.fetch_20newsgroups(subset="train", categories=selected_categories, return_X_y=True)
X_test , Y_test  = datasets.fetch_20newsgroups(subset="test", categories=selected_categories, return_X_y=True)

X_train = np.array(X_train)
X_test = np.array(X_test)

classes = np.unique(Y_train)
mapping = dict(zip(classes, selected_categories))

len(X_train), len(X_test), classes, mapping

(2722,
 1811,
 array([0, 1, 2, 3, 4], dtype=int64),
 {0: 'alt.atheism',
  1: 'comp.graphics',
  2: 'rec.sport.hockey',
  3: 'sci.space',
  4: 'talk.politics.misc'})

In [5]:
text_vectorizer = keras.layers.TextVectorization(max_tokens=None, standardize="lower_and_strip_punctuation",
                                                 split="whitespace", output_mode="count")

text_vectorizer

<keras.layers.preprocessing.text_vectorization.TextVectorization at 0x16db7550070>

In [6]:
text_vectorizer.adapt(X_train, batch_size=512)

gc.collect()

725

In [7]:
vocab = text_vectorizer.get_vocabulary()
print("Vocab : {}".format(vocab[:10]))
print("Vocab Size : {}".format(text_vectorizer.vocabulary_size()))

out = text_vectorizer(X_train[:5])
print("Output Shape : {}".format(out.shape))

out

Vocab : ['[UNK]', 'the', 'to', 'of', 'a', 'and', 'in', 'is', 'that', 'i']
Vocab Size : 47345
Output Shape : (5, 47345)


<tf.Tensor: shape=(5, 47345), dtype=float32, numpy=
array([[  0.,   6.,   2., ...,   0.,   0.,   0.],
       [  0.,  15.,  11., ...,   0.,   0.,   0.],
       [  0.,   2.,   2., ...,   0.,   0.,   0.],
       [  0.,  36.,  31., ...,   0.,   0.,   0.],
       [  0., 145.,  66., ...,   0.,   0.,   0.]], dtype=float32)>

In [8]:
text_vectorizer = keras.layers.TextVectorization(max_tokens=50000, standardize="lower_and_strip_punctuation",
                                                 split="whitespace", output_mode="count", pad_to_max_tokens=True)

text_vectorizer.adapt(np.concatenate((X_train, X_test)), batch_size=512)

vocab = text_vectorizer.get_vocabulary()
print("Vocab : {}".format(vocab[:10]))
print("Vocab Size : {}".format(text_vectorizer.vocabulary_size()))
out = text_vectorizer(X_train[:5])
print("Output Shape : {}".format(out.shape))

Vocab : ['[UNK]', 'the', 'to', 'of', 'a', 'and', 'in', 'is', 'that', 'i']
Vocab Size : 50000
Output Shape : (5, 50000)


In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

def create_model(text_vectorizer):
    return Sequential([
                        layers.Input(shape=(1,), dtype="string"),
                        text_vectorizer,
                        #layers.Dense(256, activation="relu"),
                        layers.Dense(128, activation="relu"),
                        layers.Dense(64, activation="relu"),
                        layers.Dense(len(classes), activation="softmax"),
                    ])

model = create_model(text_vectorizer)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, 50000)            0         
 ectorization)                                                   
                                                                 
 dense (Dense)               (None, 128)               6400128   
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 5)                 325       
                                                                 
Total params: 6,408,709
Trainable params: 6,408,709
Non-trainable params: 0
_________________________________________________________________


In [10]:
model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])

In [11]:
history = model.fit(X_train, Y_train, batch_size=256, epochs=10, validation_data=(X_test, Y_test))
gc.collect()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


1673

In [13]:
print(X_train)
print(Y_train)

["From: markus@octavia.anu.edu.au (Markus Buchhorn)\nSubject: HDF readers/viewers\nOrganization: Australian National University, Canberra\nLines: 33\nDistribution: world\nNNTP-Posting-Host: 150.203.5.35\nOriginator: markus@octavia\n\n\n\nG'day all,\n\nCan anybody point me at a utility which will read/convert/crop/whatnot/\ndisplay HDF image files ? I've had a look at the HDF stuff under NCSA \nand it must take an award for odd directory structure, strange storage\napproaches and minimalist documentation :-)\n\nPart of the problem is that I want to look at large (5MB+) HDF files and\ncrop out a section. Ideally I would like a hdftoppm type of utility, from\nwhich I can then use the PBMplus stuff quite merrily. I can convert the cropped\npart into another format for viewing/animation.\n\nOtherwise, can someone please explain how to set up the NCSA Visualisation S/W\nfor HDF (3.2.r5 or 3.3beta) and do the above cropping/etc. This is for\nSuns with SunOS 4.1.2.\n\nAny help GREATLY apprecia

In [12]:
from sklearn.metrics import accuracy_score, classification_report

train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

print("Train Accuracy : {}".format(accuracy_score(Y_train, np.argmax(train_preds, axis=1))))
print("Test  Accuracy : {}".format(accuracy_score(Y_test, np.argmax(test_preds, axis=1))))
print("\nClassification Report : ")
print(classification_report(Y_test, np.argmax(test_preds, axis=1), target_names=selected_categories))

Train Accuracy : 1.0
Test  Accuracy : 0.9458862506902264

Classification Report : 
                    precision    recall  f1-score   support

       alt.atheism       0.95      0.92      0.93       319
     comp.graphics       0.95      0.95      0.95       389
  rec.sport.hockey       0.97      0.98      0.98       399
         sci.space       0.92      0.94      0.93       394
talk.politics.misc       0.94      0.93      0.93       310

          accuracy                           0.95      1811
         macro avg       0.95      0.94      0.94      1811
      weighted avg       0.95      0.95      0.95      1811

