In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, losses
import os
from tensorflow.keras import preprocessing

In [2]:
# Root directory of the dataset; it is expected to contain `train` and `test`
# sub-directories. Set the STACK_OVERFLOW_DIR environment variable to point at a
# local copy; otherwise the location this notebook was originally run against is used.
base_dir = os.environ.get('STACK_OVERFLOW_DIR', '/Users/phoelapyae/Downloads/stack_overflow_16k')
train_dir = os.path.join(base_dir, 'train')
test_dir = os.path.join(base_dir, 'test')

In [3]:
# Notebook-wide settings.
batch_size = 32  # number of examples per dataset batch
seed = 42        # random seed value; same number the train/validation split is seeded with

In [4]:
# Carve the training directory into an 80/20 training/validation split.
# Both calls have to receive the same seed and validation_split, otherwise the
# two subsets can overlap; the shared arguments are therefore defined once and
# the notebook-level `seed` constant is used instead of a repeated literal.
split_kwargs = dict(validation_split=0.2, seed=seed, batch_size=batch_size)
raw_train_ds = preprocessing.text_dataset_from_directory(train_dir, subset='training', **split_kwargs)
raw_val_ds = preprocessing.text_dataset_from_directory(train_dir, subset='validation', **split_kwargs)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.
Found 8000 files belonging to 4 classes.
Using 1600 files for validation.


In [5]:
# Load the test directory as-is (no validation split is taken from it).
raw_test_ds = preprocessing.text_dataset_from_directory(
    directory=test_dir,
    batch_size=batch_size,
)

Found 8000 files belonging to 4 classes.


In [6]:
import re, string

def custom_standardize(text):
    """Lowercase text, turn HTML line-break tags into spaces, and strip punctuation.

    Args:
        text: String tensor to clean (this function is passed to the
            TextVectorization layer as its ``standardize`` callable).

    Returns:
        String tensor with lowercase text, break tags replaced by a single
        space and every character in ``string.punctuation`` removed.
    """
    lowered = tf.strings.lower(text)
    # Match `<br>`, `<br/>`, `<br />` and `<br >` so a break tag becomes a space
    # instead of surviving as a stray "br" token once punctuation is stripped.
    without_breaks = tf.strings.regex_replace(lowered, r'<br\s*/?>', ' ')
    # Character class containing every ASCII punctuation mark, regex-escaped.
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    return tf.strings.regex_replace(without_breaks, punctuation_class, '')

In [7]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Vocabulary cap and fixed output length used by the vectorizer.
max_features = 10000
maxlen = 300

# Text -> integer-id layer; cleanup is delegated to custom_standardize.
vectorize_layer = TextVectorization(
    standardize=custom_standardize,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=maxlen,
)

In [8]:
# Vocabulary size before the layer has been adapted to any data.
vocab_before_adapt = vectorize_layer.get_vocabulary()
len(vocab_before_adapt)

0

In [9]:
# Drop the labels so the vectorizer builds its vocabulary from the text only.
train_data = raw_train_ds.map(lambda text, label: text)
vectorize_layer.adapt(train_data)

In [10]:
# Vocabulary size once the layer has been adapted to the training text.
vocab_after_adapt = vectorize_layer.get_vocabulary()
len(vocab_after_adapt)

10000

In [11]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache both splits and prefetch upcoming batches, letting tf.data pick the
# prefetch buffer size.
raw_train_ds, raw_val_ds = (
    split.cache().prefetch(buffer_size=AUTOTUNE)
    for split in (raw_train_ds, raw_val_ds)
)

In [12]:
# Classifier that takes raw strings: vectorize -> embed -> average -> dense -> 4-way softmax.
model = models.Sequential()
model.add(vectorize_layer)
model.add(layers.Embedding(max_features + 1, 16))
model.add(layers.Dropout(0.2))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(4, activation='softmax'))

# Labels are integer class ids and the last layer already applies softmax,
# so the sparse categorical cross-entropy loss is used with its defaults.
model.compile(
    loss=losses.SparseCategoricalCrossentropy(),
    optimizer=optimizers.Adam(),
    metrics=['acc'],
)

In [13]:
# Train for 20 epochs, evaluating on the validation split alongside training.
history = model.fit(
    raw_train_ds,
    validation_data=raw_val_ds,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [14]:
# Evaluate on the test split; the displayed list is [loss, acc].
model.evaluate(x=raw_test_ds)



[0.6239853501319885, 0.7771250009536743]

In [25]:
# Pull a single batch from the test set and inspect one example from it.
first_batch = next(iter(raw_test_ds))
texts, labels = first_batch
# NOTE: despite the variable names, this reads the element at index 1 of the batch.
first_question = texts[1]
first_label = labels[1]
print(first_question)
print(first_label)

tf.Tensor(b'"hacking select multiple in blank, flicke i\'ve hacked html\'s &lt; select multiple > with blank, according to my customer\'s specifications:...clicking an item only toggles that item\'s selected status..other selected items stay selected....the little bit of blank remembers all selected values..when the user clicks, only the option he clicks will be selected..the blank selects the options he remembers...however, it causes a flicker effect. i doubt there is any solution for this, but i felt i had to ask just in case, does anyone know of a way to delay the rendering, or any other solution to accomplish this, without a flicker?..best regards...edit: here is the code..var choices=new array();.function prepmulti(){.    var m=document.queryselectorall(\'select\');.    for(var i=0;i&lt;m.length;i++).        if(m[i].id!=\'\'){.            m[i].onclick=toggle;.            choices.push(new array());.        }.}.function toggle(){.    var sel, x;.    for(var i=0; i&lt;this.options.le

In [26]:
import numpy as np

# Add an axis so the single example is passed to the model as a batch of one,
# then print the index of the largest predicted class score.
single_example = tf.expand_dims(first_question, -1)
predict = model.predict(single_example)
print(np.argmax(predict))

2
