In [1]:
import pandas as pd

train_ds = pd.read_csv('train.csv')

In [2]:
train_ds.drop('id', axis=1, inplace=True)

In [3]:
from sklearn.model_selection import train_test_split

texts = train_ds['text'].values
labels = train_ds['target'].values

X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [4]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

text_vectorization = TextVectorization(
    max_tokens=20000,
    output_mode="multi_hot",
)

text_vectorization.adapt(texts)

dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))

dataset_val = tf.data.Dataset.from_tensor_slices((X_val, y_val))

dataset_test = tf.data.Dataset.from_tensor_slices((X_test, y_test))

binary_1gram_train_ds = dataset.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=tf.data.AUTOTUNE)

binary_1gram_val_ds = dataset.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=tf.data.AUTOTUNE)

binary_1gram_test_ds = dataset_test.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=tf.data.AUTOTUNE)


2025-01-13 09:19:23.724058: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-13 09:19:23.724528: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-13 09:19:23.726740: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-13 09:19:23.733636: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736777963.744818   62475 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736777963.74

In [5]:
batch_size = 32
binary_1gram_train_ds = binary_1gram_train_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
binary_1gram_val_ds = binary_1gram_val_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
binary_1gram_test_ds = binary_1gram_test_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [6]:
from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_tokens=20000, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation="relu")(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="rmsprop",
                  loss="binary_crossentropy",
                  metrics=["accuracy"])
    return model
  
    

In [7]:
model = get_model()
model.summary()

In [8]:
callbacks = [
    keras.callbacks.ModelCheckpoint("binary_1gram.keras",
                                   save_best_only=True)
]
model.fit(
    binary_1gram_train_ds,
    validation_data=binary_1gram_val_ds,
    epochs=10,
    callbacks=callbacks
)
model = keras.models.load_model("binary_1gram.keras")
print(f"Test acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}")

Epoch 1/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.6355 - loss: 0.6562 - val_accuracy: 0.8126 - val_loss: 0.5338
Epoch 2/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7879 - loss: 0.5297 - val_accuracy: 0.8460 - val_loss: 0.4379
Epoch 3/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8294 - loss: 0.4460 - val_accuracy: 0.8639 - val_loss: 0.3780
Epoch 4/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8461 - loss: 0.4047 - val_accuracy: 0.8774 - val_loss: 0.3377
Epoch 5/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8651 - loss: 0.3580 - val_accuracy: 0.8869 - val_loss: 0.3088
Epoch 6/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8697 - loss: 0.3412 - val_accuracy: 0.8935 - val_loss: 0.2873
Epoch 7/10
[1m172/172[0m 