In [1]:
import pandas as pd

train_ds = pd.read_csv('train.csv')

In [2]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

encoder = LabelEncoder()

train_ds['keyword'] = encoder.fit_transform(train_ds['keyword'])
train_ds['location'] = encoder.fit_transform(train_ds['location'])

scaler = MinMaxScaler()
train_ds['keyword'] = scaler.fit_transform(train_ds[['keyword']])
train_ds['location'] = scaler.fit_transform(train_ds[['location']])

In [3]:
train_ds.drop('id', axis=1, inplace=True)

In [4]:
from sklearn.model_selection import train_test_split

X = train_ds.drop('target', axis=1, inplace=False)
y = train_ds['target'].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [5]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

texts = X['text'].values

text_vectorization = TextVectorization(
    max_tokens=20000,
    output_mode="multi_hot",
)

text_vectorization.adapt(texts)

2025-01-13 11:24:22.434192: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-13 11:24:22.435680: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-13 11:24:22.447043: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-13 11:24:22.465372: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736785462.486412   88904 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736785462.49

In [6]:
X_train_text = text_vectorization(X_train['text'])
X_train_cat = X_train[['keyword', 'location']].values.astype('float32')

X_val_text = text_vectorization(X_val['text'])
X_val_cat = X_val[['keyword', 'location']].values.astype('float32')

X_test_text = text_vectorization(X_test['text'])
X_test_cat = X_test[['keyword', 'location']].values.astype('float32')


In [7]:
X_train_combined = tf.concat([X_train_cat, X_train_text], axis=1)
X_val_combined = tf.concat([X_val_cat, X_val_text], axis=1)
X_test_combined = tf.concat([X_test_cat, X_test_text], axis=1)

y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.int32)
y_val_tensor = tf.convert_to_tensor(y_val, dtype=tf.int32)
y_test_tensor = tf.convert_to_tensor(y_test, dtype=tf.int32)

In [8]:
import tensorflow as tf

binary_1gram_train_ds = tf.data.Dataset.from_tensor_slices((X_train_combined, y_train))

binary_1gram_val_ds = tf.data.Dataset.from_tensor_slices((X_val_combined, y_val))

binary_1gram_test_ds = tf.data.Dataset.from_tensor_slices((X_test_combined, y_test))


In [9]:
batch_size = 32
binary_1gram_train_ds = binary_1gram_train_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
binary_1gram_val_ds = binary_1gram_val_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
binary_1gram_test_ds = binary_1gram_test_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [10]:
from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_tokens=20002, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation="relu")(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="rmsprop",
                  loss="binary_crossentropy",
                  metrics=["accuracy"])
    return model
  
    

In [11]:
model = get_model()
model.summary()

In [12]:
callbacks = [
    keras.callbacks.ModelCheckpoint("binary_1gram.keras",
                                   save_best_only=True)
]
model.fit(
    binary_1gram_train_ds,
    validation_data=binary_1gram_val_ds,
    epochs=10,
    callbacks=callbacks
)
model = keras.models.load_model("binary_1gram.keras")
print(f"Test acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}")

Epoch 1/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.6343 - loss: 0.6624 - val_accuracy: 0.7768 - val_loss: 0.5745
Epoch 2/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7944 - loss: 0.5368 - val_accuracy: 0.7853 - val_loss: 0.5074
Epoch 3/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8196 - loss: 0.4620 - val_accuracy: 0.7905 - val_loss: 0.4771
Epoch 4/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8455 - loss: 0.4054 - val_accuracy: 0.7984 - val_loss: 0.4638
Epoch 5/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8518 - loss: 0.3774 - val_accuracy: 0.7997 - val_loss: 0.4598
Epoch 6/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8677 - loss: 0.3516 - val_accuracy: 0.8050 - val_loss: 0.4589
Epoch 7/10
[1m172/172[0m 