In [14]:
import pandas as pd

train_ds = pd.read_csv('train.csv')

In [22]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

train_ds['keyword'] = encoder.fit_transform(train_ds['keyword'])
train_ds['location'] = encoder.fit_transform(train_ds['location'])

In [23]:
train_ds.drop('id', axis=1, inplace=True)

In [24]:
from sklearn.model_selection import train_test_split

X = train_ds.drop('target', axis=1, inplace=False)
y = train_ds['target'].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [32]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

texts = X['text'].values

text_vectorization = TextVectorization(
    max_tokens=20000,
    output_mode="multi_hot",
)

text_vectorization.adapt(texts)

In [42]:
X_train_text = text_vectorization(X_train['text'])
X_train_cat = X_train[['keyword', 'location']].values.astype('float32')

X_val_text = text_vectorization(X_val['text'])
X_val_cat = X_val[['keyword', 'location']].values.astype('float32')

X_test_text = text_vectorization(X_test['text'])
X_test_cat = X_test[['keyword', 'location']].values.astype('float32')


In [43]:
X_train_combined = tf.concat([X_train_cat, X_train_text], axis=1)
X_val_combined = tf.concat([X_val_cat, X_val_text], axis=1)
X_test_combined = tf.concat([X_test_cat, X_test_text], axis=1)

y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.int32)
y_val_tensor = tf.convert_to_tensor(y_val, dtype=tf.int32)
y_test_tensor = tf.convert_to_tensor(y_test, dtype=tf.int32)

In [45]:
import tensorflow as tf

binary_1gram_train_ds = tf.data.Dataset.from_tensor_slices((X_train_combined, y_train))

binary_1gram_val_ds = tf.data.Dataset.from_tensor_slices((X_val_combined, y_val))

binary_1gram_test_ds = tf.data.Dataset.from_tensor_slices((X_test_combined, y_test))


In [46]:
batch_size = 32
binary_1gram_train_ds = binary_1gram_train_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
binary_1gram_val_ds = binary_1gram_val_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
binary_1gram_test_ds = binary_1gram_test_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [50]:
from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_tokens=20002, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation="relu")(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="rmsprop",
                  loss="binary_crossentropy",
                  metrics=["accuracy"])
    return model
  
    

In [51]:
model = get_model()
model.summary()

In [52]:
callbacks = [
    keras.callbacks.ModelCheckpoint("binary_1gram.keras",
                                   save_best_only=True)
]
model.fit(
    binary_1gram_train_ds,
    validation_data=binary_1gram_val_ds,
    epochs=10,
    callbacks=callbacks
)
model = keras.models.load_model("binary_1gram.keras")
print(f"Test acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}")

Epoch 1/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.5495 - loss: 1.3764 - val_accuracy: 0.5739 - val_loss: 0.6884
Epoch 2/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5713 - loss: 0.7052 - val_accuracy: 0.5739 - val_loss: 0.6859
Epoch 3/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5713 - loss: 0.6980 - val_accuracy: 0.5739 - val_loss: 0.6841
Epoch 4/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5713 - loss: 0.7071 - val_accuracy: 0.5739 - val_loss: 0.6832
Epoch 5/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5713 - loss: 0.6900 - val_accuracy: 0.5739 - val_loss: 0.6831
Epoch 6/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5713 - loss: 0.6859 - val_accuracy: 0.5739 - val_loss: 0.6829
Epoch 7/10
[1m172/172[0m 