In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

2023-06-08 12:42:56.545706: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the token table: CSV column 0 becomes the index, columns 1 and 2 are
# "word" and "tag", both read as generic Python objects.
df = pd.read_csv(
    "tokenized.csv",
    usecols=[0, 1, 2],
    index_col=0,
    dtype={"word": object, "tag": object},
)
# Stringify every word so that non-string cells (e.g. NaN produced for empty
# or NA-like fields) cannot reach the string-only layers later on.
# NOTE(review): pandas' default NA parsing may turn tokens such as "NA" or
# "null" into NaN, which then become the literal "nan" here -- confirm.
df["word"] = df["word"].map(str)

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,word,tag
0,404,
1,Not,
2,Found,
3,–,
4,Huxlo,


In [4]:
# Column dtypes and non-null counts, to see how many rows carry a tag at all.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 67836 entries, 0 to 67835
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   word    67836 non-null  object
 1   tag     1582 non-null   object
dtypes: object(2)
memory usage: 1.6+ MB


In [5]:
# Tag frequencies before missing tags are filled; rows whose tag is missing
# are left out of the counts.
df.value_counts(subset="tag")

tag
I-PROD    797
B-PROD    785
Name: count, dtype: int64

In [6]:
# Every token without an annotation is given the "O" tag.
df["tag"] = df["tag"].fillna("O")

In [7]:
# Tag frequencies again, now including the rows that were filled with "O".
df.value_counts(subset="tag")

tag
O         66254
I-PROD      797
B-PROD      785
Name: count, dtype: int64

In [8]:
# Number of distinct values in each column.
df.nunique()

word    9032
tag        3
dtype: int64

In [9]:
# Preview the frame after the missing tags were filled.
df.head()

Unnamed: 0,word,tag
0,404,O
1,Not,O
2,Found,O
3,–,O
4,Huxlo,O


In [10]:
# Padding token: appended to short sequences and registered as the
# StringLookup mask token.
MASK_TOKEN = "[MASK]"
# Marker value in the "word" column that closes one sequence.
END_SEQUENCE_TOKEN = "END_SEQUENCE"

In [11]:
# Build the vocabulary from every real token. Sequence-separator rows are
# filtered out through the shared END_SEQUENCE_TOKEN constant (rather than a
# duplicated string literal) so the marker never becomes a vocabulary entry
# and the filter cannot drift from the constant.
words = df.loc[df["word"] != END_SEQUENCE_TOKEN, "word"].to_numpy()
words = tf.constant(words)
# max_tokens caps the vocabulary size; MASK_TOKEN is reserved as the mask entry.
string_lookup = tf.keras.layers.StringLookup(max_tokens=10_000, mask_token=MASK_TOKEN)
string_lookup.adapt(words)
# Show the first few vocabulary entries as a sanity check.
string_lookup.get_vocabulary()[:10]

2023-06-08 12:42:59.780446: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-08 12:42:59.816418: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-08 12:42:59.816970: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

['[MASK]', '[UNK]', '-', '.', ',', ' ', '\\/', 'x', '&', 'to']

In [12]:
# Token tagger: raw strings -> vocabulary ids -> embeddings -> LSTM -> per-token scores.
model = tf.keras.Sequential([
    # Variable-length batches of raw string tokens.
    tf.keras.Input(shape=(None,), dtype=tf.string),
    # Maps each token to its vocabulary id; MASK_TOKEN maps to id 0.
    string_lookup,
    # mask_zero=True makes id 0 (the padding token) a masked timestep downstream.
    tf.keras.layers.Embedding(input_dim=string_lookup.vocabulary_size(), output_dim=64, mask_zero=True),
    # return_sequences=True keeps one output vector per token.
    tf.keras.layers.LSTM(64, return_sequences=True),
    # One score per tag class for every token. The layer is deliberately linear:
    # the loss is configured with from_logits=True, so it expects unbounded raw
    # scores. A ReLU here would clamp every logit to >= 0 and can leave units
    # with zero gradient.
    tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(3)
    )
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 string_lookup (StringLookup  (None, None)             0         
 )                                                               
                                                                 
 embedding (Embedding)       (None, None, 64)          578112    
                                                                 
 lstm (LSTM)                 (None, None, 64)          33024     
                                                                 
 time_distributed (TimeDistr  (None, None, 3)          195       
 ibuted)                                                         
                                                                 
Total params: 611,331
Trainable params: 611,331
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Print the model's input/output tensor signatures and each layer's input
# shape and dtype. Plain loops are used instead of list comprehensions so no
# throwaway list of None values is built or echoed as the cell's output.
for input_tensor in model.inputs:
    print(input_tensor.shape, input_tensor.dtype)
for output_tensor in model.outputs:
    print(output_tensor.shape, output_tensor.dtype)
for layer in model.layers:
    print(layer.name, layer.input_shape, layer.dtype)

(None, None) <dtype: 'string'>
(None, None, 3) <dtype: 'float32'>
string_lookup (None, None) int64
embedding (None, None) float32
lstm (None, None, 64) float32
time_distributed (None, None, 64) float32


[None, None, None, None]

In [14]:
def get_unique_tags_lookup():
    """Map every distinct tag in ``df["tag"]`` to an integer id.

    Ids are assigned in the order in which the tags first appear in the
    column, starting at 0.

    Returns:
        dict: tag value -> integer id.
    """
    lookup = {}
    for position, label in enumerate(df["tag"].unique()):
        lookup[label] = position
    return lookup

def get_train_data():
    """Turn the flat token table ``df`` into padded training tensors.

    Rows are grouped into sequences; a row whose word equals
    END_SEQUENCE_TOKEN closes the current sequence and is not itself part of
    it. Tokens that follow the last marker are kept as a final sequence
    instead of being silently dropped. All sequences are right-padded to the
    length of the longest one, using MASK_TOKEN for words and the id of the
    "O" tag for labels.

    Returns:
        tuple: ``(words, tags)`` where ``words`` is a string tensor and
        ``tags`` a float32 tensor of tag ids, both shaped
        (number of sequences, longest sequence length).

    Raises:
        ValueError: if ``df`` yields no sequence at all.
        KeyError: if padding is required but ``df`` contains no "O" tag.
    """
    tags_lookup = get_unique_tags_lookup()

    # Split the dataframe into sequences.
    word_sequences = []
    tag_sequences = []
    word_sequence = []
    tag_sequence = []
    # Iterate the two columns directly: the index is not needed and this keeps
    # working if df ever gains additional columns.
    for word, tag in zip(df["word"], df["tag"]):
        if word == END_SEQUENCE_TOKEN:
            word_sequences.append(word_sequence)
            tag_sequences.append(tag_sequence)
            word_sequence, tag_sequence = [], []
        else:
            word_sequence.append(str(word))
            tag_sequence.append(tags_lookup[tag])
    # Keep trailing tokens that were not closed by an END_SEQUENCE_TOKEN row.
    if word_sequence:
        word_sequences.append(word_sequence)
        tag_sequences.append(tag_sequence)

    if not word_sequences:
        raise ValueError("df contains no sequences; cannot build training data")

    # Right-pad every sequence to the maximum length.
    max_sequence_length = max(len(sequence) for sequence in word_sequences)
    for padded_words, padded_tags in zip(word_sequences, tag_sequences):
        missing = max_sequence_length - len(padded_words)
        if missing:
            padded_words.extend([MASK_TOKEN] * missing)
            padded_tags.extend([tags_lookup["O"]] * missing)

    return tf.constant(word_sequences, dtype=object), tf.constant(tag_sequences, dtype=tf.float32)

# Build the padded word/tag tensors once and report their shapes and dtypes.
x_train, y_train = get_train_data()
print(f"x_train: shape={x_train.shape}, dtype={x_train.dtype}")
print(f"y_train: shape={y_train.shape}, dtype={y_train.dtype}")

x_train: shape=(98, 8864), dtype=<dtype: 'string'>
y_train: shape=(98, 8864), dtype=<dtype: 'float32'>


In [15]:
# Configure training: Adam optimiser with integer-label cross-entropy.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    # from_logits=True: the loss treats the model output as raw, unnormalised
    # scores, so the final layer must not squash or clip them.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
)

In [None]:
# Train on the full padded data set; no validation data is passed.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=7
)

Epoch 1/7


2023-06-08 12:43:07.920359: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600


In [None]:
# Smoke-test inference on the first two training sequences. The result is the
# output of the final Dense(3) layer for every token, not softmax-normalised
# probabilities.
model.predict(x_train[:2])