In [0]:
!pip install nlp
!pip install tensorflow-text
import nlp
import numpy as np

In [0]:
# Print the id of every dataset reported by nlp.list_datasets().
print([entry.id for entry in nlp.list_datasets()])

In [0]:
# Load the 'civil_comments' dataset through the nlp library. Later cells index
# the result by split name ('train' and 'test').
news_dataset = nlp.load_dataset('civil_comments')

In [0]:
# List the files in the local cache directory for civil_comments 0.9.0.
# NOTE(review): the absolute /root path presumably matches a Colab-style
# environment — confirm before running elsewhere.
!ls /root/.cache/huggingface/datasets/civil_comments/default/0.9.0

In [0]:
# Report how many examples each split holds.
print('training set contains: ', len(news_dataset['train']), ' examples', sep='')
print('test set contains: ', len(news_dataset['test']), ' examples', sep='')

In [0]:
# Show the first training record under a "Positive example." banner.
# NOTE(review): "positive" presumably refers to the record's toxicity value —
# confirm against the printed output.
print('Positive example.')
print(news_dataset['train'][0])

In [0]:
# Show the training record at index 4 under a "Negative example." banner.
# NOTE(review): "negative" presumably refers to the record's toxicity value —
# confirm against the printed output.
print('Negative example.')
print(news_dataset['train'][4])

In [0]:
# Keep only the first 100000 training rows; the slice is read by column name.
# Assumes each column of the slice is a sequence of exactly 100000 entries.
train_dataset = news_dataset['train'][:100000]

# Comment texts and their toxicity values, each as a NumPy array.
train_texts = np.array(train_dataset['text'])
train_labels = np.array(train_dataset['toxicity'])

In [0]:
# Keep only the first 20000 test rows; the slice is read by column name.
# Assumes each column of the slice is a sequence of exactly 20000 entries.
test_dataset = news_dataset['test'][:20000]

# Comment texts and their toxicity values, each as a NumPy array.
test_texts = np.array(test_dataset['text'])
test_labels = np.array(test_dataset['toxicity'])

In [0]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [0]:
# Display the installed TensorFlow version string.
tf.__version__

In [0]:
# Training pipeline: (text, label) pairs, shuffled through a 50000-element buffer.
train_dataset = tf.data.Dataset.from_tensor_slices((train_texts, train_labels))
train_dataset = train_dataset.shuffle(50000)

# Test pipeline: the same pairing, with no shuffling applied.
test_dataset = tf.data.Dataset.from_tensor_slices((test_texts, test_labels))

In [0]:
# Print the first five (text, label) elements yielded by the training pipeline.
for ex in train_dataset.take(5):
  print(ex)

In [0]:
# Print the first two (text, label) elements yielded by the test pipeline.
for ex in test_dataset.take(2):
  print(ex)

In [0]:
# Tokenizer used to split each comment into tokens.
# NOTE(review): newer tensorflow_datasets releases may expose this module as
# `tfds.deprecated.text` — confirm against the installed version.
tokenizer = tfds.features.text.Tokenizer()

# Gather the distinct tokens that occur in either split.
vocab = set()
for split in (train_dataset, test_dataset):
  for text_tensor, _ in split:
    vocab.update(tokenizer.tokenize(text_tensor.numpy()))

# Number of distinct tokens found.
vocab_size = len(vocab)
print(vocab_size)

In [0]:
# Build a text encoder over the collected vocabulary; later cells call its
# encode() method on raw comment text.
encoder = tfds.features.text.TokenTextEncoder(vocab)

In [0]:
# Take the text of the first element yielded by train_dataset and display it.
# train_dataset is shuffled, so the element shown may differ between runs.
example_text = next(iter(train_dataset))[0].numpy()
example_text

In [0]:
# Encode the sample text with the encoder and display the result.
encoder.encode(example_text)

In [0]:
def encode(text_tensor, label):
  """Encode one example's text with the module-level `encoder`.

  Args:
    text_tensor: Object whose `.numpy()` value is passed to `encoder.encode`.
    label: Returned unchanged alongside the encoded text.

  Returns:
    A `(encoded_text, label)` tuple.
  """
  return encoder.encode(text_tensor.numpy()), label

In [0]:
def encode_map_fn(text_tensor, label):
  """Run `encode` through `tf.py_function` and attach static shapes.

  Args:
    text_tensor: Text element handed to `encode`.
    label: Label element handed to `encode`.

  Returns:
    A `(encoded_text, label)` pair of tensors: the first declared as int64
    with shape `[None]`, the second as float64 with shape `[]`.
  """
  outputs = tf.py_function(
      func=encode,
      inp=[text_tensor, label],
      Tout=[tf.int64, tf.float64])
  token_ids, score = outputs

  # The shapes are set explicitly here on the py_function outputs.
  token_ids.set_shape([None])
  score.set_shape([])
  return token_ids, score

In [0]:
# Encode every training example, then group into padded batches of 8.
train_data = train_dataset.map(encode_map_fn)
train_data = train_data.padded_batch(8)

# Same encoding and batching for the test examples.
test_data = test_dataset.map(encode_map_fn)
test_data = test_data.padded_batch(8)

In [0]:
# For the first two training batches, print the first encoded text in the
# batch together with its label.
for ex in train_data.take(2):
  print(ex[0][0], ex[1][0])

In [0]:
# Size the embedding table from the encoder rather than incrementing the token
# count in place. `vocab_size += 1` grew by one each time this cell was re-run,
# and `len(vocab) + 1` leaves no row for the encoder's out-of-vocabulary id.
# `encoder.vocab_size` counts the tokens plus the padding id 0 and the OOV
# bucket, so every id the encoder can emit is a valid embedding index.
vocab_size = encoder.vocab_size

In [0]:
# Start from an empty Sequential model; the following cells append its layers.
model = tf.keras.models.Sequential()

In [0]:
# Token ids -> 64-dimensional embeddings -> bidirectional LSTM with 64 units.
# Re-running this cell appends these layers again; recreate `model` first.
model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64))
model.add(
    tf.keras.layers.Bidirectional(layer=tf.keras.layers.LSTM(units=64)))

In [0]:
# Two ReLU hidden layers of 64 units each.
for units in (64, 64):
  model.add(tf.keras.layers.Dense(units=units, activation='relu'))

# Single output unit with no activation.
model.add(tf.keras.layers.Dense(units=1))

In [0]:
# Train with Adam on mean squared error.
# `metrics` is passed as a list, the documented form, instead of a bare string.
# 'accuracy' is kept so existing history keys do not change, but it says little
# for a regression on float toxicity labels, so mean absolute error is tracked
# alongside it as an interpretable measure.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=['accuracy', tf.keras.metrics.MeanAbsoluteError()],
)

In [0]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [0]:
# Train for 5 epochs on the batched training data, using the batched test data
# as the validation set.
model.fit(train_data, epochs=5, validation_data=test_data)