# TensorFlow Dataset (sentiment140)

[Catalog for sentiment140](https://www.tensorflow.org/datasets/catalog/sentiment140)

In [1]:
# https://www.tensorflow.org/datasets/overview?hl=zh-tw

import tensorflow as tf

# Short alias so later cells can write `keras.layers...` instead of `tf.keras.layers...`.
keras = tf.keras
# Show the TensorFlow version this notebook was executed with.
tf.__version__

'2.5.0'

# Prepare Data

In [2]:
import tensorflow_datasets as tfds

# https://www.tensorflow.org/datasets/splits?hl=zh-tw
# https://www.tensorflow.org/datasets/overview?hl=zh-tw#as_tuple_as_supervisedtrue

# A single load with a list of split specs returns the datasets in list order:
#   train[0:5%]  -> training set
#   train[5%:6%] -> validation set
#   train[-20:]  -> small hold-out used for the prediction demo at the end
# as_supervised = True makes every element a (text, label) tuple.
train_ds, valid_ds, test_ds = tfds.load(
    'sentiment140',
    split = ['train[0:5%]', 'train[5%:6%]', 'train[-20:]'],
    as_supervised = True,
)

In [3]:
# Display the dataset's repr to check element shapes and dtypes.
train_ds

<PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int32)>

In [4]:
# https://www.tensorflow.org/api_docs/python/tf/data/Dataset#take
# Peek at one example: the raw tensors first, then the tweet decoded to a Python str.
for text, label in train_ds.take(1):
    print(text, label, sep = '\n')
    print(text.numpy().decode('utf-8'))

tf.Tensor(b"i'm 10x cooler than all of you! ", shape=(), dtype=string)
tf.Tensor(4, shape=(), dtype=int32)
i'm 10x cooler than all of you! 


In [5]:
# Materialise every training tweet as a Python string (labels are ignored here).
lines = [raw.decode('utf-8') for raw, _ in train_ds.as_numpy_iterator()]
lines[0]

"i'm 10x cooler than all of you! "

In [6]:
# Distinct space-delimited tokens across all training tweets. The split is on a
# single space only, so case, punctuation and the empty string are all kept.
word_set = {w for l in lines for w in l.split(' ')}

VOCAB_SIZE = len(word_set)
VOCAB_SIZE

146196

In [7]:
# Print the first ten entries of the set (set order is arbitrary).
for index, w in zip(range(10), word_set):
    print(f'{index:3}: {w}')

  0: 
  1: fruits
  2: whose
  3: terrified.
  4: @therealpharrell
  5: kum
  6: workplace
  7: @SheriSalata,
  8: @mattskint
  9: twitter...


In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit a word-level tokenizer on the training tweets; words it has not seen map to '<OOV>'.
tokenizer = Tokenizer(num_words = VOCAB_SIZE, oov_token = '<OOV>')
tokenizer.fit_on_texts(lines)
word_index = tokenizer.word_index

# Show the first five (word, index) pairs of the fitted vocabulary.
for a, b in list(word_index.items())[:5]:
    print(a, b)

<OOV> 1
i 2
to 3
the 4
a 5


In [9]:
# Scan the training set once to find the longest tokenized tweet (`maxlen`)
# and count how many examples were visited (`cnt`).
maxlen = 0
cnt = 0

# Dataset.batch keeps a final, smaller batch unless drop_remainder = True is
# passed, so no example is skipped here. With a batch size of 1, `cnt` is
# simply the number of examples.
for text_batch, _ in train_ds.batch(1):
    texts = [text.numpy().decode('utf-8') for text in text_batch]
    sequences = tokenizer.texts_to_sequences(texts)
    # default = 0 keeps the running maximum valid even if a batch yields no sequences.
    maxlen = max(maxlen, max(map(len, sequences), default = 0))
    cnt += 1

print(cnt)
maxlen

80000


36

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def tokenize_and_pad(text_batch, label_batch):
    """Turn a batch of string tensors into a padded array of token ids.

    Each element of `text_batch` is decoded from UTF-8, converted to a sequence
    of ids with the notebook-level `tokenizer`, then post-padded / post-truncated
    to the notebook-level `maxlen`.

    Args:
        text_batch: Iterable of scalar string tensors (each must support `.numpy()`).
        label_batch: Labels belonging to `text_batch`; returned untouched.

    Returns:
        A `(padded, label_batch)` tuple, where `padded` is the result of `pad_sequences`.
    """
    decoded = [s.numpy().decode('utf-8') for s in text_batch]
    padded = pad_sequences(
        tokenizer.texts_to_sequences(decoded),
        maxlen = maxlen,
        padding = 'post',
        truncating = 'post',
    )
    return padded, label_batch

# Try the helper on one batch of two tweets and show the ids, labels and shapes.
for text_batch, label_batch in train_ds.batch(2).take(1):
    text_batch, label_batch = tokenize_and_pad(text_batch, label_batch)
    print(text_batch, end = '\n\n\n')
    print(label_batch, end = '\n\n\n\n')
    print(text_batch.shape, label_batch.shape)

[[   20 11880  3026   199    32    13     8     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0]
 [  321  4307   282   686     2   174   374   444   149    14   103     2
     17  1772    13   149     3  2851     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0]]


tf.Tensor([4 0], shape=(2,), dtype=int32)



(2, 36) (2,)


In [11]:
import numpy as np

def get_dataset_from_original_ds(ds, batch_size = 32, max_index = None):
    """Tokenize and pad every example of `ds`, then re-batch it for training.

    The input is walked eagerly in intermediate batches of two examples, each
    batch goes through `tokenize_and_pad`, and the results are concatenated
    into in-memory arrays that back the returned dataset.

    Args:
        ds: Dataset yielding `(text, label)` pairs in the form `tokenize_and_pad` accepts.
        batch_size: Batch size of the returned dataset.
        max_index: If not None, stop after this many intermediate batches,
            i.e. keep at most `2 * max_index` examples. Handy for quick demos.

    Returns:
        A `tf.data.Dataset` of `(padded_token_ids, labels)` batches with `prefetch(1)`.

    Raises:
        ValueError: Raised by `np.concatenate` when no batch was collected
            (empty `ds`, or `max_index <= 0`).
    """
    # Size of the intermediate batches. `max_index` counts these batches, so
    # changing this value changes what `max_index` means.
    intermediate_batch_size = 2

    text_batch_slices = []
    label_batch_slices = []
    for index, (text_batch, label_batch) in enumerate(ds.batch(intermediate_batch_size)):
        if max_index is not None and index >= max_index:
            break
        text_batch, label_batch = tokenize_and_pad(text_batch, label_batch)
        text_batch_slices.append(text_batch)
        label_batch_slices.append(label_batch)
    text_slices = np.concatenate(text_batch_slices)
    label_slices = np.concatenate(label_batch_slices)

    # https://stackoverflow.com/questions/65230657/tensorflow-how-to-create-a-dataset-which-is-an-array-of-tuples
    return tf.data.Dataset.from_tensor_slices((text_slices, label_slices)).batch(batch_size).prefetch(1)

# Build a tiny dataset (2 intermediate batches = 4 examples) and inspect it.
ds = get_dataset_from_original_ds(train_ds, max_index = 2)
sample_x = None
# `sample_x` is the loop target, so it ends up holding the last batch of inputs.
for sample_x, y in ds:
    print(sample_x.shape, ' -> ', y.shape)
    print(sample_x, ' -> ', y, end = '\n\n')

print(sample_x.numpy().shape)
sample_x.numpy()

(4, 36)  ->  (4,)
tf.Tensor(
[[   20 11880  3026   199    32    13     8     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0]
 [  321  4307   282   686     2   174   374   444   149    14   103     2
     17  1772    13   149     3  2851     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0]
 [   56     5   305    31    25     3    49     3     6   173   434     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0]
 [24058    66     2    26    21   754     3 24059   313   378    60   233
    995     3    58    26    11 15380  3027     8  4521     6   897     0
      0     0     0     0     0     0     0     0     0     0     0     0]], shape=(4, 36), dtype=int32)  ->  tf.Tensor([4 0 4 4], shape=(4,), dtype=int32

array([[   20, 11880,  3026,   199,    32,    13,     8,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0],
       [  321,  4307,   282,   686,     2,   174,   374,   444,   149,
           14,   103,     2,    17,  1772,    13,   149,     3,  2851,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0],
       [   56,     5,   305,    31,    25,     3,    49,     3,     6,
          173,   434,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0],
       [24058,    66,     2,    26,    21,   754,     3, 24059,   313,
          378,    60,   233,   995,     3,    58,    26,    11, 15380,
   

In [12]:
# Convert the raw train / validation splits into padded, batched datasets.
final_train_ds, final_valid_ds = map(get_dataset_from_original_ds, (train_ds, valid_ds))
final_train_ds.element_spec

(TensorSpec(shape=(None, 36), dtype=tf.int32, name=None),
 TensorSpec(shape=(None,), dtype=tf.int32, name=None))

# Build Model

In [13]:
# Seed TensorFlow and NumPy before any layer is created so weight
# initialisation is repeatable between runs.
tf.random.set_seed(1)
np.random.seed(1)
# Embedding -> Flatten -> small dense classifier.
#   * Embedding: VOCAB_SIZE rows of 16-dimensional vectors, for inputs of length maxlen.
#   * Final Dense(5): raw logits. The labels seen in the sample outputs above are
#     0 and 4, and sparse categorical cross-entropy uses the label as a class
#     index, so the layer needs at least 5 outputs.
model = keras.Sequential([
    keras.layers.Embedding(VOCAB_SIZE, 16, input_length = maxlen),
    keras.layers.Flatten(),
    keras.layers.Dense(6, activation = 'relu'),
    keras.layers.Dense(5),
])

# from_logits = True because the last layer has no softmax activation.
model.compile(
    optimizer = 'adam',
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits = True),
    metrics = ['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 36, 16)            2339136   
_________________________________________________________________
flatten (Flatten)            (None, 576)               0         
_________________________________________________________________
dense (Dense)                (None, 6)                 3462      
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 35        
Total params: 2,342,633
Trainable params: 2,342,633
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Stop once the monitored metric has not improved for 5 epochs, and roll the
# model back to the best weights seen. `epochs` is therefore only an upper bound.
early_stopping = keras.callbacks.EarlyStopping(patience = 5, restore_best_weights = True)

history = model.fit(
    final_train_ds,
    validation_data = final_valid_ds,
    epochs = 500,
    callbacks = [early_stopping],
    # add `verbose = 0` here to silence the per-epoch log
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500


# Predict on Test Data

In [15]:
# Tokenize, pad and batch the hold-out tweets the same way as the training data.
final_test_ds = get_dataset_from_original_ds(test_ds)

In [16]:
# The model outputs one row of logits per example; the predicted class is the
# index of the largest logit. A single vectorised argmax over axis 1 replaces
# the per-row Python map, and .tolist() keeps `predicted` a plain list of ints.
predicted = np.argmax(model.predict(final_test_ds), axis = 1).tolist()
print(predicted)

[4, 0, 0, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4]


In [17]:
import pandas as pd

xs, ys = [], []

# Collect text and label in the same pass so each tweet stays paired with its own label.
for x, y in test_ds:
    xs.append(x.numpy().decode('utf-8'))
    ys.append(y.numpy())

real = np.array(ys)

df = pd.DataFrame({'real': real, 'predicted': predicted, 'text': xs})
# https://stackoverflow.com/questions/25351968/how-can-i-display-full-non-truncated-dataframe-information-in-html-when-conver
# `None` means "no column-width limit". The older `-1` spelling is deprecated
# since pandas 1.0. The option is scoped to this display call, so the global
# pandas options stay untouched.
with pd.option_context('display.max_colwidth', None):
    display(df)

Unnamed: 0,real,predicted,text
0,4,4,@allygodinez How was the meet &amp; greet?
1,0,0,off to the malll ; i really dont want to do swimming in gym tomorrow
2,0,0,chemistry revision is absolute balls
3,4,4,heading out to CA where the waether looks sunny
4,4,4,Ate subway yum
5,4,4,@NKAirplay That gives me visions of a Wahlberg sandwich.
6,4,4,@brighit Says it's 'unavailable'-i'll have a look on Boombox
7,4,4,Watchin charm school @lalavazquez @riskybizness23 looooove you guys
8,4,4,I am also being a lazy bum this morning and currently sat with Pjs on and cuppa watching said rain while twittering ahhhhh bliss......
9,0,0,Watching HGTV on DVD before I go to work. I don't wanna go anywhere


# Reference

[UD187 Intro to TensorFlow for Deep Learning - Lesson 9: NLP Tokenization and Embedding](https://classroom.udacity.com/courses/ud187)