### Reference:
https://ithelp.ithome.com.tw/articles/10223922
<br>
https://github.com/johngilbert2000/sentiment140_with_fastai/blob/master/TF_NLP_sentiment140.ipynb
<br>
https://blog.csdn.net/xc_zhou/article/details/88669669

In [1]:
import numpy as np
import pandas as pd

import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.callbacks import TensorBoard

In [2]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
  """Draw a training metric and its validation counterpart on the current figure.

  Args:
    history: object exposing a ``history`` mapping from metric name to a
      sequence of per-epoch values (as returned by ``model.fit``).
    metric: name of the training metric; the validation series is looked up
      under ``'val_' + metric``.
  """
  curves = history.history
  val_metric = 'val_' + metric
  # Training curve first, validation curve second, so the legend order matches.
  plt.plot(curves[metric])
  plt.plot(curves[val_metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, val_metric])

### Loading Training Data from csv file

In [3]:
# Load only the two columns that are used (file has no header row):
# column 0 becomes 'label' and column 5 becomes 'text'.
df = pd.read_csv(
    "dataset/training.1600000.processed.noemoticon.csv",
    encoding='latin-1',
    header=None,
    usecols=[0, 5],
    names=['label', 'text'],
)

In [4]:
# Peek at the last five rows to check that the label/text columns loaded as expected.
df.tail(5)

Unnamed: 0,label,text
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,happy #charitytuesday @theNSPCC @SparksCharity...


In [5]:
# Count the rows per raw label value to check the class balance.
df.label.value_counts()

0    800000
4    800000
Name: label, dtype: int64

In [6]:
# PREPARE DATA

# Shuffle the rows with a fixed seed. The train/validation subsets are later cut
# from this ordering, so a seeded shuffle keeps them reproducible across
# Restart & Run All.
SHUFFLE_SEED = 42
df = df.iloc[np.random.default_rng(SHUFFLE_SEED).permutation(len(df))]

# For binary classification one would instead convert the labels to 0 and 1:
# df.loc[df['label'] == 4, 'label'] = 1

# Classification: convert all labels to {negative = 0, neutral = 1, positive = 2}.
# NOTE: the value counts of this file show only the raw labels 0 and 4, so no
# training row ever receives the neutral label 1.
df.loc[df['label'] == 4, 'label'] = 2
df.head()

Unnamed: 0,label,text
1546285,2,@Super_fresh Your welcome..... I caught it!
938301,2,"@jophillips Origin of Symmetry is their best, ..."
1216763,2,@emmao414 Mornin! I was super-tired this morn ...
424033,0,my days are all mixed up i thought today was ...
590526,0,@chipcoffey Can you do me a favor and pray for...


In [7]:
# CREATE SUBSETS
splits = 5

# Boundaries of `splits` consecutive, (near-)equal-length slices of df.
# Integer arithmetic guarantees the last boundary is exactly len(df); the float
# form int(i*(1/splits)*len(df)) can round it down for some split counts
# (e.g. 49*(1/49) < 1.0), which silently drops the final row.
cut_indices = [i * len(df) // splits for i in range(splits + 1)]
segment_indices = list(zip(cut_indices[:-1], cut_indices[1:]))  # (begin, end) of each slice
valids = [df[begin:end] for begin, end in segment_indices]  # subsets each to be used as validation sets

# Training set n is every slice except slice n (for n = 0 this is valids[1:]).
trains = [pd.concat(valids[:n] + valids[n+1:], axis=0) for n in range(splits)]

# Validation and training sets actually used below: fold 0.
validation = valids[0]
training = trains[0]

In [8]:
# Labels are encoded as 0 (negative) and 2 (positive) at this point, so a mean
# close to 1.0 (not 0.5) indicates a well balanced validation set.
validation['label'].describe()['mean']

0.99930625

In [9]:
# Show the first rows of the training subset (all slices except the validation slice).
training.head()

Unnamed: 0,label,text
789494,0,@Krankitupmag LOL mentally yes but physically ...
70245,0,@jason_mraz so sad I got to Indie today at 3:0...
414313,0,Head hurts fro holding back the tears of seein...
1117470,2,getting ready 2 go out
1338472,2,@Brantanamo there's nothing wrong with being a...


In [10]:
# VECTORIZE DATASET

# Value passed as the tokenizer's num_words and as the Embedding input size.
vocab_size = 10000 #5000 # 10000

text = training['text'].to_numpy()

# Tokenizer fitted on the training text only; '<unk>' is the out-of-vocabulary token.
tok = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token='<unk>')

tok.fit_on_texts(text)

# Register index 0 (the value pad_sequences fills with by default) as '<pad>'
# in both lookup tables so padded positions can be decoded.
tok.word_index['<pad>'] = 0
tok.index_word[0] = '<pad>'

# Convert the training text to integer sequences and pad them (at the end,
# padding='post') to the length of the longest training sequence.
train_text = training['text'].to_numpy()
train_seqs = tok.texts_to_sequences(train_text)
maxlength = max(len(i) for i in train_seqs)
train_seqs = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, maxlen=maxlength, padding='post')

print(maxlength)

52


In [11]:
# One-hot encode the training labels into 3 columns (classes 0, 1, 2).
# train_labels = training['label'].to_numpy().flatten()
train_labels = training['label'].to_numpy()
train_labels = to_categorical(train_labels, num_classes=3, dtype='uint8')

# Pad the validation sequences to the training maxlength (don't recompute it
# from the validation set, or the validation set will be of a different width).
valid_text = validation['text'].to_numpy()
valid_seqs = tok.texts_to_sequences(valid_text)
valid_seqs = tf.keras.preprocessing.sequence.pad_sequences(valid_seqs, maxlen=maxlength, padding='post')

# One-hot encode the validation labels the same way as the training labels.
# valid_labels = validation['label'].to_numpy().flatten()
valid_labels = validation['label'].to_numpy()
valid_labels = to_categorical(valid_labels, num_classes=3, dtype='uint8')

# Use the smallest sufficient integer type; token ids are below vocab_size
# (10000), which fits in uint16.
train_seqs = train_seqs.astype('uint16')
valid_seqs = valid_seqs.astype('uint16')
# train_labels = train_labels.astype('bool')
# valid_labels = valid_labels.astype('bool') # uint8 or bool?

# Convert to TF dataset format: one (sequence, one-hot label) pair per element.
train_ds = tf.data.Dataset.from_tensor_slices((train_seqs,train_labels))
valid_ds = tf.data.Dataset.from_tensor_slices((valid_seqs,valid_labels))

In [12]:
import typing
import numbers
import os
import unittest
import random
import copy

In [13]:
# USEFUL FUNCTIONS

def word2vec(words):
    """Encode text as integer token ids using the fitted tokenizer `tok`.

    Args:
        words: a single string, or an iterable of strings.

    Returns:
        For a string: a flat list of ints.
        For an iterable of strings: a list of sequences (one list of ints per string).

    Raises:
        ValueError: if `words` is neither a string nor an iterable.
    """
    if isinstance(words, str):
        # The tokenizer works on batches, so wrap the string and flatten the result.
        encoded = tok.texts_to_sequences([words])
        return np.ravel(encoded).tolist()
    if not isinstance(words, typing.Iterable):
        raise ValueError(f'Words were of type {type(words)} but should be either a string or list of strings')
    return tok.texts_to_sequences(words)
        

def vec2word(vec:typing.Iterable[typing.Any]):
    """Decode integer token ids back into text using the fitted tokenizer `tok`.

    Ids are looked up directly in `tok.index_word` (one dict lookup per token)
    instead of rebuilding `list(tok.word_index.keys())` for every token, which
    made decoding a bottleneck. Index 0 must be registered in `tok.index_word`
    (this notebook maps it to '<pad>'), so padding is decoded as '<pad>'.

    Args:
        vec: an indexable sequence of ints, or a sequence of such sequences
            (e.g. a 2d array).

    Returns:
        A space-joined string for a flat sequence, a list of such strings for
        a nested one, and "" for an empty input.

    Raises:
        ValueError: if the first element is neither a number nor an iterable.
    """
    if len(vec) == 0:
        # Nothing to decode; vec[0] below would raise IndexError.
        return ""
    first = vec[0]
    if isinstance(first, numbers.Number):
        # int() normalises numpy integer scalars to plain dict keys.
        return " ".join(tok.index_word[int(i)] for i in vec)
    if isinstance(first, typing.Iterable):
        return [vec2word(row) for row in vec]
    raise ValueError( f'Input list should contain either ints or lists of ints, not {type(vec[0])}')

def vec2word_no_pad(vec:typing.Iterable[typing.Any]):
    """Decode integer token ids into text, skipping padding (id 0).

    For nested input each row is decoded with this same function, so padding
    is removed from every row. (The previous 2d branch compared whole rows
    with 0, which is ambiguous for array rows, and delegated to the padded
    decoder, so padding was never stripped.)

    Args:
        vec: an indexable sequence of ints, or a sequence of such sequences
            (e.g. a 2d array).

    Returns:
        A space-joined string for a flat sequence, a list of such strings for
        a nested one, and "" for an empty input.

    Raises:
        ValueError: if the first element is neither a number nor an iterable.
    """
    if len(vec) == 0:
        # Nothing to decode; vec[0] below would raise IndexError.
        return ""
    first = vec[0]
    if isinstance(first, numbers.Number):
        # One dict lookup per non-padding token; int() normalises numpy scalars.
        return " ".join(tok.index_word[int(i)] for i in vec if i != 0)
    if isinstance(first, typing.Iterable):
        return [vec2word_no_pad(row) for row in vec]
    raise ValueError( f'Input list should contain either ints or lists of ints, not {type(vec[0])}')

def show_batch(ds):
    """Return the first element of `ds.batch(1)` as a dataframe.

    The dataframe has one row per sequence with the columns 'word_vec' (the
    integer sequence), 'text' (the sequence decoded without padding) and
    'target'.
    """
    first_vecs, first_targets = next(iter(ds.batch(1)))
    # batch(1) adds a leading axis of size 1: convert to numpy and drop it.
    vec_rows = np.array(first_vecs)[0]
    target_rows = np.array(first_targets)[0]
    decoded = [vec2word_no_pad(row) for row in vec_rows]
    records = zip(vec_rows, decoded, target_rows)
    return pd.DataFrame(records, columns=['word_vec','text','target'])

In [14]:
# Candidate embedding dimensions: the first nine multiples of 32.
[32*i for i in range(1,10)] # Choose a multiple of 32 for embedding dimension

[32, 64, 96, 128, 160, 192, 224, 256, 288]

In [15]:
# TRAINING PARAMETERS

# train_seqs is already padded here, so every row has the same length and this
# re-derives the padded sequence length.
maxlength = max(len(i) for i in train_seqs)
# Number of entries in the tokenizer's word_index (all words seen while fitting, plus <pad>).
# Only printed below; the Embedding layers are sized with vocab_size instead.
total_vocab_size = len(tok.word_index) # no need to add +1, word_index includes <pad>
batch_size = 512 # 256 # 128 # 64
# Used both as the shuffle buffer size and as steps_per_epoch in the fit calls.
buffer_size = 1000 # 500 # 1000
embedding_dim = 64 # 32 # 64 # 128 # 256
# Number of full batches in the training set.
num_steps = len(train_text) // batch_size
# Number of epochs of buffer_size steps each that fit into num_steps.
epochs = num_steps // buffer_size
# Validation batches per epoch, so the validation set is spread over the epochs.
val_steps = len(valid_seqs) // batch_size // epochs
# NOTE(review): not referenced by the compile calls visible in this notebook
# (they pass optimizer='adam') — confirm whether it is still needed.
learning_rate = 0.001 * 8

print(maxlength, vocab_size, total_vocab_size, num_steps, epochs, val_steps)

52 10000 594964 2500 2 312


In [16]:
# SHUFFLE AND BATCH

# Shuffle with a buffer of buffer_size elements, then group into batches of batch_size.
train_batch = train_ds.shuffle(buffer_size).batch(batch_size)
valid_batch = valid_ds.shuffle(buffer_size).batch(batch_size)
# Let tf.data choose the prefetch depth (AUTOTUNE).
train_prefetch = train_batch.prefetch(buffer_size=tf.data.AUTOTUNE) # prefetch speeds up training
valid_prefetch = valid_batch.prefetch(buffer_size=tf.data.AUTOTUNE)

# Display the dataset spec (element shapes and dtypes).
valid_batch

<BatchDataset shapes: ((None, 52), (None, 3)), types: (tf.uint16, tf.uint8)>

In [None]:
# show_batch(valid_prefetch) # Note: determining <unk> words is a bottleneck

In [None]:
# Test Model: embedding -> 1D convolution + max pooling -> three stacked
# bidirectional GRUs -> dropout -> 3-way softmax.
# The datasets carry one-hot labels with 3 columns (valid_batch reports label
# shape (None, 3)), so the output layer must emit 3 class probabilities; a
# single sigmoid unit does not match that label shape.
model = tf.keras.Sequential([
    Embedding(vocab_size, embedding_dim, input_length=maxlength),
    Conv1D(filters=16, kernel_size=3, padding='valid'),
    MaxPool1D(),
#   Dense(32,activation='relu'),
    Bidirectional(GRU(embedding_dim//2, return_sequences=True)), # embedding_dim//2
    Bidirectional(GRU(embedding_dim//2, return_sequences=True)),
    # Last recurrent layer returns only its final state for the classifier head.
    Bidirectional(GRU(embedding_dim//2, return_sequences=False)),
    Dropout(0.2),
    Dense(3, activation='softmax')
])

In [None]:
# Configure training: Adam optimizer, accuracy metric.
model.compile(optimizer='adam',
              loss='binary_crossentropy', # categorical_crossentropy is the usual loss for multi-class (one-hot) targets
              metrics=['accuracy'])

In [None]:
# Train `model`; the returned History object is kept for plotting.
history = model.fit(
    train_prefetch,
    epochs=epochs,
    validation_data=valid_prefetch,  # validated on val_steps batches per epoch
    validation_steps=val_steps,
    steps_per_epoch=buffer_size, # buffer_size doubles as the number of batches per epoch
    callbacks = [] # [tf.keras.callbacks.ReduceLROnPlateau()] # cp_callback not used
    )

In [53]:
class Linear(Layer):
    """Affine layer computing ``inputs @ w + b``.

    Args:
        units: number of output features (columns of ``w``, length of ``b``).
        input_dim: number of input features (rows of ``w``).
    """

    def __init__(self, units=32, input_dim=32):
        super().__init__()
        kernel_init = tf.random_normal_initializer()
        bias_init = tf.zeros_initializer()
        # Trainable weight matrix of shape (input_dim, units), random-normal initialised.
        self.w = tf.Variable(
            initial_value=kernel_init(shape=(input_dim, units), dtype="float32"),
            trainable=True,
        )
        # Trainable bias vector of shape (units,), initialised to zeros.
        self.b = tf.Variable(
            initial_value=bias_init(shape=(units,), dtype="float32"),
            trainable=True,
        )

    def call(self, inputs):
        """Apply the affine map to `inputs`."""
        projected = tf.matmul(inputs, self.w)
        return projected + self.b

In [46]:
# Test Model: embedding -> two stacked bidirectional LSTMs -> dense ReLU ->
# dropout -> 3-way softmax (one output per one-hot label column).
# NOTE(review): the recorded execution counts suggest this cell was last run
# after the compile/fit cells below — re-run the cells in order before trusting
# any result derived from model_2.
model_2 = tf.keras.Sequential([
    Embedding(vocab_size, embedding_dim, input_length=maxlength),
    Bidirectional(LSTM(64,  return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')
])

In [42]:
# Earlier binary-classification configuration, kept for reference:
# model_2.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
#               optimizer=tf.keras.optimizers.Adam(1e-4),
#               metrics=['accuracy'])

# Configure training for the 3-class softmax output: Adam optimizer, accuracy metric.
model_2.compile(optimizer='adam',
              loss='categorical_crossentropy', # categorical_crossentropy for multi-class classification with one-hot targets
              metrics=['accuracy'])

In [44]:
# Train model_2; the returned History object is kept for plotting.
# buffer_size doubles as the number of batches per epoch (steps_per_epoch).
history_2 = model_2.fit(
    train_prefetch,
    epochs=epochs,
    validation_data=valid_prefetch,
    validation_steps=val_steps,
    steps_per_epoch=buffer_size
    )

Epoch 1/2
Epoch 2/2


In [48]:
# Keep a handle on model_2's trainable variables and display them.
tv = model_2.trainable_variables
tv

[<tf.Variable 'embedding_7/embeddings:0' shape=(10000, 64) dtype=float32, numpy=
 array([[-0.02485519,  0.04324025, -0.00568551, ..., -0.00574601,
         -0.04451542, -0.0130531 ],
        [ 0.02046304,  0.00856596,  0.00592808, ..., -0.02653407,
          0.02381715, -0.02713521],
        [ 0.0463605 , -0.0442538 , -0.02002962, ...,  0.00202338,
          0.02146444,  0.03020327],
        ...,
        [ 0.01855123,  0.02055471,  0.04204352, ...,  0.02218182,
         -0.00423261, -0.04469874],
        [ 0.00067336, -0.03011259, -0.02247182, ..., -0.03586928,
         -0.0445512 ,  0.02848769],
        [ 0.00945091,  0.02293407,  0.03152895, ..., -0.02468808,
         -0.02930619,  0.04416488]], dtype=float32)>,
 <tf.Variable 'bidirectional_14/forward_lstm_14/lstm_cell_43/kernel:0' shape=(64, 256) dtype=float32, numpy=
 array([[ 0.10738212,  0.09106629, -0.12615612, ...,  0.01291603,
         -0.03119817,  0.08497344],
        [ 0.11975107,  0.0686501 , -0.06333651, ..., -0.03790931,

In [51]:
# Print the type of every layer in model_2.
# NOTE(review): originally described as a check of manual weight modifications,
# but no such modification is made in the visible cells.
for layer in model_2.layers:
    print(type(layer))
#     print(layer.numpy())

<class 'tensorflow.python.keras.layers.embeddings.Embedding'>
<class 'tensorflow.python.keras.layers.wrappers.Bidirectional'>
<class 'tensorflow.python.keras.layers.wrappers.Bidirectional'>
<class 'tensorflow.python.keras.layers.core.Dense'>
<class 'tensorflow.python.keras.layers.core.Dropout'>
<class 'tensorflow.python.keras.layers.core.Dense'>


In [None]:
# Evaluate model_2 on the batched validation set.
model_2.evaluate(valid_prefetch)

### Preparing Test Data

In [28]:
# Load the test file the same way as the training file (no header row):
# column 0 becomes 'label' and column 5 becomes 'text'.
testdata_df = pd.read_csv(
    "dataset/testdata.manual.2009.06.14.csv",
    encoding='latin-1',
    header=None,
    usecols=[0, 5],
    names=['label', 'text'],
)

In [29]:
# Classification: convert all labels to {negative = 0, neutral = 1, positive = 2}.
# Order matters: 2 -> 1 must run before 4 -> 2, otherwise positives would be
# remapped twice. For the same reason this cell must not be run twice on the
# same dataframe (a second run would turn the positives, now 2, into 1).
testdata_df.loc[testdata_df['label'] == 2, 'label'] = 1
testdata_df.loc[testdata_df['label'] == 4, 'label'] = 2

In [30]:
# Inspect the last 100 test rows after relabelling.
testdata_df.tail(100)

Unnamed: 0,label,text
398,0,@sportsguy33 Time Warner = epic fail
399,1,Lawson to head Newedge Hong Kong http://bit.ly...
400,1,Weird Piano Guitar House in China! http://u2s....
401,1,Send us your GM/Chevy photos http://tinyurl.co...
402,0,I know. How sad is that? RT @caseymercier: 1s...
...,...,...
493,1,Ask Programming: LaTeX or InDesign?: submitted...
494,0,"On that note, I hate Word. I hate Pages. I hat..."
495,2,Ahhh... back in a *real* text editing environm...
496,0,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."


In [31]:
# Encode the test text with the tokenizer fitted on the training text and pad
# to the training maxlength so the shapes match the model input.
test_text = testdata_df['text'].to_numpy()
test_seqs = tok.texts_to_sequences(test_text)
test_seqs = tf.keras.preprocessing.sequence.pad_sequences(test_seqs, maxlen=maxlength, padding='post')

# One-hot encode the test labels into 3 columns (classes 0, 1, 2).
test_labels = testdata_df['label'].to_numpy()
test_labels = to_categorical(test_labels, num_classes=3, dtype='uint8')

In [32]:
# Raw text of one sample test tweet.
test_text[488]

'@johncmayer is Bobby Flay joining you?'

In [33]:
# The same tweet after encoding and decoding (padding removed); words the
# tokenizer cannot encode come back as '<unk>'.
vec2word_no_pad(test_seqs[488])

'johncmayer is bobby <unk> joining you'

In [34]:
# Predicted class index per test tweet: arg-max over model_2's output values.
y_pred = np.argmax(model_2.predict(test_seqs), axis=-1)

In [35]:
# Display the predicted class indices.
y_pred

array([2, 2, 2, 0, 2, 2, 0, 2, 2, 0, 2, 0, 2, 2, 0, 2, 0, 2, 0, 2, 2, 2,
       0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0,
       0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2,
       2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2,
       0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0,
       0, 0, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 0, 2, 2,
       2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0,
       2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0,
       2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0, 0, 0, 2, 2, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 2,

In [None]:
# Print the true class next to the predicted class for every test tweet.
# Uses y_pred (the arg-max predictions computed above) because `rs` is not
# defined in any cell of this notebook, and iterates over len(y_pred) instead
# of a hard-coded row count.
true_classes = np.argmax(test_labels, axis=-1)  # test_labels is one-hot
for x in range(len(y_pred)):
    print("IDX: {}, ANS:{}, PREDICT:{}".format(x, true_classes[x], y_pred[x]))

In [None]:
# PREDICTION FUNCTIONS

# maxlength = 118

# def sentiment(num):
#     "Converts a float into the corresponding sentiment label"
#     if num < 0.40: return 'negative'
#     if num > 0.60: return 'positive'
#     return 'neutral'

def sentiment(num):
    """Map a score to a sentiment code.

    Returns 0 for scores below 0.40, 4 for scores above 0.60 and 2 for
    everything in between (both thresholds included).
    """
    return 0 if num < 0.40 else (4 if num > 0.60 else 2)


def give_sentiment(sent):
    """Print the given sentence(s) next to the sentiment predicted by `model_2`.

    For every sentence a label and a score are printed:
      * if the model emits a single value per sentence (one sigmoid unit), the
        score is that value and the label is `sentiment(score)`;
      * otherwise (one probability per class) the label is the arg-max class
        index and the score is that class's probability.

    Args:
        sent: a single string, or an iterable of strings. An empty iterable
            prints nothing.

    Returns:
        None.

    Raises:
        TypeError: if `sent` is neither a string nor an iterable.
    """
    def label_and_score(prediction):
        # Turn one model output row into (label, score).
        probs = np.ravel(prediction)
        if probs.size == 1:
            score = float(probs[0])
            return sentiment(score), score
        best = int(np.argmax(probs))
        return best, float(probs[best])

    def predict(sentences):
        # Encode, pad to the training length and score a list of sentences.
        seqs = word2vec(sentences)
        seqs = tf.keras.preprocessing.sequence.pad_sequences(seqs, maxlen=maxlength, padding='post').astype('uint16')
        return [label_and_score(row) for row in model_2.predict(seqs)]

    if isinstance(sent, str):
        # The original computed len(str) (the type, not the sentence), which
        # raises TypeError; the intended padding is a constant five spaces.
        (res, score), = predict([sent])
        print("\n")
        print(f"{sent}", " "*5, "|", " "*10, f" {res} ({score:.2f})")
        print("\n")

    elif isinstance(sent, typing.Iterable):
        # Materialise once so generators survive the two passes below.
        sentences = list(sent)
        if not sentences:
            return
        spaces = max(len(sentence) for sentence in sentences)
        results = predict(sentences)

        print("\n")
        for sentence, (result, score) in zip(sentences, results):
            # Pad every sentence to the longest one so the columns line up.
            print(f"{sentence}", " "*(spaces-len(sentence)), "|", " "*4, f" {result}  ({score:.2f})")
        print("\n")

    else:
        raise TypeError(f"sent should be a string or an iterable of strings, not {type(sent)}")

    return

In [None]:
# Example: score a single phrase (passed as a one-element list).
phrases = ["Ask Programming LaTeX or InDesign submitted by calcio1"]

give_sentiment(phrases)

In [None]:
# Convert every score in `rs` into a sentiment code via sentiment().
# NOTE(review): `rs` is not defined in any visible cell of this notebook —
# presumably per-tweet scores left over from an earlier session; confirm
# before running this on a fresh kernel.
pred_list = []
for x in rs:
    pred_list.append(sentiment(x))

In [None]:
# Test accuracy of the class predictions in y_pred.
# test_labels is one-hot, so recover the class indices before comparing; the
# previous element-wise comparison of a sentiment() code (0/2/4) with a
# one-hot row is ambiguous and uses a different label encoding.
true_classes = np.argmax(test_labels, axis=-1)
count = int(np.sum(y_pred == true_classes))  # number of correctly classified tweets

count / len(y_pred)