### Reference:
https://ithelp.ithome.com.tw/articles/10223922
<br>
https://github.com/johngilbert2000/sentiment140_with_fastai/blob/master/TF_NLP_sentiment140.ipynb

In [1]:
import numpy as np
import pandas as pd
import nltk

import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.keras.layers import *

In [2]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])

### Loading Training Data from csv file

In [3]:
df = pd.read_csv("dataset/training.1600000.processed.noemoticon.csv", usecols=[0,5], header=None, names=['label','text'],encoding='latin-1')

In [4]:
df.tail(5)

Unnamed: 0,label,text
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,happy #charitytuesday @theNSPCC @SparksCharity...


In [5]:
df.label.value_counts()

0    800000
4    800000
Name: label, dtype: int64

In [6]:
# PREPARE DATA

# Randomize data
df = df.iloc[np.random.permutation(len(df))]

# For Binary Classification: Convert Labels to 0 and 1 
df.loc[df['label'] == 4, 'label'] = 1

df.head()

Unnamed: 0,label,text
587224,0,@peterfacinelli ok so i'm seeing some good num...
1216596,1,I love this song at the mo'! And Serge just pl...
1131339,1,".@PsycheDiver come have some beers, we'll be c..."
844469,1,@kar3nx28 I'm sure it is Miss you... Fortunat...
1262352,1,made own recipe for chicken-pork adobo. wonder...


In [7]:
# CREATE SUBSETS
splits = 5
# Equal length subsets of original dataframe
cut_indices = [int(i*(1/splits)*len(df)) for i in range(0,splits+1)] # indices where df is to be cut
segment_indices = zip(cut_indices[:-1], cut_indices[1:]) # indices for each cut segment
valids = [df[begin:end] for begin,end in segment_indices] # subsets each to be used as validation sets

# Training sets for each validation set in valids
trains = [pd.concat(valids[1:], axis=0)]
for n in range(1,splits):
    trains += [pd.concat(valids[:n]+valids[n+1:], axis=0)] # (all sets except for set n in range(splits))

# Validation and Training Sets to be used
validation = valids[0]
training = trains[0]

In [8]:
# a mean close to 0.5 for labels indicates a well balanced dataset
validation['label'].describe()['mean']

0.5002

In [9]:
# VECTORIZE DATASET

vocab_size = 5000 # 10000

text = training['text'].to_numpy()

tok = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token='<unk>')

tok.fit_on_texts(text)

tok.word_index['<pad>'] = 0
tok.index_word[0] = '<pad>'

# pad vectors to maxlength
train_text = training['text'].to_numpy()
train_seqs = tok.texts_to_sequences(train_text)
maxlength = max(len(i) for i in train_seqs)
train_seqs = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, maxlen=maxlength, padding='post')

train_labels = training['label'].to_numpy().flatten()

# pad vectors to maxlength (don't calculate automatically, or valid set will be of different size)
valid_text = validation['text'].to_numpy()
valid_seqs = tok.texts_to_sequences(valid_text)
valid_seqs = tf.keras.preprocessing.sequence.pad_sequences(valid_seqs, maxlen=maxlength, padding='post')

valid_labels = validation['label'].to_numpy().flatten()

# Use lowest possible types to speed up training
train_seqs = train_seqs.astype('uint16')
valid_seqs = valid_seqs.astype('uint16')
train_labels = train_labels.astype('bool')
valid_labels = valid_labels.astype('bool') # uint8 or bool?

# Convert to TF dataset format
train_ds = tf.data.Dataset.from_tensor_slices((train_seqs,train_labels))
valid_ds = tf.data.Dataset.from_tensor_slices((valid_seqs,valid_labels))

print(maxlength)

52


In [10]:
import typing
import numbers
import os
import unittest
import random
import copy

In [11]:
# USEFUL FUNCTIONS

def word2vec(words):
    if isinstance(words, str):
        "Takes a string of words and returns a list of corresponding integers"
        seq = tok.texts_to_sequences([words])
        return np.array(seq).flatten().tolist()
    elif isinstance(words, typing.Iterable):
        "Takes a list of strings and returns a list of sequences (lists of corresponding integers)"
        return tok.texts_to_sequences(words)
    else:
        raise ValueError(f'Words were of type {type(words)} but should be either a string or list of strings')
        

def vec2word(vec:typing.Iterable[typing.Any]):
    if isinstance(vec[0], numbers.Number):
        "Takes a list of ints and returns a corresponding string"
        return " ".join([list(tok.word_index.keys())[i-1] for i in vec])
    elif isinstance(vec[0], typing.Iterable):
        "Takes an array of sequences (i.e., a 2d array) and returns an array of strings"
        return [vec2word(i) for i in vec]
    else:
        raise ValueError( f'Input list should contain either ints or lists of ints, not {type(vec[0])}')

def vec2word_no_pad(vec:typing.Iterable[typing.Any]):
    "Removes padding and converts vectors of ints to strings"
    if isinstance(vec[0], numbers.Number):
        "Takes a list of ints and returns a corresponding string"
        return " ".join([list(tok.word_index.keys())[i-1] for i in vec if i != 0])
    elif isinstance(vec[0], typing.Iterable):
        "Takes an array of sequences (i.e., a 2d array) and returns an array of strings"
        return [vec2word(i) for i in vec if i != 0]
    else:
        raise ValueError( f'Input list should contain either ints or lists of ints, not {type(vec[0])}')

def show_batch(ds):
    "Takes a tensorflow dataset and returns a batch as a dataframe, with labels shown without padding"
    batch_vecs, batch_targets = next(iter(ds.batch(1)))  # iterate through dataset batches
    batch_vecs, batch_targets = np.array(batch_vecs)[0], np.array(batch_targets)[0]  # convert tf batch to np array & reduce dimension by 1
    return pd.DataFrame(zip(batch_vecs, [vec2word_no_pad(arr) for arr in batch_vecs], batch_targets), columns=['word_vec','text','target'])

In [12]:
[32*i for i in range(1,10)] # Choose a multiple of 32 for embedding dimension

[32, 64, 96, 128, 160, 192, 224, 256, 288]

In [13]:
# TRAINING PARAMETERS

# Calculates the max_length, which can be used to store the attention weights
maxlength = max(len(i) for i in train_seqs)
total_vocab_size = len(tok.word_index) # no need to add +1, word_index includes <pad>
batch_size = 512 # 256 # 128 # 64
buffer_size = 1000 # 500 # 1000
embedding_dim = 64 # 32 # 64 # 128 # 256
num_steps = len(train_text) // batch_size
epochs = num_steps // buffer_size
val_steps = len(valid_seqs) // batch_size // epochs
learning_rate = 0.001 * 8

print(maxlength, vocab_size, total_vocab_size, num_steps, epochs, val_steps)

52 5000 595023 2500 2 312


In [14]:
# SHUFFLE AND BATCH

train_batch = train_ds.shuffle(buffer_size).batch(batch_size)
valid_batch = valid_ds.shuffle(buffer_size).batch(batch_size)
train_prefetch = train_batch.prefetch(buffer_size=tf.data.AUTOTUNE) # prefetch speeds up training
valid_prefetch = valid_batch.prefetch(buffer_size=tf.data.AUTOTUNE)

valid_batch

<BatchDataset shapes: ((None, 52), (None,)), types: (tf.uint16, tf.bool)>

In [15]:
# show_batch(valid_prefetch) # Note: determining <unk> words is a bottleneck

In [None]:
# Test Model
model = tf.keras.Sequential([
    Embedding(vocab_size, embedding_dim, input_length=maxlength),
    Conv1D(filters=16, kernel_size=3, padding='valid'),
    MaxPool1D(),
#   Dense(32,activation='relu'),
    Bidirectional(GRU(embedding_dim//2, return_sequences=True)), # embedding_dim//2
    Bidirectional(GRU(embedding_dim//2, return_sequences=True)),
    Bidirectional(GRU(embedding_dim//2, return_sequences=False)),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

In [None]:
model.compile(optimizer='adam',
              loss='binary_crossentropy', # categorical_crossentropy for multilabel classification
              metrics=['accuracy'])

In [None]:
history = model.fit(
    train_prefetch,
    epochs=epochs,
    validation_data=valid_prefetch, 
    validation_steps=val_steps,
    steps_per_epoch=buffer_size,
    callbacks = [] # [tf.keras.callbacks.ReduceLROnPlateau()] # cp_callback not used
    )

In [21]:
# Test Model
model_2 = tf.keras.Sequential([
    Embedding(vocab_size, embedding_dim, input_length=maxlength),
    Bidirectional(LSTM(64,  return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [22]:
model_2.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [25]:
history_2 = model_2.fit(
    train_prefetch,
    epochs=epochs,
    validation_data=valid_prefetch,
    validation_steps=val_steps,
    steps_per_epoch=buffer_size
    )

Epoch 1/2
Epoch 2/2


### Preparing Test Data

In [27]:
testdata_df = pd.read_csv("dataset/testdata.manual.2009.06.14.csv", usecols=[0,5], header=None, names=['label','text'],encoding='latin-1')

In [39]:
testdata_df.head(100)

Unnamed: 0,label,text
0,4,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,Reading my kindle2... Love it... Lee childs i...
2,4,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,@kenburbary You'll love your Kindle2. I've had...
4,4,@mikefish Fair enough. But i have the Kindle2...
...,...,...
95,4,omgg i ohhdee want mcdonalds damn i wonder if ...
96,0,History exam studying ugh
97,0,"I hate revision, it's so boring! I am totally ..."
98,0,"Higher physics exam tommorow, not lookin forwa..."


In [61]:
test_text = testdata_df['text'].to_numpy()
test_seqs = tok.texts_to_sequences(test_text)
test_seqs = tf.keras.preprocessing.sequence.pad_sequences(test_seqs, maxlen=maxlength, padding='post')

test_labels = testdata_df['label'].to_numpy().flatten()

In [77]:
test_text[488]

'@johncmayer is Bobby Flay joining you?'

In [76]:
vec2word_no_pad(test_seqs[488])

'johncmayer is bobby <unk> joining you'

In [65]:
rs = model_2.predict(test_seqs)

In [75]:
for x in range(498):
    print("IDX: {}, ANS:{}, PREDICT:{}".format(x, test_labels[x], rs[x]))

IDX: 0, ANS:4, PREDICT:[0.54160094]
IDX: 1, ANS:4, PREDICT:[0.978784]
IDX: 2, ANS:4, PREDICT:[0.9341502]
IDX: 3, ANS:4, PREDICT:[0.2791358]
IDX: 4, ANS:4, PREDICT:[0.56116503]
IDX: 5, ANS:4, PREDICT:[0.7531528]
IDX: 6, ANS:0, PREDICT:[0.10922182]
IDX: 7, ANS:4, PREDICT:[0.9677968]
IDX: 8, ANS:4, PREDICT:[0.9761795]
IDX: 9, ANS:4, PREDICT:[0.6300822]
IDX: 10, ANS:2, PREDICT:[0.9794765]
IDX: 11, ANS:0, PREDICT:[0.29010424]
IDX: 12, ANS:4, PREDICT:[0.6272885]
IDX: 13, ANS:4, PREDICT:[0.8595648]
IDX: 14, ANS:0, PREDICT:[0.3777945]
IDX: 15, ANS:4, PREDICT:[0.34579957]
IDX: 16, ANS:0, PREDICT:[0.12502903]
IDX: 17, ANS:4, PREDICT:[0.8277398]
IDX: 18, ANS:0, PREDICT:[0.30725393]
IDX: 19, ANS:4, PREDICT:[0.3734501]
IDX: 20, ANS:4, PREDICT:[0.78710324]
IDX: 21, ANS:4, PREDICT:[0.8173079]
IDX: 22, ANS:4, PREDICT:[0.1493432]
IDX: 23, ANS:4, PREDICT:[0.6178741]
IDX: 24, ANS:4, PREDICT:[0.5313508]
IDX: 25, ANS:4, PREDICT:[0.9774164]
IDX: 26, ANS:4, PREDICT:[0.5673028]
IDX: 27, ANS:4, PREDICT:[0.6777

In [83]:
# PREDICTION FUNCTIONS

# maxlength = 118

# def sentiment(num):
#     "Converts a float into the corresponding sentiment label"
#     if num < 0.40: return 'negative'
#     if num > 0.60: return 'positive'
#     return 'neutral'

def sentiment(num):
    "Converts a float into the corresponding sentiment label"
    if num < 0.5: return 0
    if num >= 0.5: return 4
    return 2


def give_sentiment(sent):
    "Prints given sentences with their predicted sentiments"
    if isinstance(sent, str):
        spaces = len(str)
        s = word2vec([sent])
        s = tf.keras.preprocessing.sequence.pad_sequences(s, maxlen=maxlength, padding='post').astype('uint16')
        val = model_2.predict(s)[0]
        res = sentiment(val)
        print("\n")
        print(f"{sent}", " "*(5+spaces-len(sent)), "|", " "*10, f" {res} ({val[0]:.2f})")
        print("\n")
        
    elif isinstance(sent, typing.Iterable):
        spaces = max([len(i) for i in sent])
        s = word2vec(sent)
        s = tf.keras.preprocessing.sequence.pad_sequences(s, maxlen=maxlength, padding='post').astype('uint16')
        vals = [i for i in model_2.predict(s)]
        res = [sentiment(i[0]) for i in vals]
        
        print("\n")
        for (sentence, result, val) in zip(sent, res, vals):

            print(f"{sentence}", " "*(spaces-len(sentence)), "|", " "*4, f" {result}  ({val[0]:.2f})")
        print("\n")
        
    else:
        raise TypeError
    
    return

In [46]:
phrases = ["Ask Programming LaTeX or InDesign submitted by calcio1"]

give_sentiment(phrases)



Ask Programming LaTeX or InDesign submitted by calcio1  |       positive  (0.83)




In [84]:
pred_list = []
for x in rs:
    pred_list.append(sentiment(x))

In [85]:
count = 0
total = 0
for x in range(len(pred_list)):
    if test_labels[x] == 2:
        continue
    else:
        total = total +1
    if pred_list[x] == test_labels[x]:
        count = count + 1

count / total

0.807799442896936