This is the accompanying IPython notebook for the medium article **A simple deep neural network that beats TextBlob and VADER packages for sentiment classifications, written in Python.**

There are explanations for every code block. More detailed explanations can be found in the original Medium article.

In [None]:
import os 
import random as rnd
import re
import string

# import relevant libraries
!pip install -q -U trax
import trax

# import trax.fastmath.numpy
import trax.fastmath.numpy as np # the same a Jax
from trax import fastmath

# import trax.layers
from trax import layers as tl

[K     |████████████████████████████████| 637 kB 6.9 MB/s 
[K     |████████████████████████████████| 4.4 MB 55.3 MB/s 
[K     |████████████████████████████████| 458.3 MB 13 kB/s 
[K     |████████████████████████████████| 5.6 MB 54.1 MB/s 
[K     |████████████████████████████████| 1.3 MB 49.7 MB/s 
[K     |████████████████████████████████| 462 kB 66.3 MB/s 
[?25h  Building wheel for clang (setup.py) ... [?25l[?25hdone
  Building wheel for wrapt (setup.py) ... [?25l[?25hdone


# Construct a vocabulary dictionary: a simple model for converting words into numerical vectors. The nltk Twitter samples will be used to construct the vocabulary dictionary in this case.

### Download labelled nltk twitter samples

In [None]:
import nltk
# Download the labelled tweet corpus and the stopword lists used below.
nltk.download('twitter_samples')
nltk.download('stopwords')
from nltk.corpus import stopwords, twitter_samples 
from nltk.tokenize import TweetTokenizer
# English stopword list; process_tweet filters tokens against it.
stopwords_english = stopwords.words('english')
from nltk.stem import PorterStemmer
# Shared stemmer instance; process_tweet uses it to stem every kept token.
stemmer = PorterStemmer()

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Helper functions to clean twitter data

In [None]:
def load_tweets():
    '''
    Read the tweet texts from the nltk twitter_samples corpus.

    Output:
        a pair (positive tweets, negative tweets), each being whatever
        twitter_samples.strings returns for the corresponding json file
    '''
    return (
        twitter_samples.strings('positive_tweets.json'),
        twitter_samples.strings('negative_tweets.json'),
    )
    
def process_tweet(tweet):
    '''
    Clean a raw tweet and reduce it to a list of stemmed tokens.

    Input:
        tweet: a string containing a tweet
    Output:
        a list of stemmed tokens, with stopwords and punctuation removed
    '''
    # Regex clean-up passes, applied in this order:
    #   1. stock market tickers like $GE
    #   2. old style retweet text "RT" at the start of the tweet
    #   3. hyperlinks
    #   4. the hash sign only (the hashtag word itself is kept)
    patterns_to_strip = (
        r'\$\w*',
        r'^RT[\s]+',
        r'https?:\/\/.*[\r\n]*',
        r'#',
    )
    for pattern in patterns_to_strip:
        tweet = re.sub(pattern, '', tweet)

    # Tokenize with the tweet-aware tokenizer, configured via
    # preserve_case=False, strip_handles=True, reduce_len=True.
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

    # Keep a token only if it is neither a stopword nor punctuation, then stem it.
    # Note: `token in string.punctuation` is a substring test against the
    # punctuation string, not a per-character membership test.
    return [
        stemmer.stem(token)
        for token in tokenizer.tokenize(tweet)
        if not (token in stopwords_english or token in string.punctuation)
    ]


### Construct vocabulary dictionary using both the downloaded data and the helper function

In [None]:
# Load positive and negative tweets
all_positive_tweets, all_negative_tweets = load_tweets()

train_pos  = all_positive_tweets[:4000]  # first 4000 positive tweets form the positive training set
train_neg  = all_negative_tweets[:4000]  # first 4000 negative tweets form the negative training set

# Combine training data into one set (positives first, then negatives)
train_x = train_pos + train_neg 

# Build the vocabulary: a dict mapping each processed token to an integer ID.
# Only the training slices above are used; no test split is created in this cell.

# Reserve IDs 0-2 for the special tokens:
# padding, end of line and unknown-word
Vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2} 

# IDs are assigned in first-seen order: len(Vocab) is always the next free ID,
# so the numbering depends on the order of train_x and on process_tweet's output.
for tweet in train_x: 
    processed_tweet = process_tweet(tweet)
    for word in processed_tweet:
        if word not in Vocab: 
            Vocab[word] = len(Vocab)


# Model description

In [None]:
def classifier(vocab_size=10000, embedding_dim=256, output_dim=2, mode='train'):
    '''
    Build the sentiment classifier as a serial stack of four trax layers.

    Input:
        vocab_size - Size of the vocabulary (rows of the embedding table)
        embedding_dim - Embedding dimension
        output_dim - Number of units in the dense output layer, one per class
        mode - Accepted for interface compatibility; not used by this function
    Output:
        a tl.Serial model: Embedding -> Mean(axis=1) -> Dense -> LogSoftmax
    '''
    return tl.Serial(
        # Look up an embedding vector for every token ID.
        tl.Embedding(vocab_size=vocab_size, d_feature=embedding_dim),
        # Average over axis 1 to get a single "average" word embedding.
        tl.Mean(axis=1),
        # One unit for each output.
        tl.Dense(n_units=output_dim),
        # Log softmax over the outputs (no parameters needed).
        tl.LogSoftmax(),
    )

### I will provide the trained weights of this model. The details of the model are shown below. You are welcome to re-train this model to fit your needs.

In [None]:
# Instantiate the classifier with its default arguments and show its layer stack.
model = classifier()
display(model)

Serial[
  Embedding_10000_256
  Mean
  Dense_2
  LogSoftmax
]

### Helper function for both training and predicting

In [None]:
def tweet_to_tensor(tweet, vocab_dict, unk_token='__UNK__', verbose=False):
    '''
    Convert a tweet into the list of vocabulary IDs of its processed tokens.

    Input:
        tweet - A string containing a tweet
        vocab_dict - The words dictionary (token -> ID)
        unk_token - The special string for unknown tokens; must be a key of vocab_dict
        verbose - Print info during runtime
    Output:
        A python list with one vocabulary ID per processed token, in token order
    '''
    # Reduce the tweet to its important words (stop words removed).
    tokens = process_tweet(tweet)

    if verbose:
        print("List of words from the processed tweet:")
        print(tokens)

    # Looked up eagerly, so a vocabulary without unk_token raises KeyError
    # even when the tweet yields no tokens.
    fallback_id = vocab_dict[unk_token]

    if verbose:
        print(f"The unique integer ID for the unk_token is {fallback_id}")

    # Tokens missing from the vocabulary map to the unknown-token ID.
    return [vocab_dict.get(token, fallback_id) for token in tokens]

In [None]:
# Show the documentation of the checkpoint loader that is used to restore the pre-trained weights.
help(model.init_from_file)

Help on method init_from_file in module trax.layers.base:

init_from_file(file_name, weights_only=False, input_signature=None) method of trax.layers.combinators.Serial instance
    Initializes this layer and its sublayers from a pickled checkpoint.
    
    In the common case (`weights_only=False`), the file must be a gziped pickled
    dictionary containing items with keys `'flat_weights', `'flat_state'` and
    `'input_signature'`, which are used to initialize this layer.
    If `input_signature` is specified, it's used instead of the one in the file.
    If `weights_only` is `True`, the dictionary does not need to have the
    `'flat_state'` item and the state it not restored either.
    
    Args:
      file_name: Name/path of the pickled weights/state file.
      weights_only: If `True`, initialize only the layer's weights. Else
          initialize both weights and state.
      input_signature: Input signature to be used instead of the one from file.
    
    Returns:
      A `(w

The path below should be where you saved the pre-trained weights. I have pre-trained this DNN and you can find my saved weights here, https://github.com/dingkaihua/A-simple-deep-neural-network-that-beats-TextBlob-and-VADER-packages-for-sentiment-classifications/blob/main/checkpoints/model.pkl.gz

Simply download the model.pkl.gz and save it somewhere. Then, give model.pkl.gz's path to PATH variable below.

In [None]:
# Path to the downloaded pre-trained checkpoint (model.pkl.gz).
# Either set the MODEL_CHECKPOINT_PATH environment variable or edit the fallback below.
# (The author used '/content/drive/MyDrive/Colab_Notebooks/Medium/checkpoints/model.pkl.gz'.)
PATH = os.environ.get('MODEL_CHECKPOINT_PATH', 'model.pkl.gz')

# Fail early with an actionable message instead of a NameError or an opaque loader error.
if not os.path.isfile(PATH):
    raise FileNotFoundError(
        f"Pre-trained weights not found at '{PATH}'. "
        "Download model.pkl.gz and set PATH (or MODEL_CHECKPOINT_PATH) to its location."
    )

# Initialize `model` from the pickled checkpoint.
weights, state = model.init_from_file(PATH)

In [None]:
### Helper function to call model directly for prediction

In [None]:
def predict(sentence):
    '''
    Classify the sentiment of a sentence with the module-level `model` and `Vocab`.

    Input:
        sentence - A string to classify
    Output:
        (preds, sentiment) - preds is 1 when the model output at index 1 is greater
        than the output at index 0, otherwise 0; sentiment is 'positive' when
        preds is 1 and "negative" otherwise
    '''
    token_ids = tweet_to_tensor(sentence, vocab_dict=Vocab)

    # Batch size 1: add a leading batch dimension so the input works with the model.
    batch = np.array(token_ids)[None, :]

    # The model ends in a log softmax, so these are log-probabilities;
    # comparing them ranks the classes the same way probabilities would.
    log_probs = model(batch)

    preds = int(log_probs[0, 1] > log_probs[0, 0])
    sentiment = 'positive' if preds == 1 else "negative"
    return preds, sentiment

# Test: classification accuracy comparison among TextBlob, VADER and a regular deep neural network

In [None]:
from textblob import TextBlob
import nltk
nltk.download('vader_lexicon')
!pip3 install -U nltk[twitter] 
from nltk.sentiment.vader import SentimentIntensityAnalyzer # ask VADER to use Twitter lexicon for fairness of comparison

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
Collecting twython
  Downloading twython-3.9.1-py3-none-any.whl (33 kB)
Installing collected packages: twython
Successfully installed twython-3.9.1


In [None]:
# For convenience, this wrapper calls either TextBlob or VADER
def analize_sentiment(sentence, option='VADER'):
    '''
    Utility function to classify the polarity of a sentence
    using either VADER or TextBlob.

    Input:
        sentence - A string to classify
        option - 'VADER' (uses the compound score) or 'TextBlob' (uses the polarity score)
    Output:
        "positive" if the score is greater than 0, "neutral" if it equals 0,
        "negative" otherwise
    Raises:
        ValueError - if option is neither 'VADER' nor 'TextBlob'
    '''
    if option == 'VADER':
        # take the compound score
        score = SentimentIntensityAnalyzer().polarity_scores(sentence)['compound']
    elif option == "TextBlob":
        # Analyze the `sentence` argument itself, never a module-level variable.
        score = TextBlob(sentence).sentiment.polarity
    else:
        # Without this branch an unsupported option would surface as an unbound-variable error.
        raise ValueError(f"Unknown option {option!r}; expected 'VADER' or 'TextBlob'.")

    if score > 0:
        return "positive"
    elif score == 0:
        return "neutral"
    else:
        return "negative"

### Kai hand-engineered the following test set, consisting of negations, double negations and the use of idioms.

In [None]:
# Each entry is [sentence, expected sentiment label]; the labels are compared
# against the strings returned by the classifiers.
hand_engineered_tests= [["The movie is almost good.", "negative"], # hedged praise (implied negation)
                       ["I don't think the movie is good.", "negative"], # negation
                       ["This movie is ridiculously underrated. This movie can not be any better!", "positive"], # double negation
                       ["The movie is not bad.", "positive"], # double negation
                       ["I can't believe how bad this movie is.", "negative"], # expression that contains negation as exclamation
                       ["I can't believe how great this movie is.", "positive"], # expression that contains negation as exclamation
                       ["This movie is ridiculously great.", "positive"],
                       ["This movie is ridiculously horrible.", "negative"],
                       ["One of the worst film that I have seen in my life.", "negative"],
                       ["This is about as entertaining as watching paint dry.", "negative"],  # usage of idiom
                       ["Not a good choice.", "negative"],
                       ["Wasted 2 hours of my life on this moive that I can never get back.", "negative"]] # sarcasm

In [None]:
from tabulate import tabulate
# Print the hand-engineered test set as a table in the 'orgtbl' format.
print(tabulate(hand_engineered_tests, headers=['Hand engineered tweet', 'Sentiment'], tablefmt='orgtbl'))

| Hand engineered tweet                                                    | Sentiment   |
|--------------------------------------------------------------------------+-------------|
| The movie is almost good.                                                | negative    |
| I don't think the movie is good.                                         | negative    |
| This movie is ridiculously underrated. This movie can not be any better! | positive    |
| The movie is not bad.                                                    | positive    |
| I can't believe how bad this movie is.                                   | negative    |
| I can't believe how great this movie is.                                 | positive    |
| This movie is ridiculously great.                                        | positive    |
| This movie is ridiculously horrible.                                     | negative    |
| One of the worst film that I have seen in my life.                       | negative    |

In [None]:
# Count, for each classifier, how many hand-engineered sentences it labels correctly.
no_hand_engineered_tests = len(hand_engineered_tests)

no_correct_classification_TextBlob = 0
no_correct_classification_VADER = 0
no_correct_classification_deep_neural_nets = 0

for sentence, sentiment in hand_engineered_tests:
    # Each comparison yields a bool; int() turns a match into a +1 increment.

    # TextBlob
    no_correct_classification_TextBlob += int(analize_sentiment(sentence, option='TextBlob') == sentiment)

    # VADER
    no_correct_classification_VADER += int(analize_sentiment(sentence, option='VADER') == sentiment)

    # deep neural nets: predict returns (class id, label); only the label is compared
    no_correct_classification_deep_neural_nets += int(predict(sentence)[1] == sentiment)

print(f"TextBlob classified {no_correct_classification_TextBlob} / {no_hand_engineered_tests} correctly. \n")
print(f"VADER classified {no_correct_classification_VADER} / {no_hand_engineered_tests} correctly. \n")
print(f"The deep neural net classified {no_correct_classification_deep_neural_nets} / {no_hand_engineered_tests} correctly.\n")



TextBlob classified 8 / 12 correctly. 

VADER classified 6 / 12 correctly. 

The deep neural nets classified 10 / 12 correctly.



In [None]:
# Try a positive sentence that is phrased with a negation ("can not believe"),
# and print the label that each of the three classifiers assigns to it.
sentence = "I can not believe how fantastic this movie was."
pred, sentiment = predict(sentence)
print(f"The deep neural net classifies sentiment of the sentence: '{sentence}', to be {sentiment}.")
print(f"TextBlot classifies sentiment of the sentence: '{sentence}', to be {analize_sentiment(sentence, option='TextBlob')}.")
print(f"VADER classifies sentiment of the sentence: '{sentence}', to be {analize_sentiment(sentence, option='VADER')}.")

The deep neural net classifies sentiment of the sentence: 'I can not believe how fantastic this movie was.', to be positive.
TextBlot classifies sentiment of the sentence: 'I can not believe how fantastic this movie was.', to be negative.
VADER classifies sentiment of the sentence: 'I can not believe how fantastic this movie was.', to be negative.
