In [1]:
import sentiment

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets

import random
import os
import pandas as pd

import spacy
nlp = spacy.load('en')



First, we prepare the paths to the serialized, trained models.

In [2]:
modeldir = os.path.join('output', 'models')
IMDB_googlenews_netstatename = 'IMDB-googlenewsembeddings-fasttext-epoch3-fullstate.pth'
IMDB_glove_netstatename = 'IMDB-fasttext-epoch4-fullstate.pth'

trained_IMDB_googlenews_fasttext = os.path.join(modeldir, 'full_state', IMDB_googlenews_netstatename)
trained_IMDB_wiki_fasttext = os.path.join(modeldir, 'full_state', IMDB_glove_netstatename)

In this demo, we'll be using IMDB movie reviews to predict sentiment. The other data set we need is one that describes the relationships between words, also called word vectors or word embeddings.

Sentiment data is easy enough to understand, it's any body of text and its associated label. Let's talk a bit about word embeddings.
* * *

Word embeddings essentially are a big table, with each row corresponding to a word in your vocabulary. So, the larger the number of embeddings you have, the bigger your vocabulary. If a word is not available in your vocabulary, when factoring it into the categorization we give it a null value like an <unk\> token, to indicate that we don't know its relationship to other words.

The columns of the word embeddings are the dimensions of the words. The words are 'embdedded' into latent features in the form of real numbers, so that their relationship to one another can be described in dimensional space. There are multiple ways to do this, such as by frequency of co-usage in a corpus of text. GloVe, which stands for Global Vectors for Word Representation, is one such way, created by the Stanford University NLP group. word2vec is another way, created by Google. In this demo, we'll try using each of them.
* * *

It's important to understand the dimensionality of the word embeddings we're using, as the model needs to be initialize to accept the correct dimensions. Also, you should use the same word embeddings for training and for categorization, or else the prediction won't make sense, since the words in the word embeddings would be different in type and sequence.

In this demo, we'll use the word embeddings from GloVe that are available as a predefined option from torchtext. We'll use the version with 6 billion words and 100 dimensions, but only take the most frequently used 25,000 words (the GloVe word embeddings are sorted in order of frequency of use). This set of embeddings was generated from a corpus of WIkipeduia 2014 and Gigaword 5 text. For more information, see https://nlp.stanford.edu/projects/glove/

When demonstrating how to use your own word embeddings, we'll load the Google News word embeddings that have a vocabulary of 3 million words and 300 dimensions. When using non-predefined word embeddings, it's difficult to adjust the dimensionality after loading them, so you should ensure that they're in the correct shape before you load them.

In [3]:
# Loading IMDB dataset

"""
We can more accurately represent the relationship between words by looking at them as phrases. 
For example, the bigram "new york" carries a lot more meaning than the two words "new" and "york".
When applying the GloVe model, we'll look at words as bigrams. 
I'm not entirely sure yet how to do this for word embeddings provided via gensim, so we won't do that for now.
"""
def generate_bigrams(x):
    n_grams = set(zip(*[x[i:] for i in range(2)]))
    for n_gram in n_grams:
        x.append(' '.join(n_gram))
    return x

TEXT = data.Field(tokenize='spacy', preprocessing=generate_bigrams)
LABEL = data.LabelField(tensor_type=torch.FloatTensor)

# This model was trained on a training set obtained with this seed, 
# so we'll reuse it to ensure that we have a distinct test set
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# We'll split the dataset, including the labels, for model evaluation later on
train, test = datasets.IMDB.splits(TEXT, LABEL)

train, valid = train.split(random_state=random.seed(SEED))

TEXT.build_vocab(train, max_size=25000, vectors="glove.6B.100d")
LABEL.build_vocab(train)

# We use iterators for consuming the data during each phase. Let's initialize them now together with the data set
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test), 
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.text), 
    repeat=False)

Now, we have to decide which of the word embeddings we want to use. Only run one of the following two cells.

In [4]:
"""
If we want to use the GloVe word embeddings, we can get them from the TEXT object we just built.
"""
weights = TEXT.vocab.vectors

In [14]:
"""
If we want to use the word2vec word embeddings, we have to load them using gensim.
"""

import gensim
googlenews_kv = gensim.models.KeyedVectors.load_word2vec_format(os.path.join('input', 'word2vec', 'GoogleNews-vectors-negative300.bin'), binary=True)
weights = torch.FloatTensor(googlenews_kv.syn0)

  import sys


Gensim keyed vectors allow you to do many other interesting things, such as find similar words of various kinds, 
including the famous queen = woman + king - man example. For more information, see https://radimrehurek.com/gensim/models/keyedvectors.html
***
Now, we're finally ready to initialize the model. We just have to provide the serialized, trained model, and the corresponding set of word embeddings. Run only one of the following two cells.

In [5]:
model = sentiment.load_fasttext(trained_IMDB_wiki_fasttext, weights)

FastText(
  (embedding): Embedding(25002, 100)
  (fc): Linear(in_features=100, out_features=1, bias=True)
)


In [15]:
model = sentiment.load_fasttext(trained_IMDB_googlenews_fasttext, weights)

FastText(
  (embedding): Embedding(3000000, 300)
  (fc): Linear(in_features=300, out_features=1, bias=True)
)


load_fasttext() initializes a FastText model from a full state file, which contains information about the epoch that training stopped at(how many cycles of training we did), the state of the optimizer at that epoch(so that we can resume training for more epoch if we want), and the state dictionary of the model, which is essentially the parameters the model uses to categorize sentiment. In this scenario, we're only using the last of those three. 

load_fasttext() also copies the word embeddings we've loaded into the model.
***

At this point, we're almost ready to use the model! We just need to define a couple helper functions to predict sentiment using the model we've loaded.

torchtext .vocab objects have a stoi attribute that allows us to retrieve the index of a word in the vocabulary, i.e. convert the string to index. 

However, gensim's equivalent doesn't automatically assign a 0 when the index cannot be found, so we just have to make a simple function to do that.

In [6]:
def gensim_stoi(word):
    try:
        return googlenews_kv.vocab[word].index
    except KeyError:
        return 0

In [7]:
def predict_sentiment (sentence, keyedvector=False):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = []
    if keyedvector:
        indexed = [gensim_stoi(t) for t in tokenized]
    else:
        indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    print(indexed)
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    prediction = F.sigmoid(model(tensor))
    return prediction.item()

predict_sentiment() simply takes a sentence for which we need the sentiment category, converts it from a list of words to a list of word indices, and passes that list to the model. Now we're ready to use it to interprete sentences!

In [16]:
predict_sentiment('Twitter announces ban on cryptocurrency ads')

[0, 0, 0, 28, 0, 23180]




6.239748131520173e-07

But how accurate is our model actually? We can evaluate this by applying it to our training and test data sets.

In [17]:
criterion = nn.BCEWithLogitsLoss()

train_loss, train_acc = sentiment.evaluate(model, train_iterator, criterion)
test_loss, test_acc = sentiment.evaluate(model, test_iterator, criterion)

print(f'Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%')
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Train Loss: 0.350, Train Acc: 89.30%
Test Loss: 0.399, Test Acc: 86.52%


Well, let's apply the model onto our own data and see how good the results are. IMDB data is movie review sentiment, and when used to predict other kinds of sentiment, such as news sentiment in our case, results are quite different because the type of language used and its relationship to the inherent sentiment is different.

In [10]:
# Load test set
_99bitcoin_filepath = os.path.join('input', '99bitcoins', '99bitcoins_main.csv')
_99bitcoin_df = pd.read_csv(_99bitcoin_filepath)

In [18]:
# Predict, and round predictions
_99bitcoin_df['title_pred'] = _99bitcoin_df['event_title'].apply(lambda x: int(round(predict_sentiment(x))))
_99bitcoin_df['maintext_pred'] = _99bitcoin_df['event_maintext'].apply(lambda x: int(round(predict_sentiment(x))))
_99bitcoin_df.head()

[5539, 0, 9214, 0, 0, 47, 0, 0, 10, 20984, 0]




[2873, 8468, 0, 15296, 3, 0, 52, 99, 0]
[0, 0, 0, 3965, 0, 102, 0, 5925, 20984]
[0, 0, 0, 2873, 8468, 15296]
[0, 0, 0, 8, 1792, 6, 0, 0, 14279]
[2657, 0, 0, 0, 1885, 1523, 0, 24517, 0, 0]
[0, 0, 0, 28, 0, 23180]
[0, 0, 0, 0]
[2128, 0, 0, 944, 0, 0, 303, 0, 21, 17993]
[0, 0, 42, 23180, 0, 0]
[1673, 0, 0, 0, 0, 157, 144, 0]
[1996, 2452, 7, 2, 1801, 0, 17411, 52, 99, 0]
[0, 12396, 2873, 21259, 0, 47, 0, 12909, 5, 0, 0, 0, 5075, 19278, 4]
[0, 944, 677, 0, 0, 1488, 0, 9, 6162, 5866, 10748]
[2873, 21259, 15146, 8, 7569, 274, 0, 0]
[0, 5925, 1016, 42, 72, 518, 54, 4713, 1523, 0]
[0, 0, 0, 32, 0]
[0, 5925, 4200, 1523, 0, 23, 2, 117, 72]
[0, 0]
[0, 0, 8, 0, 0, 0]
[0, 0, 95, 0, 17931, 414, 709]
[0, 5925, 4200, 1523, 0, 23, 2, 117, 72]
[7586, 1576, 0, 10311, 608, 0, 5, 0, 0]
[8467, 0, 3, 706, 7, 0, 4241, 4418, 0, 20, 0]
[7586, 0, 13457, 47, 19046, 449, 199, 0]
[0, 14, 0, 14, 102, 0, 27, 0, 30, 5, 0, 0, 27, 0, 30]
[4473, 0, 0, 20, 0, 0]
[0, 0, 580, 0, 0, 0]
[0, 0, 0, 0]
[0, 5925, 4200, 1523, 21214

[4473, 0, 0, 20, 6, 15332, 13526, 7, 0, 4, 25, 1314, 15, 0, 4634, 0, 6, 3047, 3, 2232, 4338, 7, 15275, 3, 13, 1518, 0, 0, 791, 2696, 17, 449, 0, 202, 158, 17, 174, 17, 0, 5791, 3, 241, 121, 0, 0, 20, 6, 360, 7, 0, 0, 0, 4]
[25, 5539, 0, 5, 0, 0, 28, 0, 0, 23, 2, 580, 72, 10, 6, 0, 8002, 6, 0, 8, 1281, 8, 5602, 6, 117, 17, 7, 17, 127, 17, 360, 4756, 14867, 0, 3, 2, 10360, 0, 4, 25, 0, 23208, 10, 6, 0, 127, 4831, 0, 0, 0, 0, 15, 0, 0, 15296, 2, 2273, 8, 2050, 5, 9391, 2, 0, 0, 16802, 3, 43, 15296, 17, 0, 4756, 27, 0, 30, 13, 65, 9391, 46, 6, 4599, 5, 2794, 2, 10360, 0, 15, 5925, 4]
[2, 2128, 2762, 0, 2, 0, 7, 16450, 5, 9420, 0, 0, 2, 3307, 44, 520, 10356, 8, 36, 2121, 17, 0, 7, 0, 0, 8, 24156, 43, 15296, 17, 0, 0, 27, 0, 30, 8, 120, 12, 7747, 23, 0, 8, 1589, 0, 4]
[714, 0, 23, 116, 7, 2, 580, 492, 7, 0, 0, 0, 2, 1523, 21214, 3833, 23, 2, 117, 72, 10, 749, 217, 4, 0, 3930, 0, 1738, 10, 43, 0, 7, 331, 16687, 13, 3521, 105, 7041, 2, 5925, 85, 4748, 4]
[192, 6, 3448, 774, 7, 1262, 6244, 0, 0

[3547, 0, 3, 2, 0, 7, 2, 2483, 0, 15296, 0, 0, 3, 19, 9568, 10, 4473, 28, 21377, 7, 0, 5, 0, 10, 10977, 8, 0, 7, 2, 15296, 4, 0, 0, 3612, 0, 7, 0, 0, 9391, 18668, 5, 2, 1709, 545, 7, 0, 0, 3, 7, 76, 307, 34, 3298, 8, 2, 15296, 15, 0, 4, 0, 0, 0, 9, 283, 8, 34, 2378, 768, 0, 0, 7, 0, 0, 4]
[0, 0, 13674, 11470, 12546, 20703, 27, 0, 30, 5, 0, 19205, 27, 9674, 23804, 30, 0, 5862, 8, 8931, 0, 23, 86, 1709, 7829, 531, 86, 13672, 10510, 7, 2, 0, 17954, 0, 4, 12546, 19, 16870, 21, 14, 0, 0, 3, 20, 106, 20, 20602, 0, 3, 0, 7, 2762, 13547, 3, 449, 0, 3, 5, 4155, 7, 1019, 14, 10, 35, 316, 10, 0, 166, 1523, 0, 465, 7, 0, 47, 2, 0, 17954, 102, 35, 288, 1709, 19278, 4, 0, 19205, 9, 16870, 21, 449, 0, 5, 0, 7, 3127, 23, 8931, 1523, 0, 465, 7, 0, 10, 6, 1331, 2787, 4, 12546, 11985, 8, 1933, 1523, 0, 10, 0, 5, 19, 516, 0, 8, 2563, 217, 10, 2334, 10, 0, 0, 4, 0, 19205, 9, 23224, 0, 10, 0, 0, 4]
[0, 7, 834, 1413, 15518, 0, 7, 0, 0, 3, 0, 0, 3, 1138, 6, 431, 7, 0, 5791, 1695, 8, 0, 0, 5, 10360, 0, 0, 13, 

[714, 3334, 43, 16468, 0, 8, 0, 28, 0, 2563, 3, 15898, 13, 6, 0, 82, 21434, 86, 288, 1566, 17, 0, 0, 199, 2, 545, 7, 0, 0, 7150, 3, 0, 0, 15296, 0, 0, 15, 11595, 5, 0, 24712, 200, 11497, 320, 10123, 1641, 4, 3439, 0, 5, 0, 0, 4398, 6, 0, 6010, 0, 2, 0, 3, 0, 3, 5, 22049, 0, 0, 40, 2, 23304, 7, 2, 4473, 17, 783, 15296, 3, 157, 43, 20262, 0, 20412, 20864, 2195, 13, 159, 0, 0, 83, 768, 40, 2, 2384, 4]
[0, 0, 3, 0, 3, 5, 0, 17, 15005, 42, 5338, 6, 0, 7, 0, 1236, 8, 5866, 0, 7150, 13, 83, 1498, 10333, 41, 0, 0, 0, 10, 2, 0, 53, 0, 4, 0, 0, 0, 0, 117, 3, 28, 0, 2563, 3, 19863, 0, 8, 6, 6129, 6210, 10, 0, 5925, 141, 2, 0, 3015, 19, 0, 28, 0, 4875, 3, 0, 8, 6, 8419, 7, 1302, 41, 0, 0, 3, 12, 307, 33, 36, 1417, 13, 2, 0, 3015, 19, 10261, 4209, 23, 2, 1302, 45, 5338, 28, 0, 0, 4, 13907, 803, 14164]
[0, 127, 117, 0, 28, 0, 15, 0, 13091, 3, 2, 2526, 15, 0, 7, 7586, 0, 0, 0, 15, 1078, 0, 33, 8, 36, 6, 0, 4, 25, 19984, 1206, 0, 128, 10225, 20409, 8, 9391, 3, 0, 3, 50, 2322, 3160, 0, 5468, 8, 0, 4, 6

[0, 0, 25, 0, 0, 2798, 321, 4287, 22449, 5941, 0, 0, 3, 0, 19195, 23069, 15, 668, 28, 0, 9, 20, 13776, 20, 12, 9, 1958, 4, 750, 148, 107, 1607, 12, 3, 2, 5649, 17764, 13, 0, 52, 6, 11003, 2240, 13144, 38, 13, 15, 2017, 1834, 17, 4425, 595, 4, 1051, 6, 11099, 8, 0, 0, 10, 2, 6574, 3, 2, 23899, 844, 43, 11205, 0, 10, 5925, 13, 10617, 42, 1721, 20503, 3, 12282, 159, 1523, 0, 7767, 0, 54, 38, 2591, 157, 0, 4]
[1364, 15834, 0, 3, 0, 3, 0, 0, 2, 117, 15296, 8, 9391, 0, 5, 1223, 0, 0, 27, 0, 30, 4, 971, 908, 516, 3, 28, 15834, 0, 3, 0, 10575, 4727, 6, 6680, 23, 660, 17, 8, 17, 660, 15296, 10, 22447, 0, 27, 0, 30, 5, 5539, 0, 4, 1364, 10455, 1158, 3, 0, 1457, 0, 0, 10, 0, 27, 0, 30, 5, 109, 0, 4, 22527, 3, 45, 0, 0, 0, 5, 0, 23, 7486, 7, 7359, 7, 331, 16687, 5, 2, 5602, 9, 0, 0, 4]
[971, 142, 217, 220, 3, 0, 17985, 0, 21, 2, 5539, 0, 28, 2, 0, 0, 15296, 4, 25, 2232, 423, 3, 60, 1958, 3797, 0, 1419, 889, 28, 2, 17414, 0, 3, 10478, 193, 6, 0, 7, 1019, 10, 2, 4397, 0, 13, 2, 10123, 0, 11595, 9, 0

Unnamed: 0,event_id,event_title,event_date,bitcoin_value,bitcoin_value_10_days_later,event_maintext,title_label,maintext_label,title_pred,maintext_pred
0,91,U.S. regulator demands trading data from Bitco...,11/6/2018,7158.95,6709.39,The U.S. Commodity Futures Trading Commission ...,0,0,1,1
1,90,"South Korean crypto exchange, CoinRail has bee...",10/6/2018,7638.44,6747.77,South Korean crypto exchange Coinrail loses ov...,0,0,1,0
2,89,U.S.Justice Department launches criminal probe...,24/5/2018,7818.21,7608.5,The Justice Department has opened a criminal p...,0,0,0,0
3,88,Prosecutors raid largest South Korean exchange,11/5/2018,9289.09,8371.9,"Prosecutors raided UpBit, the largest cryptocu...",0,0,1,1
4,87,Goldman Sachs announces to open a Bitcoin trad...,2/5/2018,9021.75,8728.95,"One of the largest investment bank announced, ...",1,1,0,1


In [19]:
from sklearn import metrics

# Calculate precision and recall
title_cm = metrics.confusion_matrix(_99bitcoin_df['title_label'], _99bitcoin_df['title_pred'])
print('Title Confusion Matrix')
print(pd.DataFrame(title_cm))
title_report = metrics.precision_recall_fscore_support(_99bitcoin_df['title_label'], _99bitcoin_df['title_pred'], average='binary')
print ("\n title precision = %0.2f, title recall = %0.2f, title F1 = %0.2f, title accuracy = %0.2f\n" % 
           (title_report[0], title_report[1], title_report[2], 
            metrics.accuracy_score(_99bitcoin_df['title_label'], _99bitcoin_df['title_pred'])))

maintext_cm = metrics.confusion_matrix(_99bitcoin_df['maintext_label'], _99bitcoin_df['maintext_pred'])
print('Maintext Confusion Matrix')
print(pd.DataFrame(maintext_cm))
title_report = metrics.precision_recall_fscore_support(_99bitcoin_df['maintext_label'], _99bitcoin_df['maintext_pred'], average='binary')
print ("\n maintext precision = %0.2f, maintext recall = %0.2f, maintext F1 = %0.2f, maintext accuracy = %0.2f\n" % 
           (title_report[0], title_report[1], title_report[2], 
            metrics.accuracy_score(_99bitcoin_df['maintext_label'], _99bitcoin_df['maintext_pred'])))

Title Confusion Matrix
    0   1
0  25  20
1  34  12

 title precision = 0.38, title recall = 0.26, title F1 = 0.31, title accuracy = 0.41

Maintext Confusion Matrix
    0   1
0  16  35
1   8  32

 maintext precision = 0.48, maintext recall = 0.80, maintext F1 = 0.60, maintext accuracy = 0.53

