In [1]:
import random
import os
import pandas as pd

from sklearn import metrics

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets

In [2]:
SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [3]:
def generate_bigrams(x):
    n_grams = set(zip(*[x[i:] for i in range(2)]))
    for n_gram in n_grams:
        x.append(' '.join(n_gram))
    return x

In [4]:
generate_bigrams('This is the worst basmati ever'.split(" "))

['This',
 'is',
 'the',
 'worst',
 'basmati',
 'ever',
 'This is',
 'the worst',
 'is the',
 'worst basmati',
 'basmati ever']

In [5]:
TEXT = data.Field(tokenize='spacy', preprocessing=generate_bigrams)
LABEL = data.LabelField(tensor_type=torch.FloatTensor)

In [6]:
train, test = datasets.IMDB.splits(TEXT, LABEL)

train, valid = train.split(random_state=random.seed(SEED))

In [7]:
TEXT.build_vocab(train, max_size=25000, vectors="glove.6B.100d")
LABEL.build_vocab(train)

In [8]:
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test), 
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.text), 
    repeat=False)

In [9]:
class FastText(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, x):
        
        #x = [sent len, batch size]
        
        embedded = self.embedding(x)
                
        #embedded = [sent len, batch size, emb dim]
        
        embedded = embedded.permute(1, 0, 2)
        
        #embedded = [batch size, sent len, emb dim]
        
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1) 
        
        #pooled = [batch size, embedding_dim]
                
        return self.fc(pooled)

In [10]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
OUTPUT_DIM = 1

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)

In [11]:
pretrained_embeddings  = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.3739,  0.3016,  0.5479,  ..., -0.1662,  0.9332,  0.4808],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [12]:
optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = model.to(device)
criterion = criterion.to(device)
print(model)

FastText(
  (embedding): Embedding(25002, 100)
  (fc): Linear(in_features=100, out_features=1, bias=True)
)


In [13]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(F.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [14]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:

        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [15]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

In [None]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

In [18]:
def gensim_stoi(word):
    try:
        return googlenews_kv.vocab[word].index
    except KeyError:
        return 0

In [19]:
import spacy
nlp = spacy.load('en')

def predict_sentiment (sentence, keyedvector=False):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = []
    if keyedvector:
        indexed = [gensim_stoi(t) for t in tokenized]
    else:
        indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    print(indexed)
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    prediction = F.sigmoid(model(tensor))
    return prediction.item()

In [20]:
predict_sentiment('U.S. regulator demands trading data from Bitcoin exchanges in manipulation probe')

[5539, 0, 9214, 0, 0, 47, 0, 0, 10, 20984, 0]




0.9979501366615295

In [21]:
predict_sentiment('Twitter announces ban on cryptocurrency ads')

[0, 0, 0, 28, 0, 23180]




0.00010003013449022546

In [16]:
modeldir = os.path.join('output', 'models')

trainset = 'IMDB'
modeltype = 'fasttext'

In [None]:
netstatename = trainset + '-' + modeltype + '-epoch' + str(epoch) + '-fullstate.pth'

state = { 
    'epoch': epoch,
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict()
}

torch.save(state, os.path.join(modeldir, 'full_state', netstatename))

In [17]:
%%time
epoch = 4
netstatename = trainset + '-' + modeltype + '-epoch' + str(epoch) + '-fullstate.pth'

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)
model.load_state_dict(torch.load(os.path.join(modeldir, 'full_state', netstatename))['state_dict'])
model.eval()

Wall time: 44 ms


In [None]:
# Load test set
_99bitcoin_filepath = os.path.join('input', '99bitcoins', '99bitcoins_main.csv')
_99bitcoin_df = pd.read_csv(_99bitcoin_filepath)

In [None]:
# Predict, and round predictions
_99bitcoin_df['title_pred'] = _99bitcoin_df['event_title'].apply(lambda x: int(round(predict_sentiment(x))))
_99bitcoin_df['maintext_pred'] = _99bitcoin_df['event_maintext'].apply(lambda x: int(round(predict_sentiment(x))))
_99bitcoin_df.head()

In [None]:
# Calculate precision and recall
title_cm = metrics.confusion_matrix(_99bitcoin_df['title_label'], _99bitcoin_df['title_pred'])
print('Title Confusion Matrix')
print(pd.DataFrame(title_cm))
title_report = metrics.precision_recall_fscore_support(_99bitcoin_df['title_label'], _99bitcoin_df['title_pred'], average='binary')
print ("\n title precision = %0.2f, title recall = %0.2f, title F1 = %0.2f, title accuracy = %0.2f\n" % 
           (title_report[0], title_report[1], title_report[2], 
            metrics.accuracy_score(_99bitcoin_df['title_label'], _99bitcoin_df['title_pred'])))

maintext_cm = metrics.confusion_matrix(_99bitcoin_df['maintext_label'], _99bitcoin_df['maintext_pred'])
print('Maintext Confusion Matrix')
print(pd.DataFrame(maintext_cm))
title_report = metrics.precision_recall_fscore_support(_99bitcoin_df['maintext_label'], _99bitcoin_df['maintext_pred'], average='binary')
print ("\n maintext precision = %0.2f, maintext recall = %0.2f, maintext F1 = %0.2f, maintext accuracy = %0.2f\n" % 
           (title_report[0], title_report[1], title_report[2], 
            metrics.accuracy_score(_99bitcoin_df['maintext_label'], _99bitcoin_df['maintext_pred'])))

IMDB reviews aren't great at predicting the sentiment of news feeds. We should try to use the Google News word2vec embeddings, and some related labelled training set instead. I don't have a news-based sentiment-labelled data set now, so let's just try replacing the word embeddings and see if things improve.

In [None]:
%%time
import gensim
# keyed vectors
googlenews_kv = gensim.models.KeyedVectors.load_word2vec_format(os.path.join('input', 'word2vec', 'GoogleNews-vectors-negative300.bin'), binary=True)
weights = torch.FloatTensor(googlenews_kv.syn0)

In [None]:
googlenews_kv.most_similar('crypto')

In [None]:
googlenews_kv.vocab['crypto'].index

In [None]:
# Rebuild the model to accommodate the new dimensionality of the Google News word2vec dictionary
INPUT_DIM = len(weights)
EMBEDDING_DIM = 300
OUTPUT_DIM = 1

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)

In [None]:
# Copy the Google News word2vec weights into the model
# googlenews_pretrained_embeddings = nn.Embedding.from_pretrained(weights)
model.embedding.weight.data.copy_(weights)

In [None]:
optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = model.to(device)
criterion = criterion.to(device)
print(model)

In [None]:
# Train the model with new embeddings
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    state = { 
    'epoch': epoch,
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict()
    }
    netstatename = trainset + '-googlenewsembeddings-' + modeltype + '-epoch' + str(epoch) + '-fullstate.pth'
    torch.save(state, os.path.join(modeldir, 'full_state', netstatename))
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

In [None]:
%%time
# Load the serialized model
INPUT_DIM = 3000000
EMBEDDING_DIM = 300
OUTPUT_DIM = 1

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)

epoch = 3
googlenews_netstatename = trainset + '-googlenewsembeddings-' + modeltype + '-epoch' + str(epoch) + '-fullstate.pth'

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)
model.load_state_dict(torch.load(os.path.join(modeldir, 'full_state', googlenews_netstatename))['state_dict'])

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = model.to(device)
criterion = criterion.to(device)
print(model)

model.eval()

In [None]:
model.embedding.num_embeddings

In [None]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

In [None]:
predict_sentiment('U.S. regulator demands trading data from Bitcoin exchanges in manipulation probe', keyedvector=True)

In [None]:
predict_sentiment('Twitter announces ban on cryptocurrency ads', keyedvector=True)

In [None]:
_99bitcoin_df['title_pred_googlenewsembeddings'] = _99bitcoin_df['event_title'].apply(lambda x: int(round(predict_sentiment(x, keyedvector=True))))
_99bitcoin_df['maintext_pred_googlenewsembeddings'] = _99bitcoin_df['event_maintext'].apply(lambda x: int(round(predict_sentiment(x, keyedvector=True))))
_99bitcoin_df.head()

In [None]:
# Calculate precision and recall
title_cm = metrics.confusion_matrix(_99bitcoin_df['title_label'], _99bitcoin_df['title_pred_googlenewsembeddings'])
print('Title Confusion Matrix (Google News Word Embeddings)')
print(pd.DataFrame(title_cm))
title_report = metrics.precision_recall_fscore_support(_99bitcoin_df['title_label'], _99bitcoin_df['title_pred_googlenewsembeddings'], average='binary')
print ("\n title precision = %0.2f, title recall = %0.2f, title F1 = %0.2f, title accuracy = %0.2f\n" % 
           (title_report[0], title_report[1], title_report[2], 
            metrics.accuracy_score(_99bitcoin_df['title_label'], _99bitcoin_df['title_pred_googlenewsembeddings'])))

maintext_cm = metrics.confusion_matrix(_99bitcoin_df['maintext_label'], _99bitcoin_df['maintext_pred_googlenewsembeddings'])
print('Maintext Confusion Matrix (Google News Word Embeddings)')
print(pd.DataFrame(maintext_cm))
title_report = metrics.precision_recall_fscore_support(_99bitcoin_df['maintext_label'], _99bitcoin_df['maintext_pred_googlenewsembeddings'], average='binary')
print ("\n maintext precision = %0.2f, maintext recall = %0.2f, maintext F1 = %0.2f, maintext accuracy = %0.2f\n" % 
           (title_report[0], title_report[1], title_report[2], 
            metrics.accuracy_score(_99bitcoin_df['maintext_label'], _99bitcoin_df['maintext_pred_googlenewsembeddings'])))