In [1]:
import random
import os
import pandas as pd

from sklearn import metrics

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets

In [2]:
SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [3]:
def generate_bigrams(x):
    n_grams = set(zip(*[x[i:] for i in range(2)]))
    for n_gram in n_grams:
        x.append(' '.join(n_gram))
    return x

In [4]:
generate_bigrams('This is the worst basmati ever'.split(" "))

['This',
 'is',
 'the',
 'worst',
 'basmati',
 'ever',
 'worst basmati',
 'This is',
 'is the',
 'the worst',
 'basmati ever']

In [None]:
TEXT = data.Field(tokenize='spacy', preprocessing=generate_bigrams)
LABEL = data.LabelField(tensor_type=torch.FloatTensor)

In [None]:
train, test = datasets.IMDB.splits(TEXT, LABEL)

train, valid = train.split(random_state=random.seed(SEED))

In [None]:
TEXT.build_vocab(train, max_size=25000, vectors="glove.6B.100d")
LABEL.build_vocab(train)

In [None]:
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test), 
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.text), 
    repeat=False)

In [5]:
class FastText(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, x):
        
        #x = [sent len, batch size]
        
        embedded = self.embedding(x)
                
        #embedded = [sent len, batch size, emb dim]
        
        embedded = embedded.permute(1, 0, 2)
        
        #embedded = [batch size, sent len, emb dim]
        
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1) 
        
        #pooled = [batch size, embedding_dim]
                
        return self.fc(pooled)

In [None]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
OUTPUT_DIM = 1

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)

In [None]:
pretrained_embeddings  = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

In [None]:
optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = model.to(device)
criterion = criterion.to(device)
print(model)

In [6]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(F.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [7]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:

        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [8]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

In [None]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

In [20]:
def gensim_stoi(word):
    try:
        return googlenews_kv.vocab[word].index
    except KeyError:
        return 0

In [21]:
import spacy
nlp = spacy.load('en')

def predict_sentiment (sentence, keyedvector=False):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = []
    if keyedvector:
        indexed = [gensim_stoi(t) for t in tokenized]
    else:
        indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    print(indexed)
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    prediction = F.sigmoid(model(tensor))
    return prediction.item()

In [None]:
predict_sentiment('U.S. regulator demands trading data from Bitcoin exchanges in manipulation probe')

In [None]:
predict_sentiment('Twitter announces ban on cryptocurrency ads')

In [10]:
modeldir = os.path.join('output', 'models')

trainset = 'IMDB'
modeltype = 'fasttext'

In [None]:
netstatename = trainset + '-' + modeltype + '-epoch' + str(epoch) + '-fullstate.pth'

state = { 
    'epoch': epoch,
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict()
}

torch.save(state, os.path.join(modeldir, 'full_state', netstatename))

In [None]:
%%time
epoch = 4
netstatename = trainset + '-' + modeltype + '-epoch' + str(epoch) + '-fullstate.pth'

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)
model.load_state_dict(torch.load(os.path.join(modeldir, 'full_state', netstatename))['state_dict'])
model.eval()

In [11]:
# Load test set
_99bitcoin_filepath = os.path.join('input', '99bitcoins', '99bitcoins_main.csv')
_99bitcoin_df = pd.read_csv(_99bitcoin_filepath)

In [None]:
# Predict, and round predictions
_99bitcoin_df['title_pred'] = _99bitcoin_df['event_title'].apply(lambda x: int(round(predict_sentiment(x))))
_99bitcoin_df['maintext_pred'] = _99bitcoin_df['event_maintext'].apply(lambda x: int(round(predict_sentiment(x))))
_99bitcoin_df.head()

In [None]:
# Calculate precision and recall
title_cm = metrics.confusion_matrix(_99bitcoin_df['title_label'], _99bitcoin_df['title_pred'])
print('Title Confusion Matrix')
print(pd.DataFrame(title_cm))
title_report = metrics.precision_recall_fscore_support(_99bitcoin_df['title_label'], _99bitcoin_df['title_pred'], average='binary')
print ("\n title precision = %0.2f, title recall = %0.2f, title F1 = %0.2f, title accuracy = %0.2f\n" % 
           (title_report[0], title_report[1], title_report[2], 
            metrics.accuracy_score(_99bitcoin_df['title_label'], _99bitcoin_df['title_pred'])))

maintext_cm = metrics.confusion_matrix(_99bitcoin_df['maintext_label'], _99bitcoin_df['maintext_pred'])
print('Maintext Confusion Matrix')
print(pd.DataFrame(maintext_cm))
title_report = metrics.precision_recall_fscore_support(_99bitcoin_df['maintext_label'], _99bitcoin_df['maintext_pred'], average='binary')
print ("\n maintext precision = %0.2f, maintext recall = %0.2f, maintext F1 = %0.2f, maintext accuracy = %0.2f\n" % 
           (title_report[0], title_report[1], title_report[2], 
            metrics.accuracy_score(_99bitcoin_df['maintext_label'], _99bitcoin_df['maintext_pred'])))

IMDB reviews aren't great at predicting the sentiment of news feeds. We should try to use the Google News word2vec embeddings, and some related labelled training set instead. I don't have a news-based sentiment-labelled data set now, so let's just try replacing the word embeddings and see if things improve.

In [12]:
%%time
import gensim
# keyed vectors
googlenews_kv = gensim.models.KeyedVectors.load_word2vec_format(os.path.join('input', 'word2vec', 'GoogleNews-vectors-negative300.bin'), binary=True)
weights = torch.FloatTensor(googlenews_kv.syn0)

  after removing the cwd from sys.path.


Wall time: 38.6 s


In [13]:
googlenews_kv.most_similar('crypto')

[('cryptographic', 0.5337051749229431),
 ('Crypto', 0.5129595994949341),
 ('encryption', 0.5039838552474976),
 ('encryption_algorithms', 0.49573421478271484),
 ('cryptography', 0.49152448773384094),
 ('AES_encryption', 0.48674529790878296),
 ('symmetric_encryption', 0.48474210500717163),
 ('cryptographic_algorithm', 0.4823398292064667),
 ('AES_CCMP', 0.4767687916755676),
 ('encryption_algorithm', 0.47422072291374207)]

In [14]:
googlenews_kv.vocab['crypto'].index

122230

In [None]:
# Rebuild the model to accommodate the new dimensionality of the Google News word2vec dictionary
INPUT_DIM = len(weights)
EMBEDDING_DIM = 300
OUTPUT_DIM = 1

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)

In [None]:
# Copy the Google News word2vec weights into the model
# googlenews_pretrained_embeddings = nn.Embedding.from_pretrained(weights)
model.embedding.weight.data.copy_(weights)

In [None]:
optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = model.to(device)
criterion = criterion.to(device)
print(model)

In [None]:
# Train the model with new embeddings
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    state = { 
    'epoch': epoch,
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict()
    }
    netstatename = trainset + '-googlenewsembeddings-' + modeltype + '-epoch' + str(epoch) + '-fullstate.pth'
    torch.save(state, os.path.join(modeldir, 'full_state', netstatename))
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

In [17]:
%%time
# Load the serialized model
INPUT_DIM = 3000000
EMBEDDING_DIM = 300
OUTPUT_DIM = 1

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)

epoch = 3
googlenews_netstatename = trainset + '-googlenewsembeddings-' + modeltype + '-epoch' + str(epoch) + '-fullstate.pth'

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)
model.load_state_dict(torch.load(os.path.join(modeldir, 'full_state', googlenews_netstatename))['state_dict'])

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = model.to(device)
criterion = criterion.to(device)
print(model)

model.eval()

FastText(
  (embedding): Embedding(3000000, 300)
  (fc): Linear(in_features=300, out_features=1, bias=True)
)
Wall time: 38.3 s


In [None]:
from torchtext.vocab import Vectors
model_name = 'GoogleNews-vectors-negative300.bin'
word2vec_path = os.path.join('input', 'word2vec')
vectors = Vectors(name=model_name, cache=word2vec_path) # model_name + path = path_to_embeddings_file

In [None]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

In [22]:
predict_sentiment('U.S. regulator demands trading data from Bitcoin exchanges in manipulation probe', keyedvector=True)

[79, 8104, 2839, 832, 440, 17, 1021827, 6286, 1, 14292, 4030]




0.958014726638794

In [23]:
predict_sentiment('Twitter announces ban on cryptocurrency ads', keyedvector=True)

[3320, 5513, 1839, 5, 0, 3465]




4.441506462171674e-06

In [25]:
_99bitcoin_df['title_pred_googlenewsembeddings'] = _99bitcoin_df['event_title'].apply(lambda x: int(round(predict_sentiment(x, keyedvector=True))))
_99bitcoin_df['maintext_pred_googlenewsembeddings'] = _99bitcoin_df['event_maintext'].apply(lambda x: int(round(predict_sentiment(x, keyedvector=True))))
_99bitcoin_df.head()

[79, 8104, 2839, 832, 440, 17, 1021827, 6286, 1, 14292, 4030]




[564, 4100, 122230, 1367, 0, 0, 24, 42, 18227]
[0, 423, 6695, 1739, 4030, 69, 1021827, 422, 14292]
[4529, 5015, 578, 564, 4100, 1367]
[7820, 34654, 5513, 0, 341, 0, 1021827, 832, 1342]
[1106, 5507, 0, 11863, 149, 34, 0, 7619, 1021827, 26849]
[3320, 5513, 1839, 5, 0, 3465]
[1641, 9527, 122230, 10310]
[2448, 55614, 2582, 115, 122230, 6286, 336, 2631, 8, 840]
[2538, 9527, 52, 3465, 3880, 0]
[1759, 0, 5507, 30680, 19342, 55, 120, 18227]
[0, 146, 0, 11, 531, 1021827, 1465, 24, 42, 21262]
[0, 6114, 564, 4452, 435, 17, 0, 1033, 0, 8104, 43141, 0, 761, 2160, 0]
[3170, 115, 282, 2538, 3609, 1918, 50052, 4, 1717, 1877, 3237]
[564, 4452, 8415, 0, 2233, 119, 0, 6286]
[1021827, 422, 271, 52, 59, 144, 76, 988, 34, 0]
[48948, 1021827, 14604, 19, 1332]
[1021827, 422, 3853, 34, 0, 2, 11, 56, 59]
[0, 94130]
[24105, 5513, 0, 1319, 1021827, 3247]
[1021827, 0, 0, 1021827, 3011, 1047, 588]
[1021827, 422, 3853, 34, 0, 2, 11, 56, 59]
[367, 1732, 85772, 6644, 427, 1021827, 0, 0, 53768]
[6100, 49284, 0, 374, 0,

[1035, 892, 21, 1679, 2364, 0, 191, 0, 6286, 0, 10974, 832, 0, 1045, 6946, 960, 0, 30, 4778, 0, 11427, 0, 10835, 6580, 18, 892, 5, 11, 313, 0, 3143, 302, 1546, 0, 53768, 37, 53, 162, 0, 677, 1527, 65, 1928, 17835, 0, 291, 0, 0, 121, 2432, 955, 18, 11, 2364, 219, 229, 1, 874, 0, 8854, 2973, 1916, 1546, 3, 37, 13628, 590, 0, 11373, 18, 0, 121, 1178, 0, 1524, 0]
[13651, 5677, 287, 1729, 1562, 4774, 3198, 6100, 49284, 9, 22, 47, 544, 101, 2019, 832, 0, 2, 120, 0, 6762, 0, 0, 7, 0, 0, 80409, 0, 178, 112, 0, 0, 22, 162, 27, 3609, 566, 1, 442, 5353, 0, 8646, 15, 23, 1702, 3252, 41, 0, 0, 51, 0, 0, 2417, 0, 0, 0, 2058, 60, 93062, 19781, 0, 0]
[19768, 443, 0, 25271, 367, 0, 1720, 12968, 4323, 14, 340, 18, 0, 290, 9298, 0, 445467, 343, 3, 0, 2432, 0, 1044, 18, 0, 345, 956, 3, 7732, 629, 1, 11, 198, 0, 2973, 1916, 938, 0, 9, 65, 881, 1989, 812, 29, 66, 4640, 5570, 108, 0, 19, 3063, 0, 51, 264, 3, 892, 19, 11393, 119, 5, 734, 9935, 2240, 0, 7, 2630, 3716, 1720, 12968, 4323, 0, 0, 0, 14, 27, 10168,

[2799, 6974, 0, 2483, 0, 0, 991481, 0, 1377, 27, 1581, 1, 157856, 2504, 3506, 3, 27, 1300, 251, 945, 1084, 3531, 3333, 3103, 10, 799, 188550, 192671, 29, 0, 0, 5269, 239663, 0, 0, 0, 0, 2841, 0, 246, 0, 3103, 0, 13167, 3174, 4091, 0, 8507, 9451, 0, 0, 19098, 3, 1617, 0, 2808, 3103, 4, 1021827, 0, 9480, 0, 0, 70, 45, 8507, 13702, 3103, 806, 1059, 0, 20, 92, 126, 201, 0, 627, 0, 5784, 11, 593, 3, 20, 0, 42, 587, 0, 140, 0, 0, 0, 1439, 2630, 2967, 3, 3103, 35, 654, 0, 0, 1021827, 2071, 1069, 0, 81, 0, 0, 499, 0, 2561, 0, 11, 8507, 1629, 0, 1479, 0, 10544, 37, 1377, 0, 3103, 92, 13, 380, 157856, 8, 101, 6279, 3, 22, 10, 3857, 188550, 192671, 0, 0, 6091, 507, 0, 11, 1581, 120, 1377, 0, 3103, 0, 1259, 590, 1777, 10, 1766, 2076, 17, 11, 1985, 0]
[7, 123616, 956, 2253, 11, 1021827, 2325, 6035, 0, 305603, 2146, 8, 200, 9758, 5023, 125, 108, 15, 0, 33, 89, 5357, 12, 11, 213, 0, 1583, 0, 0, 16, 1, 0, 352, 1405, 0, 11, 123616, 1558, 0, 0, 7, 316598, 23, 16, 482, 11, 6011, 0, 0, 0, 113522, 0, 0, 17

[35790, 0, 0, 3740, 0, 71761, 0, 5513, 3, 15, 4, 10546, 8, 0, 0, 2013, 1021827, 2149, 5, 30, 1699, 0, 0, 1906, 11, 147, 80, 245, 0, 11, 54, 285, 23, 141, 5, 10877, 1021827, 2314, 3092, 2, 35790, 9006, 0, 0, 7, 1021827, 2314, 1494, 23, 16, 15546, 5378, 2, 52, 9006, 5, 11, 1699, 0, 0, 35790, 9006, 199, 89, 962, 41, 2, 0, 0, 1201, 0, 2137, 15, 0, 30, 35790, 1201, 0, 7, 303, 2701, 0, 3612, 3, 71761, 4, 1724, 39, 5245, 1716, 1021827, 0, 0, 137, 1702, 7002, 15, 69, 30, 250, 2314, 3092, 273, 0]
[13194, 740, 5619, 5513, 5, 3320, 3, 801464, 97, 10732, 1021827, 0, 0, 6794, 1, 11, 1217, 5729, 0, 89, 0, 50, 1299, 101, 684, 1964, 5, 5619, 0, 590, 5194, 527, 1021827, 0, 427, 1021827, 3753, 19, 0, 16, 4402, 18, 0, 0, 0, 1021827, 2314, 8055, 0, 0, 332, 0, 9139, 718, 0, 34, 0, 214, 0, 5619, 2540, 11, 578, 100, 0, 2013, 1021827, 0, 105, 1744, 3120, 0, 246, 0, 66, 338, 590, 3013, 0, 6109, 0, 1021827, 2149, 1, 0, 0, 93461, 5, 0, 0, 0, 0, 0, 1634, 8055, 0, 0, 6132, 9381, 5, 0, 0, 0, 0, 0, 0, 0, 127576, 5, 

[305, 254932, 0, 1046, 27, 259, 761, 1201, 8, 5734, 12830, 0, 446, 0, 1010, 1258, 0, 4141, 0, 86, 0, 0, 11, 992, 0, 0, 847, 43, 354, 1, 29, 1367, 2325, 2, 122, 1270, 0, 0, 0, 0, 4875, 122, 160, 2013, 918, 17, 484, 0, 1257, 11, 918, 191, 5, 0, 6365, 0, 5952, 183954, 0, 0, 0, 7, 79, 1104, 2752, 2357, 0, 316, 11, 4872, 955, 0, 42480, 2799, 22157, 48191, 34, 0, 17, 27, 1201, 1361, 18, 0, 254932, 0, 3740, 3, 10, 233, 0, 472, 2149, 0, 0, 17, 79, 484, 0, 0, 11, 352, 0, 1021827, 0, 885, 1873, 2540, 518, 39, 5624, 0]
[12489, 572, 0, 16, 0, 18804, 54880, 0, 0, 0, 1673, 0, 139689, 0, 700, 5, 11, 578, 0, 1367, 0, 11, 267, 11685, 0, 4183, 5, 11, 6989, 0, 9719, 0, 4818, 1744, 64702, 254932, 0, 0, 5984, 0, 2733, 5448, 0, 53187, 0, 3842, 0, 95812, 1093, 46, 11, 1367, 0, 35755, 3390, 0, 1793, 6724, 0, 983, 3, 181449, 11, 175, 0, 4012, 435, 119, 0, 1600, 0, 1908, 934, 0, 99, 1545, 333, 0, 278, 255, 334, 0]
[388953, 18, 9719, 446, 311462, 869402, 0, 11, 92018, 0, 11, 802, 1380, 0, 11, 802, 1116, 669, 0, 

[442, 5151, 6230, 3466, 0, 277, 0, 753, 0, 983, 0, 0, 8, 27, 1720, 1367, 511, 0, 0, 47654, 0, 45, 79, 9795, 0, 29, 46, 439, 46782, 0, 0, 6297, 279, 0, 0, 7, 511, 4, 8782, 17, 11, 370, 0, 2460, 233, 18, 0, 1279, 0, 3219, 0, 29, 0, 2747, 0, 11, 2325, 0]
[7, 56, 1021827, 2771, 342, 0, 29, 38869, 1484, 0, 5912, 104, 11, 1021827, 0, 0, 730, 0, 3801, 0, 0, 0, 18280, 1355, 5228, 0, 3, 113, 0, 7, 1371, 0, 11651, 5, 10132, 0, 110, 4818, 2, 1030, 0, 7, 1720, 1021827, 241, 0, 44, 1178, 2735, 19, 582, 18, 188550, 192671, 222, 255, 334, 0]


Unnamed: 0,event_id,event_title,event_date,bitcoin_value,bitcoin_value_10_days_later,event_maintext,title_label,maintext_label,title_pred_googlenewsembeddings,maintext_pred_googlenewsembeddings
0,91,U.S. regulator demands trading data from Bitco...,11/6/2018,7158.95,6709.39,The U.S. Commodity Futures Trading Commission ...,0,0,1,1
1,90,"South Korean crypto exchange, CoinRail has bee...",10/6/2018,7638.44,6747.77,South Korean crypto exchange Coinrail loses ov...,0,0,1,1
2,89,U.S.Justice Department launches criminal probe...,24/5/2018,7818.21,7608.5,The Justice Department has opened a criminal p...,0,0,1,0
3,88,Prosecutors raid largest South Korean exchange,11/5/2018,9289.09,8371.9,"Prosecutors raided UpBit, the largest cryptocu...",0,0,1,1
4,87,Goldman Sachs announces to open a Bitcoin trad...,2/5/2018,9021.75,8728.95,"One of the largest investment bank announced, ...",1,1,1,1


In [26]:
# Calculate precision and recall
title_cm = metrics.confusion_matrix(_99bitcoin_df['title_label'], _99bitcoin_df['title_pred_googlenewsembeddings'])
print('Title Confusion Matrix (Google News Word Embeddings)')
print(pd.DataFrame(title_cm))
title_report = metrics.precision_recall_fscore_support(_99bitcoin_df['title_label'], _99bitcoin_df['title_pred_googlenewsembeddings'], average='binary')
print ("\n title precision = %0.2f, title recall = %0.2f, title F1 = %0.2f, title accuracy = %0.2f\n" % 
           (title_report[0], title_report[1], title_report[2], 
            metrics.accuracy_score(_99bitcoin_df['title_label'], _99bitcoin_df['title_pred_googlenewsembeddings'])))

maintext_cm = metrics.confusion_matrix(_99bitcoin_df['maintext_label'], _99bitcoin_df['maintext_pred_googlenewsembeddings'])
print('Maintext Confusion Matrix (Google News Word Embeddings)')
print(pd.DataFrame(maintext_cm))
title_report = metrics.precision_recall_fscore_support(_99bitcoin_df['maintext_label'], _99bitcoin_df['maintext_pred_googlenewsembeddings'], average='binary')
print ("\n maintext precision = %0.2f, maintext recall = %0.2f, maintext F1 = %0.2f, maintext accuracy = %0.2f\n" % 
           (title_report[0], title_report[1], title_report[2], 
            metrics.accuracy_score(_99bitcoin_df['maintext_label'], _99bitcoin_df['maintext_pred_googlenewsembeddings'])))

Title Confusion Matrix (Google News Word Embeddings)
    0   1
0  20  25
1  24  22

 title precision = 0.47, title recall = 0.48, title F1 = 0.47, title accuracy = 0.46

Maintext Confusion Matrix (Google News Word Embeddings)
    0   1
0  26  25
1  25  15

 maintext precision = 0.38, maintext recall = 0.38, maintext F1 = 0.38, maintext accuracy = 0.45

