# Sentiment prediction using Tensorflow 2.0

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pandas as pd
import csv
import numpy as np
from tokenizer_class import FullTokenizer, is_number
import multiprocessing as mp
import pickle
from sklearn import preprocessing
from sklearn import metrics

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow.keras.models import load_model

pd.set_option('max_colwidth', 100)
%load_ext autoreload
%autoreload 2

### Word Embeddings
Load the pre-trained GloVe 300 dimensional embedding matrix
Obtained here: https://nlp.stanford.edu/projects/glove/

In [2]:
embedding_file = 'data/glove.6B.300d.txt'

df_embedding = pd.read_csv(embedding_file, sep=' ', index_col=0,
                           header=None, na_values=None,
                           keep_default_na=False,
                           quoting=csv.QUOTE_NONE)
df_embedding.sort_index(inplace=True)

In [3]:
df_embedding.shape

(400000, 300)

In [4]:
df_embedding.sample(10)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sonar,0.33048,0.11685,-0.41126,0.18848,0.32109,-0.81396,-0.051148,-0.33745,-0.20478,-1.1983,...,-0.8584,0.25297,-0.53927,-0.59504,-0.24827,-1.113,0.42782,-0.236,0.37596,-0.427
hearted,-0.20698,-0.40483,-0.26751,-0.59133,-0.064752,0.043845,0.084845,-0.10565,0.011034,-0.076297,...,-0.04176,0.23887,-0.11044,0.23891,0.28237,-0.65284,-0.5831,0.072921,-0.067318,0.51894
dosage,-0.65587,0.82761,-0.055756,0.35771,0.39454,-0.12571,-0.3145,-0.19743,-0.25088,-0.71029,...,-0.33271,0.17184,0.22917,-0.5064,0.22394,-0.71144,-0.51024,-0.29889,-0.090484,-0.32273
lightyears,-0.22943,-0.68283,-0.28061,0.48047,-0.13519,-0.37987,-0.17225,-0.11517,0.55226,0.83984,...,0.39503,0.39518,0.2647,-0.000818,0.009533,-0.87833,-0.37824,0.26746,-0.076411,-0.030604
ragni,-0.13872,0.16107,0.52534,0.042151,-0.4363,-0.48483,0.13435,0.033304,0.36031,0.76894,...,0.46064,0.33886,0.12459,-0.13197,-0.1064,-0.60126,0.25891,1.0986,0.43276,-0.30261
poshteh-ye,-0.56235,0.084654,-0.066012,0.23186,-0.147,-0.51167,0.57268,0.35082,-0.72306,1.0592,...,0.047565,0.55736,0.00669,0.37109,0.26826,-0.045595,-0.53871,0.076358,0.27956,-0.60725
mdivani,0.36266,-0.045557,-0.03423,0.15777,-0.65332,-0.47883,0.23488,-0.12939,0.16024,0.98183,...,-0.099246,0.012156,0.44563,0.029854,0.23729,-0.55347,0.21815,0.68205,0.038624,-0.34259
harriers,0.38212,0.66555,0.36603,0.11827,0.14551,0.10476,0.53845,0.45835,0.065454,0.87427,...,-1.5326,0.97506,-0.19627,-0.61126,-0.78558,-0.45528,0.37727,-0.64013,-0.90012,0.57921
kolah,-0.25497,0.078224,-0.31499,-0.011261,0.041182,-0.24037,-0.041194,0.51986,-0.7784,0.57429,...,-0.34147,0.62657,0.012646,-0.44971,0.1667,-1.0036,-0.15172,0.19794,-0.014791,0.42345
11-week,0.2513,0.84219,0.39661,0.27576,0.087887,0.73749,0.12545,-0.84793,0.40153,0.81314,...,0.18381,0.69554,-0.11428,0.33645,0.027179,0.36147,0.43794,-0.43357,-0.54026,0.42418


#### Cosine Similarity
$\Large \text{sim(U, V)} = \frac {U^T\cdot{V}} {||U||_2 ||V||_2} = cos(\theta)$

<img src="img/cosine_sim.png">

In [5]:
def cosine_similarity(u, v):
    cs = np.dot(u, v) / (np.linalg.norm(u, ord=2) * np.linalg.norm(v, ord=2))
    return cs

In [6]:
cosine_similarity(df_embedding.loc['france'], df_embedding.loc['italy'])

0.5643459466769057

In [7]:
cosine_similarity(df_embedding.loc['ball'], df_embedding.loc['crocodile'])

0.025038104793071447

In [8]:
cosine_similarity(df_embedding.loc['france'] - df_embedding.loc['paris'],
                  df_embedding.loc['rome'] - df_embedding.loc['italy'])


-0.6228158930510513

In [9]:
with open('data/google-10000-english.txt', 'r') as f:
    vocab = f.read().splitlines()
df_embedding_small = df_embedding.reindex(vocab)

In [10]:
def word_analogy(a, b, c):
    emb_a, emb_b, emb_c = df_embedding_small.loc[a], \
                          df_embedding_small.loc[b], \
                          df_embedding_small.loc[c]
    max_sim = -100
    result = None
    for w, emb_w in df_embedding_small.iterrows():
        if w in [a, b, c]:
            continue
        sim = cosine_similarity(emb_b - emb_a, emb_w - emb_c)
        if sim > max_sim:
            max_sim = sim
            result = w
    return result

In [11]:
test_cases = [
    ('paris', 'france', 'london'),
    ('paris', 'france', 'rome'),
    ('paris', 'france', 'madrid'),
    ('paris', 'france', 'berlin'),
    ('man', 'king', 'woman'),
    ('walk', 'walked', 'go'),
    ('man', 'woman', 'boy'),
    ('apple', 'juice', 'mother')
]

for t in test_cases:
    r = word_analogy(t[0], t[1], t[2])
    print('{} -> {} : {} -> {}'.format(t[0], t[1], t[2], r))

paris -> france : london -> britain
paris -> france : rome -> italy
paris -> france : madrid -> spain
paris -> france : berlin -> germany
man -> king : woman -> queen
walk -> walked : go -> went
man -> woman : boy -> girl
apple -> juice : mother -> lemon


#### Load data

In [12]:
news_data = pd.read_csv('data/news_data.csv',
                        usecols = ['TEXT', 'SENTIMENT'])
news_data.drop_duplicates(inplace=True)

In [13]:
news_data.tail()

Unnamed: 0,SENTIMENT,TEXT
611430,0.94,Commonwealth Bank of Australia Upgraded to Buy from Hold by Bell Potter
611431,-0.58,Qbe Insurance Downgraded to Hold from Buy by Bell Potter
611432,0.49,Collective & Roland Team for
611433,-0.58,Mcmillan Shakespeare Downgraded to Neutral from Outperform by Macquarie
611434,0.95,Nearmap Initiated at Outperform by Macquarie


#### Initialize the tokenizer and compress the embedding matrix

Build out a set of all tokens from every `TEXT` record using the full tokenizer (with its vocab initialised from the embedding matrix), then filter the embedding matrix dictionary to only include those tokens that exist in the `TEXT`. Then, re-instantiate the tokenizer using the embedding matrix dictionary. This speeds up training for testing/POC purpose by minimizing the size of the embedding tensor.

In [14]:
# load saved tokenizer?
if True:
    with open('model/tokenizer.pkl', 'rb') as f:
        tokenizer = pickle.load(f)
    df_embedding = df_embedding.reindex(tokenizer.vocab.keys())
    df_embedding.sort_index(inplace=True)
    df_embedding.loc['<START>'] = np.zeros(df_embedding.shape[1])
    df_embedding.loc['<UNK>'] = df_embedding.mean()
    df_embedding.loc['<NUMBER>'] = np.ones(df_embedding.shape[1])/2
    df_embedding.loc['<PAD>'] = np.ones(df_embedding.shape[1])
    
else:
    tokenizer = FullTokenizer(df_embedding.index.values)

    def process(arr):
        for i in map(tokenizer.tokenize, arr):
            tokens.update(i)
        return tokens

    tokens = set()
    if __name__ == '__main__':
        p = mp.Pool(processes=mp.cpu_count())
        split_arr = np.array_split(news_data.TEXT.values, mp.cpu_count())
        pool_results = p.map(process, split_arr)
        p.close()
        p.join()

        all_tokens = set.union(*pool_results)
    
    all_tokens = set([t for t in all_tokens if not is_number(t)])
    df_embedding = df_embedding.reindex(all_tokens).dropna()
    df_embedding.sort_index(inplace=True)
    df_embedding.loc['<START>'] = np.zeros(df_embedding.shape[1])
    df_embedding.loc['<UNK>'] = df_embedding.mean()
    df_embedding.loc['<NUMBER>'] = np.ones(df_embedding.shape[1])/2
    df_embedding.loc['<PAD>'] = np.ones(df_embedding.shape[1])

tokenizer = FullTokenizer(df_embedding.index.values)

Tokenization example

In [15]:
test_sentence = 'Tokenizer takes a string and breaks it down into tokens.'
tokenizer.tokenize(test_sentence)

['tokenizer',
 'takes',
 'a',
 'string',
 'and',
 'breaks',
 'it',
 'down',
 'into',
 'tokens',
 '.']

In [16]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(test_sentence))

[1241, 56549, 1252, 55330, 3529, 8641, 29211, 17049, 28807, 58228, 19]

Visualise tokens in the word embeddings matrix

In [17]:
df_embedding.iloc[tokenizer.convert_tokens_to_ids(tokenizer.tokenize(test_sentence))]

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
<UNK>,0.056789,-0.022786,-0.002645,0.069256,0.010156,-0.036345,0.032409,-0.064922,-0.041482,0.234542,...,0.020075,0.102891,0.090614,-0.024519,-0.07071,-0.191057,0.048234,0.136562,0.08845,-0.031001
takes,-0.26548,-0.10947,0.17989,-0.077156,-0.001871,-0.027737,-0.1809,-0.14575,0.34182,-1.3149,...,-0.27879,0.1281,-0.21373,-0.1484,-0.12381,0.011745,-0.33166,0.31962,-0.32352,0.002213
a,-0.29712,0.094049,-0.096662,-0.344,-0.18483,-0.12329,-0.11656,-0.099692,0.17265,-1.6386,...,0.075972,-0.42426,-0.3967,0.32683,0.62049,0.34719,0.26952,0.059717,-0.22853,0.29602
string,0.49079,0.37355,-0.108,-0.18194,-0.40668,-0.21493,-0.20055,-0.46924,-0.12031,-0.92476,...,0.16177,-0.33249,-0.31958,0.34706,0.11696,0.87068,0.2647,-0.54394,-0.36843,-0.22576
and,0.038466,-0.039792,0.082747,-0.38923,-0.21431,0.1702,-0.025657,0.09578,0.2386,-1.6342,...,0.045194,-0.20405,-0.21097,-0.11025,0.021766,0.44129,0.32797,-0.33427,0.011807,0.059703
breaks,-0.27356,0.012609,-0.44131,0.41077,0.13485,0.14479,0.19091,-0.25705,0.44246,-0.9348,...,-0.30463,0.051168,-0.79409,0.12968,-0.29105,-0.21935,-0.60928,-0.054624,0.38992,0.75237
it,0.033284,-0.040754,-0.048377,0.12017,-0.13915,-0.17694,-0.062908,0.17056,0.20077,-2.4287,...,0.091222,-0.402,0.1543,0.23099,0.086138,-0.002428,0.065196,-0.15408,0.17806,-0.19683
down,-0.081429,-0.11004,-0.031034,0.60457,0.067606,-0.31609,-0.46059,-0.20273,0.46852,-1.6009,...,0.21293,-0.1729,0.01338,0.45064,0.12472,0.54997,-0.19647,-0.39002,-0.062974,0.096852
into,-0.18234,-0.19209,0.15167,-0.048763,0.28354,-0.025568,-0.39882,-0.014072,0.41029,-1.987,...,-0.058528,-0.13367,0.15902,0.072976,0.77943,0.7924,0.073351,-0.24839,0.14468,0.17683
tokens,-0.67142,-0.013502,0.078301,0.009528,-0.039577,0.36174,0.37012,0.12418,0.44907,0.13781,...,-0.51993,-0.11976,-0.075887,0.34228,0.24876,-0.27156,-0.1269,-0.20267,0.23491,-0.90889


#### Define encoding/decoding methods
The `encode` method takes a string and converts it to IDs using the tokenizer dictionary. `decode` performs the opposite action.

In [18]:
def encode(string):
    encoding = ["<START>"] + tokenizer.tokenize(string)
    encoding = tokenizer.convert_tokens_to_ids(encoding)
    return encoding

def decode(ids):
    tokens = tokenizer.convert_ids_to_tokens(ids)
    return tokens

#### Process the dataset
We encode `TEXT` using the tokenizer to obtain IDs that correspond to each token's index within the embedding matrix. We use multiprocessing to process large datasets.

In [19]:
def process(df):
    res = df.TEXT.apply(encode)
    return res

if __name__ == '__main__':
    p = mp.Pool(processes=mp.cpu_count())
    split_dfs = np.array_split(news_data, mp.cpu_count())
    pool_results = p.map(process, split_dfs)
    p.close()
    p.join()

    # merging parts processed by different processes
    dataset = pd.concat(pool_results, axis=0)

Preview the dataset:

In [20]:
dataset.tail()

611430    [1240, 12768, 5885, 41528, 5144, 60208, 58166, 9550, 21961, 26614, 9589, 6649, 45148]
611431                [1240, 46376, 28611, 17057, 58166, 26614, 21961, 9550, 9589, 6649, 45148]
611432                                                    [1240, 12567, 5, 49267, 56981, 21423]
611433                     [1240, 36652, 52076, 17057, 58166, 40196, 21961, 42449, 9589, 35053]
611434                                            [1240, 1241, 28416, 4837, 42449, 9589, 35053]
Name: TEXT, dtype: object

Let's decode the sample:

In [21]:
dataset.tail().apply(decode)

611430    [<START>, commonwealth, bank, of, australia, upgraded, to, buy, from, hold, by, bell, potter]
611431                     [<START>, qbe, insurance, downgraded, to, hold, from, buy, by, bell, potter]
611432                                                      [<START>, collective, &, roland, team, for]
611433       [<START>, mcmillan, shakespeare, downgraded, to, neutral, from, outperform, by, macquarie]
611434                                       [<START>, <UNK>, initiated, at, outperform, by, macquarie]
Name: TEXT, dtype: object

#### Pad all sequences to maxlen (inferred from the dataset) to obtain equal length vectors

In [22]:
maxlen = dataset.apply(len).max() + 10
data = keras.preprocessing.sequence.pad_sequences(dataset.values,
                                           value=tokenizer.vocab["<PAD>"],
                                           padding='post',
                                           maxlen=maxlen)

#### Build the model

<img src="img/lstm_diagram.png">

In [26]:
neurons = 128
dropout = 0.40

my_embedding = layers.Embedding(input_dim=df_embedding.shape[0],
                                output_dim=df_embedding.shape[1],
                                input_length=maxlen,
                                trainable=False)

my_embedding.build((None, ))
my_embedding.set_weights([df_embedding.values])

model = keras.Sequential([
    my_embedding,
    layers.LSTM(neurons, return_sequences=True),
    layers.Dropout(dropout),
    layers.LSTM(neurons, return_sequences=False),
    layers.Dropout(dropout),
    layers.Dense(1, activation='tanh')
])

optimizer = tf.optimizers.Adam(learning_rate=0.001)

def r2_keras(y_true, y_pred):
    SS_res = K.sum(K.square(y_true - y_pred)) 
    SS_tot = K.sum(K.square(y_true - K.mean(y_true))) 
    return (1 - SS_res/(SS_tot + K.epsilon()))

model.compile(loss='mean_squared_error',
                    optimizer=optimizer,
              metrics=[r2_keras])
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 58, 300)           19349400  
_________________________________________________________________
lstm (LSTM)                  (None, 58, 128)           219648    
_________________________________________________________________
dropout (Dropout)            (None, 58, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 19,700,761
Trainable params: 351,361
Non-trainable params: 19,349,400
______________________________________

#### Prepare train/validation sets

In [27]:
train_pct = 0.90

y = news_data.SENTIMENT.values
y_train, y_test = y[:int(data.shape[0] * train_pct)], \
                  y[-int(data.shape[0] * (1-train_pct)):]

X_train, X_test = data[:int(data.shape[0] * train_pct)], \
                  data[-int(data.shape[0] * (1-train_pct)):], \

print('train: {:,}\ntest: {:,}'.format(len(X_train), len(X_test)))

train: 502,692
test: 55,854


#### Fit the model

In [28]:
model.fit(X_train, y_train,
          epochs = 10,
          batch_size = 1024,
          shuffle=True,
          validation_split=0.2,
          validation_data=(X_test, y_test))

Train on 502692 samples, validate on 55854 samples
Epoch 1/10
 23552/502692 [>.............................] - ETA: 10:58 - loss: 0.3332 - r2_keras: -0.4449

KeyboardInterrupt: 

In [45]:
model.fit(X_train, y_train,
          epochs = 10,
          batch_size = 1024,
          shuffle=True,
          validation_split=0.2,
          validation_data=(X_test, y_test))


Train on 502692 samples, validate on 55854 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f02a16dd0f0>

#### Save Model

In [None]:
# model.save('model/tf-lstm-model.h5')
    
# with open('rp_model/tokenizer.pkl', 'wb') as f:
#     pickle.dump(tokenizer, f)

#### Load Model

In [29]:
model.load_weights('model/tf-lstm-model.h5')

Here we create a simple helper function to format text data into machine-readable format

In [30]:
def predict(txt):
    if isinstance(txt, str):
        txt = keras.preprocessing.sequence.pad_sequences(
            [encode(txt)],
            value=tokenizer.vocab["<PAD>"],
            padding='post',
            maxlen=maxlen)
    return model.predict(txt)[0][0]

In [31]:
predict('CEO found guilty of fraud')

-0.54968196

In [32]:
predict('IBM misses earnings expectations')

-0.6829254

In [33]:
predict('''Morgan Stanley analyst Andrew Berens initiates coverage
           on Blueprint Medicines (NASDAQ:BPMC) with a Overweight''')

0.93568856

In [34]:
predict('''Senex Energy Upgraded to Outperform from Neutral by Credit Suisse''')

0.9526496

In [38]:
predict('''Qbe Insurance Downgraded to Hold from Buy by Bell Potter''')

-0.58416826

### Sentiment and Topic Prediction model: Keras functional API

#### Build the model

<img src="img/lstm_bi_2o_diagram.png">

#### Load data

In [39]:
news_data = pd.read_csv('data/news_data.csv',
                        usecols = ['TEXT', 'SENTIMENT', 'TOPIC'])
news_data.drop_duplicates(inplace=True)

In [40]:
news_data.tail()

Unnamed: 0,SENTIMENT,TOPIC,TEXT
611430,0.94,analyst-ratings-change,Commonwealth Bank of Australia Upgraded to Buy from Hold by Bell Potter
611431,-0.58,analyst-ratings-change,Qbe Insurance Downgraded to Hold from Buy by Bell Potter
611432,0.49,partnership,Collective & Roland Team for
611433,-0.58,analyst-ratings-change,Mcmillan Shakespeare Downgraded to Neutral from Outperform by Macquarie
611434,0.95,analyst-ratings-set,Nearmap Initiated at Outperform by Macquarie


In [41]:
from sklearn.preprocessing import LabelBinarizer
encoder_topic = preprocessing.LabelBinarizer()
encoder_topic.fit(news_data.TOPIC.values)
print('# topics: {}\n'.format(len(encoder_topic.classes_)))

# topics: 220



In [45]:
neurons = 128
dropout = 0.20
topic_output_length = encoder_topic.classes_.shape[0]

my_embedding = layers.Embedding(input_dim=df_embedding.shape[0],
                                output_dim=df_embedding.shape[1],
                                input_length=maxlen,
                                trainable=False,
                                name='embedding_layer'
                               )
my_embedding.build((None, ))
my_embedding.set_weights([df_embedding.values])

sequence_input = keras.layers.Input((maxlen, ), name='sequence_input')

X = my_embedding(sequence_input)
X = layers.Bidirectional(
    layers.LSTM(units=neurons,
                return_sequences=True,
                dropout=dropout),
    name='bi_lstm_1')(X)
X = layers.LSTM(units=neurons,
                return_sequences=False,
                dropout=dropout,
                name='lstm_1')(X)

# Two output layers: softmax for Topic and 1D tanh for sentiment
topic_output = layers.Dense(topic_output_length,
                            activation='softmax',
                            name='topic')(X)
sentiment = layers.Dense(units=1,
                         activation='tanh',
                         name='sentiment')(X)

model = keras.models.Model(inputs=[sequence_input],
                           outputs=[topic_output, sentiment],
                           name='topic_sent_model')

optimizer = tf.optimizers.Adam(learning_rate=0.0001)

def r_squared(y_true, y_pred):
    SS_res =  K.sum(K.square(y_true - y_pred)) 
    SS_tot = K.sum(K.square(y_true - K.mean(y_true))) 
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

model.compile(loss={'topic':'categorical_crossentropy',
                    'sentiment':'mean_squared_error'},
                    optimizer=optimizer,
              metrics={'topic':'accuracy',
                       'sentiment':r_squared})

model.summary()

Model: "topic_sent_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sequence_input (InputLayer)     [(None, 58)]         0                                            
__________________________________________________________________________________________________
embedding_layer (Embedding)     (None, 58, 300)      19349400    sequence_input[0][0]             
__________________________________________________________________________________________________
bi_lstm_1 (Bidirectional)       (None, 58, 256)      439296      embedding_layer[0][0]            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 128)          197120      bi_lstm_1[0][0]                  
___________________________________________________________________________________

#### Prepare train/validation sets

In [46]:
train_pct = 0.90
sent = news_data.SENTIMENT.values
topic = encoder_topic.transform(news_data.TOPIC.values)

X_train, X_test = data[:int(data.shape[0] * train_pct)], \
                  data[-int(data.shape[0] * (1-train_pct)):], \

topic_train, topic_test = topic[:int(data.shape[0] * train_pct)], \
                          topic[-int(data.shape[0] * (1-train_pct)):]

sent_train, sent_test = sent[:int(data.shape[0] * train_pct)], \
                        sent[-int(data.shape[0] * (1-train_pct)):]

print('train: {:,}\ntest: {:,}'.format(len(X_train), len(X_test)))

train: 502,692
test: 55,854


#### Fit the model

In [47]:
model.fit(X_train, [topic_train, sent_train],
          epochs = 10,
          batch_size = 1024,
          shuffle=True,
          validation_split=0.1,
          validation_data=(X_test, [topic_test, sent_test]))

W0806 18:09:48.327771 4793939392 deprecation.py:323] From /Users/mkangrga/anaconda3/envs/tf/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 502692 samples, validate on 55854 samples
Epoch 1/10
 26624/502692 [>.............................] - ETA: 19:21 - loss: 5.4235 - topic_loss: 5.1683 - sentiment_loss: 0.2552 - topic_accuracy: 0.0556 - sentiment_r_squared: -0.0954

KeyboardInterrupt: 

#### Load model

In [48]:
model.load_weights('model/tf-lstm-model-topic-sent.h5')
with open('model/encoder_topic.pkl', 'rb') as f:
    encoder_topic = pickle.load(f)
    
# model.save('model/tf-lstm-model-topic-sent.h5')
# with open('model/encoder_topic.pkl', 'wb') as f:
#     pickle.dump(encoder_topic, f)

In [49]:
def predict(txt):
    if isinstance(txt, str):
        txt = keras.preprocessing.sequence.pad_sequences(
            [encode(txt)],
            value=tokenizer.vocab["<PAD>"],
            padding='post',
            maxlen=maxlen)
        predictions = model.predict(txt)
        topic = encoder_topic.inverse_transform(predictions[0])[0]
        sentiment = predictions[1][0][0]
    return topic, sentiment

In [50]:
predict('CEO found guilty of fraud')

('executive-resignation', -0.6195352)

In [51]:
predict('IBM misses revenue expectations')

('revenue', -0.26360473)

In [52]:
predict('''Morgan Stanley analyst Andrew Berens initiates coverage
           on Blueprint Medicines (NASDAQ:BPMC) with a Overweight''')

('analyst-ratings-set', 0.9304361)

In [53]:
predict('''Senex Energy Upgraded to Outperform from Neutral by Credit Suisse''')

('analyst-ratings-change', 0.7553701)

In [54]:
predict('''Qbe Insurance Downgraded to Hold from Buy by Bell Potter''')

('analyst-ratings-change', -0.6919683)