In [4]:
import numpy as np
import pandas as pd

# Load in the csv data.
# index_col=0 turns the first CSV column into the row index, so positional
# parse_dates=[0] pointed at that index column and left the timestamps in
# 'postdate' as plain strings. Name the column explicitly so it is parsed
# into datetimes.
headlines_df = pd.read_csv(
    "../../data/headlines/labeled/labeled_headlines.csv",
    index_col=0,
    parse_dates=["postdate"],
)

print("There are {} headlines".format(headlines_df.shape[0]))
headlines_df.head()

There are 3758 headlines


Unnamed: 0,postdate,source,headline,btc_label,ltc_label,eth_label
0,2017-01-01 13:50:25,coindesk.com,coindesk's charles bovaird asks the experts fo...,1,1,1
1,2017-01-01 15:33:53,coindesk.com,the blockchain industry is likely to see growt...,1,1,1
2,2017-01-01 21:04:02,coindesk.com,"the price of bitcoin passed 1,000 during the ...",1,1,1
3,2017-01-02 12:07:00,coindesk.com,do 2016's political changes foreshadow blockch...,1,0,1
4,2017-01-02 14:43:09,coindesk.com,bitspark's george harrap lists his takeaways f...,1,0,1


In [4]:
def print_distribution(df, labels_column_name, data_set_name):
    '''
    Print how often each distinct value occurs in one column of a data set.

    :param df: DataFrame holding the column to summarise
    :param labels_column_name: name of the labels column in df
    :param data_set_name: display name used in the report header
    :return: nothing; the report is written to stdout
    '''
    total_rows = df.shape[0]
    # Count once and reuse the result for both the values and their counts
    frequencies = df[labels_column_name].value_counts()

    print("{} Set Distributions:\n".format(data_set_name))
    print("{} labels frequency:".format(labels_column_name))
    print("Value\tCount\tPercent")
    for label, occurrences in zip(frequencies.index.tolist(), frequencies.tolist()):
        share = (occurrences / float(total_rows)) * 100
        print("{}\t{}\t{}%".format(label, occurrences, share))
    print("\n")

###  2) Figure out some stuff about our data
    - What is the max number of words from all the headlines?
        - We need this when vectorizing the headlines, because every sequence must be padded to the same length

In [5]:
def get_max_words(text_arr):
    '''
    Find the largest whitespace-separated word count among the given texts.

    :param text_arr: iterable of strings
    :return: word count of the longest text, or 0 when text_arr is empty
    '''
    word_counts = (len(text.split()) for text in text_arr)
    return max(word_counts, default=0)

# Longest headline, measured in whitespace-separated words, over every row of headlines_df
max_words = get_max_words(headlines_df["headline"].values)
print("Max number of words per headline: {}".format(max_words))

Max number of words per headline: 30


### 3) Split the data
    - What are the distributions of each data set?

In [6]:
# How many headlines to use for training -- Had to use 3006 to make sure to not split between a day
LABEL_COL = 'btc_label'
TEXT_COL = 'headline'
train_size = 3006

# Positional split: the first train_size rows train the model, the rest are held out
train_df = headlines_df[:train_size].copy()
test_df = headlines_df[train_size:].copy()

print('Splitting data...')
(x_train, y_train) = train_df[TEXT_COL].values, train_df[LABEL_COL]
(x_test, y_test) = test_df[TEXT_COL].values, test_df[LABEL_COL]
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print("----------------------------")
# Report on LABEL_COL (not a hard-coded column name) so the printed
# distributions always describe the labels that y_train / y_test contain.
print_distribution(headlines_df, LABEL_COL, 'Full')
print_distribution(train_df, LABEL_COL, 'Train')
print_distribution(test_df, LABEL_COL, 'Test')

Splitting data...
3006 train sequences
752 test sequences
----------------------------
Full Set Distributions:

btc_label labels frequency:
Value	Count	Percent
1	2231	59.366684406599255%
0	1527	40.633315593400745%


Train Set Distributions:

btc_label labels frequency:
Value	Count	Percent
1	1824	60.67864271457086%
0	1182	39.321357285429144%


Test Set Distributions:

btc_label labels frequency:
Value	Count	Percent
1	407	54.12234042553191%
0	345	45.87765957446808%




### 4) Build the initial model

In [9]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM, Input


######### Hyperparameters
# Vocabulary size: passed to the Tokenizer as num_words and to the Embedding layers as their input dimension
max_features = 20000
# Sequence length: every headline is padded/cut to this many tokens by pad_sequences
# (max_words is the longest headline measured earlier in the notebook)
maxlen = max_words
# Mini-batch size used by model.fit and model.evaluate
batch_size = 32
# Number of training epochs passed to model.fit
epochs = 6


#### Pre-process the data by using Keras Tokenizer 
    - similar to the sklearn CountVectorizer we used before, but more powerful
    - https://keras.io/preprocessing/text/

In [8]:
# First train our Tokenizer to create a vocabulary of words.
# The raw text is read from train_df / test_df instead of x_train / x_test:
# this cell overwrites x_train / x_test with padded integer arrays further
# down, so tokenizing those variables would break the cell when it is re-run.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_df[TEXT_COL].values)

# Second vectorize each headline.
# The test set deliberately reuses the tokenizer fitted on the training set:
# a separately fitted tokenizer would give the same words different integer ids,
# and the embedding learned during training would no longer match the test input.
train_sequences = tokenizer.texts_to_sequences(train_df[TEXT_COL].values)
test_sequences = tokenizer.texts_to_sequences(test_df[TEXT_COL].values)

# For an RNN, you need a 'sequence' of data as the input
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(train_sequences, maxlen=maxlen)
x_test = sequence.pad_sequences(test_sequences, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (3006, 30)
x_test shape: (752, 30)


### Build and train an RNN LSTM model with an Embedding layer input using the Keras Sequential API
 - Input = Embedding layer of size 128 (each word is mapped to a 128-dimensional vector); this size probably needs tuning
 - Hidden Layer = LSTM with 128 hidden units
 - Output = Sigmoid 
 - See: https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py

In [20]:
print('Build model...')
# Embedding (128-dim vectors) -> LSTM (128 units) -> one sigmoid output unit
model = Sequential([
    Embedding(max_features, 128),
    LSTM(128),  # dropout=0.2, recurrent_dropout=0.2 are currently disabled
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()
print('Train...')
# The held-out test split is also passed as validation data for per-epoch monitoring
model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          epochs=epochs,
          batch_size=batch_size)

# evaluate yields the loss followed by the single compiled metric (accuracy)
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)

print('Test score:', score)
print('Test accuracy:', acc)

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, None, 128)         2560000   
_________________________________________________________________
lstm_10 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 129       
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________
Train...
Train on 3006 samples, validate on 752 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Test accuracy: 0.51329787234


### Create the same model using the Keras functional API
    - The functional API will be more useful due to adaptability; i.e. you can have multiple inputs/outputs from the NN, which you can't do with the sequential API

In [21]:
# Functional API version of the same model: each layer is called on the
# tensor produced by the previous one, and Model ties input to output.
input_layer = Input(name='main_input', shape=(maxlen,), dtype='int32')
embedding_layer = Embedding(input_dim=max_features,
                            output_dim=128,
                            input_length=maxlen)(input_layer)
lstm_layer = LSTM(units=128)(embedding_layer)
output = Dense(units=1, activation='sigmoid')(lstm_layer)

model = Model(inputs=input_layer, outputs=output)
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# The held-out test split is also passed as validation data for per-epoch monitoring
model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          epochs=epochs,
          batch_size=batch_size)

# evaluate yields the loss followed by the single compiled metric (accuracy)
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 30)                0         
_________________________________________________________________
embedding_11 (Embedding)     (None, 30, 128)           2560000   
_________________________________________________________________
lstm_11 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 129       
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________
Train on 3006 samples, validate on 752 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Test score: 2.24482708028
Test accuracy: 0.551861702128
