In [1]:
import numpy as np
import pandas as pd

# Load in the csv data
#headlines_df = pd.read_csv("../../data/headlines/labeled/labeled_headlines.csv", index_col=0, parse_dates=[0])
# Forward slashes resolve on Windows, macOS and Linux alike, so a single path
# serves every platform (the previous '..\\..\\data\\...' form only worked on Windows).
TWEETS_CSV_PATH = '../../data/twitter/labeled_tweets.csv'
# NOTE(review): ISO-8859-1 can decode any byte sequence, but non-ASCII tweets show up
# garbled in the preview -- the file may actually be UTF-8; confirm before relying on
# non-English text.
tweets_df = pd.read_csv(TWEETS_CSV_PATH, encoding='ISO-8859-1')

print("There are {} tweets".format(tweets_df.shape[0]))
tweets_df.head()

There are 48262 tweets


Unnamed: 0,Date,Text,BTC_Label,ETH_Label,LTC_Label
0,2017-01-01,Did anyone notice Bitcoin hit $1000 a few hour...,1,1,0
1,2017-01-01,"Bitcoin Price Tops $1,000 in First Day of 2017...",1,1,0
2,2017-01-01,The latest Bitcoin Price Index is 997.75 USD h...,1,1,0
3,2017-01-01,Bitcoinã3å¹´ã¶ãã®1000ãã«ã«åãã£ã...,1,1,0
4,2017-01-01,#Bitcoin predictions from #WebBot @clif_high h...,1,1,0


In [2]:
def print_distribution(df, labels_column_name, data_set_name):
    '''
    Print how often each label value occurs in one column of a data set.

    Writes a small tab-separated table (value, count, percent of all rows)
    to stdout, in the order produced by ``value_counts()``.

    :param df: DataFrame holding the labels
    :param labels_column_name: name of the labels column in df
    :param data_set_name: name of the data set, used in the heading
    :return: nothing
    '''
    total_rows = float(df.shape[0])

    print("{} Set Distributions:\n".format(data_set_name))
    print("{} labels frequency:".format(labels_column_name))
    print("Value\tCount\tPercent")

    frequencies = df[labels_column_name].value_counts()
    # tolist() hands back plain Python values, so counts and percentages are
    # formatted as ordinary ints/floats
    rows = zip(frequencies.index.tolist(), frequencies.tolist())
    for label_value, occurrences in rows:
        share = (occurrences / total_rows) * 100
        print("{}\t{}\t{}%".format(label_value, occurrences, share))
    print("\n")

###  2) Figure out some stuff about our data
    - What is the maximum number of words in any single tweet?
        - We need this for vectorizing: every tweet's word vector is padded to the same length, so that length has to fit the longest tweet

In [3]:
def get_max_words(text_arr):
    '''
    Find the largest whitespace-separated word count among the given texts.

    :param text_arr: iterable of texts; each item is passed through str() first
    :return: word count of the longest text, or 0 if text_arr is empty
    '''
    word_counts = (len(str(text).split()) for text in text_arr)
    return max(word_counts, default=0)

# Longest tweet, in whitespace-separated words, across the whole data set
max_words = get_max_words(tweets_df['Text'].values)
print("Max number of words per tweet: {}".format(max_words))

Max number of words per tweet: 99


### 3) Split the data
    - What are the distributions of each data set?

In [4]:
# Which label/text columns to model, and how many leading rows go to training.
# train_size was hand-picked as 38569 so the cut falls between two days instead of
# inside one (this assumes the rows are ordered by Date -- TODO confirm).
LABEL_COL = 'LTC_Label'
TEXT_COL = 'Text'
train_size = 38569

train_df = tweets_df[:train_size].copy()
test_df = tweets_df[train_size:].copy()

# Guard the hand-picked split index: if the CSV ever changes, fail loudly here
# instead of silently training and testing on tweets from the same day.
assert len(train_df) > 0 and len(test_df) > 0, \
    "train_size={} leaves one side of the split empty".format(train_size)
assert train_df['Date'].iloc[-1] != test_df['Date'].iloc[0], \
    "train_size={} splits a single day across train and test".format(train_size)

print('Splitting data...')
# Hand plain NumPy arrays to the model for both texts and labels; a label Series
# would carry its DataFrame index along (the test one starting at train_size).
(x_train, y_train) = train_df[TEXT_COL].values, train_df[LABEL_COL].values
(x_test, y_test) = test_df[TEXT_COL].values, test_df[LABEL_COL].values
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print("----------------------------")
print_distribution(tweets_df, LABEL_COL, 'Full')
print_distribution(train_df, LABEL_COL, 'Train')
print_distribution(test_df, LABEL_COL, 'Test')

Splitting data...
38569 train sequences
9693 test sequences
----------------------------
Full Set Distributions:

LTC_Label labels frequency:
Value	Count	Percent
1	24410	50.57809456715429%
0	23852	49.42190543284572%


Train Set Distributions:

LTC_Label labels frequency:
Value	Count	Percent
1	19924	51.65806735979673%
0	18645	48.341932640203275%


Test Set Distributions:

LTC_Label labels frequency:
Value	Count	Percent
0	5207	53.71917878881667%
1	4486	46.28082121118332%




### 4) Build the initial model

In [5]:
import keras
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM, Input

######### Hyperparameters
epochs = 6                # number of passes over the training data
batch_size = 32           # mini-batch size
max_features = 20000      # vocabulary size: only the most common words are kept
maxlen = max_words        # every tweet is padded/cut to this many words


Using TensorFlow backend.


#### Pre-process the data by using Keras Tokenizer 
    - similar to the sklearn CountVectorizer we used before, but it can also turn each text into an ordered sequence of word indices, which is what the RNN needs
    - https://keras.io/preprocessing/text/

In [6]:
# Read the raw texts from the split frames rather than from x_train/x_test: those two
# names are rebound to padded integer arrays at the end of this cell, so reading them
# here would make a second run of the cell tokenize stringified arrays instead of tweets.
train_texts = [str(text) for text in train_df[TEXT_COL].values]
test_texts = [str(text) for text in test_df[TEXT_COL].values]

# First train our Tokenizer to create a vocabulary of words.
# It is fitted on the training tweets only, so nothing from the test set leaks in.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_texts)

# Second vectorize each tweet with that same tokenizer. The test set must reuse the
# training vocabulary (a separately fitted tokenizer would assign different indices).
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# For an RNN, you need a 'sequence' of data as the input; pad_sequences brings every
# tweet to exactly maxlen entries.
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(train_sequences, maxlen=maxlen)
x_test = sequence.pad_sequences(test_sequences, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (38569, 99)
x_test shape: (9693, 99)


### Build and train an RNN LSTM model with an Embedding layer input using the Keras Sequential API
 - Input = Embedding layer of size 128 (each word index is mapped to a 128-dimensional vector); this size probably needs tuning
 - Hidden Layer = LSTM with 128 hidden units
 - Output = Dense layer with a single sigmoid unit
 - See: https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py

In [7]:
print('Build model...')
# Embedding -> LSTM -> single sigmoid unit, stacked in that order
model = Sequential([
    Embedding(max_features, 128),
    LSTM(128),  # dropout=0.2, recurrent_dropout=0.2 are left switched off
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

print('Train...')
# The test split doubles as the per-epoch validation data
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

# evaluate() returns the loss followed by the accuracy metric
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         2560000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________
Train...
Train on 38569 samples, validate on 9693 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Test score: 1.61374065076
Test accuracy: 0.501908593812


### Create the same model using the Keras functional API
    - The functional API is more adaptable: e.g. a network can have multiple inputs/outputs, which the Sequential API does not support

In [None]:
# Same network as the Sequential cell, written with the functional API:
# every layer is called on the tensor produced by the previous one.
input_layer = Input(name='main_input', shape=(maxlen,), dtype='int32')
embedding_layer = Embedding(input_dim=max_features,
                            output_dim=128,
                            input_length=maxlen)(input_layer)
lstm_layer = LSTM(128)(embedding_layer)
output = Dense(1, activation='sigmoid')(lstm_layer)

# Tie the input and output tensors together into a trainable model
model = Model(inputs=input_layer, outputs=output)
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

# evaluate() returns the loss followed by the accuracy metric
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)