###  1) Load data
    - Load both train and test datasets
    - Print info about each

In [24]:
import numpy as np
import pandas as pd

# Load in the csv data
reddit_train_df = pd.read_csv("../../data/reddit/labeled/all_sub_labeled_train.csv", index_col=0)
reddit_test_df = pd.read_csv("../../data/reddit/labeled/all_sub_labeled_dev.csv", index_col=0)

print("There are {} reddit posts for TRAIN".format(reddit_train_df.shape[0]))
reddit_train_df.head()
print("There are {} reddit posts for TEST".format(reddit_test_df.shape[0]))
reddit_test_df.head()


There are 320935 reddit posts for TRAIN
There are 75157 reddit posts for TEST


Unnamed: 0,date,subreddit,score,num_comments,title,bitcoin_one,bitcoin_two,ethereum_one,ethereum_two,litecoin_one,litecoin_two
320936,01/01/2018,bitcoinmarkets,4,11,Addon for Crypto Prices on Binance (firefox an...,1,1,1,1,0,0
320937,01/01/2018,cryptocurrency,13,0,Nice read on IOTA on hacked.com,1,1,1,1,0,0
320938,01/01/2018,cryptocurrency,8,5,"@iotatokennews is a fake account, no new excha...",1,1,1,1,0,0
320939,01/01/2018,cryptocurrency,3,1,MyWish vs Blockcat vs Etherparty Comparison,1,1,1,1,0,0
320940,01/01/2018,cryptocurrency,5,1,"NUCLEUS VISION ICO Review! IoT-based, Contactl...",1,1,1,1,0,0


###  2) Supporting functions
    - For printing label distributions
    - For determining max length of post titles

In [25]:
def print_distribution(df, labels_column_name):
    n = df.shape[0]
    print("{} labels frequency:".format(labels_column_name))
    print("Value\tCount\tPercent")
    indeces = df[labels_column_name].value_counts().index.tolist()
    counts = df[labels_column_name].value_counts().tolist()
    for val, count in zip(indeces, counts):
        print("{}\t{}\t{}%".format(val, count, (count / float(n)) * 100))
    
def get_max_words(text_arr):
    max_words = 0
    for line in text_arr:
        num_words = len(line.split())
        if num_words > max_words:
            max_words = num_words
    return max_words

###  3) Figure out some stuff about our data
    - What is the max number of words from all the reddit posts?
        - Need to know this for when we vectorize the words, we need to pad the vectors to all be the same length
    - What are the distributions of each data set?

In [26]:
max_words_train = get_max_words(reddit_train_df.title.values)
max_words_test = get_max_words(reddit_test_df.title.values)
max_words = max(max_words_train, max_words_test)

print("Max number of words per post: {}".format(max_words))

Max number of words per post: 78


In [27]:
# Label and title columns in datasets
BTC_LABEL_COL, ETH_LABEL_COL, LTC_LABEL_COL = 'bitcoin_one', 'ethereum_one', 'litecoin_one'
TEXT_COL = 'title'

# Print info about each label
print("{} Set Distributions:\n".format('Train'))
print_distribution(reddit_train_df, BTC_LABEL_COL)
print_distribution(reddit_train_df, ETH_LABEL_COL)
print_distribution(reddit_train_df, LTC_LABEL_COL)
print("\n{} Set Distributions:\n".format('Test'))
print_distribution(reddit_test_df, BTC_LABEL_COL)
print_distribution(reddit_test_df, ETH_LABEL_COL)
print_distribution(reddit_test_df, LTC_LABEL_COL)

# Print info about train and test set sizes
print('\nGetting x_train, y_train, x_test, and y_test...')
(x_train, y_train_btc, y_train_eth, y_train_ltc) = reddit_train_df[TEXT_COL].values, \
    reddit_train_df[BTC_LABEL_COL], reddit_train_df[ETH_LABEL_COL], reddit_train_df[LTC_LABEL_COL]
(x_test, y_test_btc, y_test_eth, y_test_ltc) = reddit_test_df[TEXT_COL].values, \
    reddit_test_df[BTC_LABEL_COL], reddit_test_df[ETH_LABEL_COL], reddit_test_df[LTC_LABEL_COL]

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print("----------------------------")

Train Set Distributions:

bitcoin_one labels frequency:
Value	Count	Percent
1	187970	58.5694922648%
0	132965	41.4305077352%
ethereum_one labels frequency:
Value	Count	Percent
1	172616	53.7853459423%
0	148319	46.2146540577%
litecoin_one labels frequency:
Value	Count	Percent
1	170138	53.0132269774%
0	150797	46.9867730226%

Test Set Distributions:

bitcoin_one labels frequency:
Value	Count	Percent
1	40244	53.5465758346%
0	34913	46.4534241654%
ethereum_one labels frequency:
Value	Count	Percent
1	42176	56.1171946725%
0	32981	43.8828053275%
litecoin_one labels frequency:
Value	Count	Percent
0	38514	51.2447277033%
1	36643	48.7552722967%

Getting x_train, y_train, x_test, and y_test...
(320935, 'train sequences')
(75157, 'test sequences')
----------------------------


### 4) Setup the basics for the initial model

In [28]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM, Input


######### Hyperparameters
max_features = 20000 # Controls the number of words in our vocabulary 
maxlen = max_words  # cut texts after this number of words (among top max_features most common words)
batch_size = 32  # mini-batch size
epochs = 6    


#### Pre-process the data by using Keras Tokenizer 
    - similar to the sklearn CountVectorizer we used before, but more powerful
    - https://keras.io/preprocessing/text/

In [29]:
# First train our Tokenizer to create a vocabulary of words
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_train)

# Second vectorize each headline
# Might want to train a different tokenizer on the test set?
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

# For an RNN, ou need a 'sequence' of data as the input
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(train_sequences, maxlen=maxlen)
x_test = sequence.pad_sequences(test_sequences, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
('x_train shape:', (320935, 78))
('x_test shape:', (75157, 78))


### 5) Model for BTC predictions (using Keras functional API)

#### Construct the model

In [30]:
# Functional API version of the same model
# The functional API is much more useful than the sequential API in terms of adaptability 
input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')

embedding_layer = Embedding(output_dim=128, input_dim=max_features, input_length=maxlen)(input_layer)

lstm_layer = LSTM(128)(embedding_layer)

output = Dense(1, activation='sigmoid')(lstm_layer)

model = Model(inputs=input_layer, 
              outputs=output)
model.summary()

model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 78)                0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 78, 128)           2560000   
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 129       
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________


#### Run the model

In [31]:
# Select BTC labels for y
y_train = y_train_btc
y_test = y_test_btc

# Run
model.fit(x_train, 
          y_train, 
          batch_size=batch_size, 
          epochs=epochs, 
          validation_data=(x_test, y_test))

score, acc = model.evaluate(x_test, 
                            y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train on 320935 samples, validate on 75157 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
('Test score:', 0.95107200784633228)
('Test accuracy:', 0.51397740729802976)


#### Continue running the model (6 epochs was not enough)

In [32]:
# Run
model.fit(x_train, 
          y_train, 
          batch_size=batch_size, 
          epochs=epochs, 
          validation_data=(x_test, y_test))

score, acc = model.evaluate(x_test, 
                            y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train on 320935 samples, validate on 75157 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
('Test score:', 1.6760358921898504)
('Test accuracy:', 0.5055417326444327)


### 5b) Model for BTC predictions (using only posts with score > 10)

In [34]:
# Load in the csv data
reddit_s10_train_df = pd.read_csv("../../data/reddit/labeled/score10_all_sub_labeled_train.csv", index_col=0)
reddit_s10_test_df = pd.read_csv("../../data/reddit/labeled/score10_all_sub_labeled_dev.csv", index_col=0)

# Determine max title length
max_words_train = get_max_words(reddit_s10_train_df.title.values)
max_words_test = get_max_words(reddit_s10_test_df.title.values)
max_words_s10 = max(max_words_train, max_words_test)
print("Max number of words per post: {}".format(max_words_s10))

# Split into x_train and y_train
print('\nGetting x_train, y_train, x_test, and y_test...')
(x_train_s10, y_train_btc_s10, y_train_eth_s10, y_train_ltc_s10) = reddit_s10_train_df[TEXT_COL].values, \
    reddit_s10_train_df[BTC_LABEL_COL], reddit_s10_train_df[ETH_LABEL_COL], reddit_s10_train_df[LTC_LABEL_COL]
(x_test_s10, y_test_btc_s10, y_test_eth_s10, y_test_ltc_s10) = reddit_s10_test_df[TEXT_COL].values, \
    reddit_s10_test_df[BTC_LABEL_COL], reddit_s10_test_df[ETH_LABEL_COL], reddit_s10_test_df[LTC_LABEL_COL]

# New model setup hyperparameter
maxlen_s10 = max_words_s10  # cut texts after this number of words (among top max_features most common words) 

# First train our Tokenizer to create a vocabulary of words
tokenizer_s10 = Tokenizer(num_words=max_features)
tokenizer_s10.fit_on_texts(x_train_s10)

# Second vectorize each headline
train_sequences_s10 = tokenizer_s10.texts_to_sequences(x_train_s10)
test_sequences_s10 = tokenizer_s10.texts_to_sequences(x_test_s10)

# For an RNN, ou need a 'sequence' of data as the input
print('Pad sequences (samples x time)')
x_train_s10 = sequence.pad_sequences(train_sequences_s10, maxlen=maxlen_s10)
x_test_s10 = sequence.pad_sequences(test_sequences_s10, maxlen=maxlen_s10)
print('x_train shape:', x_train_s10.shape)
print('x_test shape:', x_test_s10.shape)

# Construct the model
input_layer = Input(shape=(maxlen_s10,), dtype='int32', name='main_input')
embedding_layer = Embedding(output_dim=128, 
                            input_dim=max_features, 
                            input_length=maxlen_s10)(input_layer)
lstm_layer = LSTM(128)(embedding_layer)
output = Dense(1, activation='sigmoid')(lstm_layer)
model_btc_s10 = Model(inputs=input_layer, 
                       outputs=output)
model_btc_s10.summary()
model_btc_s10.compile(loss='binary_crossentropy', 
                      optimizer='adam', 
                      metrics=['accuracy'])

# Select BTC s10 labels for y
y_train_s10 = y_train_btc_s10
y_test_s10 = y_test_btc_s10

# Run the model
model_btc_s10.fit(x_train_s10, 
          y_train_s10, 
          batch_size=batch_size, 
          epochs=epochs, 
          validation_data=(x_test_s10, y_test_s10))

score, acc = model.evaluate(x_test_s10, 
                            y_test_s10,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Max number of words per post: 78

Getting x_train, y_train, x_test, and y_test...
Pad sequences (samples x time)
('x_train shape:', (84543, 78))
('x_test shape:', (22416, 78))
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 78)                0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 78, 128)           2560000   
_________________________________________________________________
lstm_7 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 129       
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________
Train on 84543 samples, validate on 22416 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/

#### Continue running the model (6 epochs was not enough)

In [35]:
# Run the model
model_btc_s10.fit(x_train_s10, 
          y_train_s10, 
          batch_size=batch_size, 
          epochs=epochs, 
          validation_data=(x_test_s10, y_test_s10))

score, acc = model.evaluate(x_test_s10, 
                            y_test_s10,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train on 84543 samples, validate on 22416 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
('Test score:', 1.7078619607425773)
('Test accuracy:', 0.49741256245538901)


### 6) Model for ETH predictions (using Keras functional API)

#### Construct the model

In [None]:
# Functional API version of the same model
# The functional API is much more useful than the sequential API in terms of adaptability 
input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')

embedding_layer = Embedding(output_dim=128, input_dim=max_features, input_length=maxlen)(input_layer)

lstm_layer = LSTM(128)(embedding_layer)

output = Dense(1, activation='sigmoid')(lstm_layer)

model_eth = Model(inputs=input_layer, 
              outputs=output)
model_eth.summary()

model_eth.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

#### Run the model

In [None]:
# Select ETH labels for y
y_train = y_train_eth
y_test = y_test_eth

# Run
model_eth.fit(x_train, 
          y_train, 
          batch_size=batch_size, 
          epochs=epochs, 
          validation_data=(x_test, y_test))

score, acc = model_eth.evaluate(x_test, 
                            y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

### 7) Model for LTC predictions (using Keras functional API)

#### Construct the model

In [None]:
# Functional API version of the same model
# The functional API is much more useful than the sequential API in terms of adaptability 
input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')

embedding_layer = Embedding(output_dim=128, input_dim=max_features, input_length=maxlen)(input_layer)

lstm_layer = LSTM(128)(embedding_layer)

output = Dense(1, activation='sigmoid')(lstm_layer)

model_ltc = Model(inputs=input_layer, 
              outputs=output)
model_ltc.summary()

model_ltc.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

#### Run the model

In [None]:
# Select LTC labels for y
y_train = y_train_ltc
y_test = y_test_ltc

# Run
model_ltc.fit(x_train, 
          y_train, 
          batch_size=batch_size, 
          epochs=epochs, 
          validation_data=(x_test, y_test))

score, acc = model_ltc.evaluate(x_test, 
                            y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)