# Imports

In [16]:
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, Input, LSTM, Dropout
from keras.layers import LSTM, Input


# Supporting Functions

In [17]:
def print_distribution(df, labels_column_name):
    """Print a tab-separated frequency table for one label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to summarise.
    labels_column_name : str
        Name of the column whose distinct values are counted.

    Returns
    -------
    None
        The table is written to stdout: a title line, a header line, then one
        line per distinct value with its count and its percentage of all rows
        in ``df``, in the order ``value_counts()`` yields them.
    """
    total_rows = float(df.shape[0])
    print("{} labels frequency:".format(labels_column_name))
    print("Value\tCount\tPercent")

    # Count once and reuse the result for both the values and their counts.
    frequencies = df[labels_column_name].value_counts()
    rows = zip(frequencies.index.tolist(), frequencies.tolist())
    for label, occurrences in rows:
        share = (occurrences / total_rows) * 100
        print("{}\t{}\t{}%".format(label, occurrences, share))
    
def get_max_words(text_arr):
    """Return the largest whitespace-separated word count among the texts.

    Parameters
    ----------
    text_arr : iterable of str
        Texts to measure; each one is split with ``str.split()``.

    Returns
    -------
    int
        Word count of the longest text, or 0 when ``text_arr`` is empty.
    """
    return max((len(text.split()) for text in text_arr), default=0)

# Load data

In [18]:
# Read the labeled Reddit splits; the first CSV column becomes the index.
# Note: the frame named "test" in this notebook is read from the *dev* file.
reddit_train_df = pd.read_csv(
    "../../data/reddit/labeled/score10_all_sub_labeled_train.csv",
    index_col=0,
)
reddit_test_df = pd.read_csv(
    "../../data/reddit/labeled/score10_all_sub_labeled_dev.csv",
    index_col=0,
)

# Data setup

In [19]:
# Label and title columns in datasets. Declared first so that every column
# lookup in this cell goes through these constants.
BTC_LABEL_COL, ETH_LABEL_COL, LTC_LABEL_COL = 'bitcoin_one', 'ethereum_one', 'litecoin_one'
TEXT_COL = 'title'
LABEL_COLS = (BTC_LABEL_COL, ETH_LABEL_COL, LTC_LABEL_COL)

# Determine max post length (whitespace-separated words) across both splits;
# this value is later used as the padded sequence length.
max_words_train = get_max_words(reddit_train_df[TEXT_COL].values)
max_words_test = get_max_words(reddit_test_df[TEXT_COL].values)
max_words = max(max_words_train, max_words_test)
print("Max number of words per post: {}".format(max_words))

# Print the class balance of each label column, per split
print("{} Set Distributions:\n".format('Train'))
for label_col in LABEL_COLS:
    print_distribution(reddit_train_df, label_col)
print("\n{} Set Distributions:\n".format('Test'))
for label_col in LABEL_COLS:
    print_distribution(reddit_test_df, label_col)

# Split into inputs (raw title strings) and one label Series per coin
print('\nGetting x_train, y_train, x_test, and y_test...')
x_train = reddit_train_df[TEXT_COL].values
y_train_btc = reddit_train_df[BTC_LABEL_COL]
y_train_eth = reddit_train_df[ETH_LABEL_COL]
y_train_ltc = reddit_train_df[LTC_LABEL_COL]

x_test = reddit_test_df[TEXT_COL].values
y_test_btc = reddit_test_df[BTC_LABEL_COL]
y_test_eth = reddit_test_df[ETH_LABEL_COL]
y_test_ltc = reddit_test_df[LTC_LABEL_COL]

# Print info about train and test
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print("----------------------------")

Max number of words per post: 78
Train Set Distributions:

bitcoin_one labels frequency:
Value	Count	Percent
1	49501	58.551269768047035%
0	35042	41.448730231952965%
ethereum_one labels frequency:
Value	Count	Percent
1	45893	54.283618986787786%
0	38650	45.71638101321221%
litecoin_one labels frequency:
Value	Count	Percent
1	44382	52.4963627976296%
0	40161	47.50363720237039%

Test Set Distributions:

bitcoin_one labels frequency:
Value	Count	Percent
1	11807	52.672198429693076%
0	10609	47.327801570306924%
ethereum_one labels frequency:
Value	Count	Percent
1	12263	54.7064596716631%
0	10153	45.2935403283369%
litecoin_one labels frequency:
Value	Count	Percent
0	11800	52.64097073518915%
1	10616	47.35902926481085%

Getting x_train, y_train, x_test, and y_test...
84543 train sequences
22416 test sequences
----------------------------


# Model setup

In [None]:
# Set hyperparameters
max_features = 200000  # Upper bound on the vocabulary size given to the Tokenizer
maxlen = max_words  # cut texts after this number of words
batch_size = 32  # Mini-batch size
epochs = 10

# Fit the Tokenizer in this cell: the embedding matrix below is indexed by
# tokenizer.word_index, so the tokenizer must exist before it is built
# (otherwise this cell raises NameError on a fresh kernel). It is fitted on the
# raw training titles taken straight from the dataframe, so the cell can be
# re-run even after x_train has been replaced by padded integer sequences.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(reddit_train_df[TEXT_COL].values)

# Build embedding layer using word2vec
EMBEDDING_FILE = "../../data/embeddings/GoogleNews-vectors-negative300.bin"
EMBEDDING_DIM = 300
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))+1

# Row i holds the pretrained vector of the word with tokenizer index i.
# Row 0 and rows of words missing from word2vec stay all-zero.
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= nb_words:
        # word_index may list more words than the matrix has rows when the
        # vocabulary exceeds max_features; skip them instead of raising
        # IndexError.
        continue
    # Membership test and indexing on KeyedVectors work on both gensim 3.x and
    # 4.x, unlike the `.vocab` attribute, which gensim 4 removed.
    if word in word2vec:
        embedding_matrix[i] = word2vec[word]

# Frozen embedding layer initialised with the pretrained vectors
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=False)

# BTC Model

In [21]:
# First train our Tokenizer to create a vocabulary of words.
# The raw titles are read from the dataframes rather than from x_train/x_test,
# because those names are overwritten with padded integer sequences below;
# reading from the dataframes keeps this cell safe to re-run.
train_texts = reddit_train_df[TEXT_COL].values
test_texts = reddit_test_df[TEXT_COL].values
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_texts)

# Second vectorize each headline
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# For an RNN, you need a 'sequence' of data as the input
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(train_sequences, maxlen=maxlen)
x_test = sequence.pad_sequences(test_sequences, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# Get the actual embeddings
sequence_input = Input(shape=(maxlen,), dtype='int32')
embeddings = embedding_layer(sequence_input)

# Construct the model: two stacked LSTMs with dropout after each, followed by
# a single sigmoid unit for the binary label
X = LSTM(128, return_sequences=True)(embeddings)
X = Dropout(0.5)(X)
X = LSTM(128, return_sequences=False)(X)
X = Dropout(0.5)(X)
X = Dense(1, activation='sigmoid')(X)

model_btc = Model(inputs=sequence_input, outputs=X)

# Compile the model
model_btc.summary()
model_btc.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

# Select BTC s10 labels for y
y_train = y_train_btc
y_test = y_test_btc

# Run the model
model_btc.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_test, y_test))

# Evaluate the model that was just trained (model_btc; there is no `model`)
score, acc = model_btc.evaluate(x_test,
                                y_test,
                                batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Pad sequences (samples x time)
x_train shape: (84543, 78)
x_test shape: (22416, 78)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 78)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 78, 300)           10570800  
_________________________________________________________________
lstm_2 (LSTM)                (None, 78, 128)           219648    
_________________________________________________________________
dropout_1 (Dropout)          (None, 78, 128)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
__________________________________________________________

KeyboardInterrupt: 