# Imports

In [35]:
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, Input, LSTM, Dropout
from keras.layers import LSTM, Input

# Supporting Functions

In [36]:
def print_distribution(df, labels_column_name):
    n = df.shape[0]
    print("{} labels frequency:".format(labels_column_name))
    print("Value\tCount\tPercent")
    indeces = df[labels_column_name].value_counts().index.tolist()
    counts = df[labels_column_name].value_counts().tolist()
    for val, count in zip(indeces, counts):
        print("{}\t{}\t{}%".format(val, count, (count / float(n)) * 100))
    
def get_max_words(text_arr):
    max_words = 0
    for line in text_arr:
        num_words = len(line.split())
        if num_words > max_words:
            max_words = num_words
    return max_words

# Load data

In [37]:
# Load in the csv data
reddit_train_df = pd.read_csv("../../data/reddit/labeled/score10_all_sub_labeled_train.csv", index_col=0)
reddit_test_df = pd.read_csv("../../data/reddit/labeled/score10_all_sub_labeled_dev.csv", index_col=0)

# Data setup

In [38]:
# Determine max post length
max_words_train = get_max_words(reddit_train_df.title.values)
max_words_test = get_max_words(reddit_test_df.title.values)
max_words = max(max_words_train, max_words_test)
print("Max number of words per post: {}".format(max_words))

# Label and title columns in datasets
BTC_LABEL_COL, ETH_LABEL_COL, LTC_LABEL_COL = 'bitcoin_one', 'ethereum_one', 'litecoin_one'
TEXT_COL = 'title'

# Print info about each label
print("{} Set Distributions:\n".format('Train'))
print_distribution(reddit_train_df, BTC_LABEL_COL)
print_distribution(reddit_train_df, ETH_LABEL_COL)
print_distribution(reddit_train_df, LTC_LABEL_COL)
print("\n{} Set Distributions:\n".format('Test'))
print_distribution(reddit_test_df, BTC_LABEL_COL)
print_distribution(reddit_test_df, ETH_LABEL_COL)
print_distribution(reddit_test_df, LTC_LABEL_COL)

# Split into x_train and y_train
print('\nGetting x_train, y_train, x_test, and y_test...')
(x_train, y_train_btc, y_train_eth, y_train_ltc) = reddit_train_df[TEXT_COL].values, \
    reddit_train_df[BTC_LABEL_COL], reddit_train_df[ETH_LABEL_COL], reddit_train_df[LTC_LABEL_COL]
(x_test, y_test_btc, y_test_eth, y_test_ltc) = reddit_test_df[TEXT_COL].values, \
    reddit_test_df[BTC_LABEL_COL], reddit_test_df[ETH_LABEL_COL], reddit_test_df[LTC_LABEL_COL]

# Print info about train and test
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print("----------------------------")

Max number of words per post: 78
Train Set Distributions:

bitcoin_one labels frequency:
Value	Count	Percent
1	54066	57.88341095230448%
0	39339	42.11658904769552%
ethereum_one labels frequency:
Value	Count	Percent
1	50802	54.388951340934646%
0	42603	45.61104865906536%
litecoin_one labels frequency:
Value	Count	Percent
1	48582	52.012204914083824%
0	44823	47.987795085916176%

Test Set Distributions:

bitcoin_one labels frequency:
Value	Count	Percent
1	7242	53.4307215582116%
0	6312	46.569278441788406%
ethereum_one labels frequency:
Value	Count	Percent
1	7355	54.26442378633614%
0	6199	45.73557621366386%
litecoin_one labels frequency:
Value	Count	Percent
0	7138	52.66342039250406%
1	6416	47.33657960749594%

Getting x_train, y_train, x_test, and y_test...
93405 train sequences
13554 test sequences
----------------------------


# Model setup (part 1/3)

In [39]:
# Set hyperparameters
max_features = 200000 # Num words in our vocabulary 
maxlen = max_words  # cut texts after this number of words
batch_size = 32  # Mini-batch size
epochs = 10 

# Train tokenizer to create a vocabulary of words
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_train)

# Vectorize each headline
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

# Update x_train and x_test to be 'sequences' of data
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(train_sequences, maxlen=maxlen)
x_test = sequence.pad_sequences(test_sequences, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)  

Pad sequences (samples x time)
x_train shape: (93405, 78)
x_test shape: (13554, 78)


# Model setup (part 2/3)

In [40]:
# Build embedding layer using word2vec
EMBEDDING_FILE = "../../data/embeddings/GoogleNews-vectors-negative300.bin"
EMBEDDING_DIM = 300
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
        
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=False) 

# Model setup (part 3/3)

In [43]:
# Get the actual embeddings
sequence_input = Input(shape=(maxlen,), dtype='int32')
embeddings = embedding_layer(sequence_input)

# Construct the model (attempt 1 - didn't work)
#X = LSTM(128, return_sequences=True)(embeddings)
#X = Dropout(0.5)(X)
#X = LSTM(128, return_sequences=False)(X)
#X = Dropout(0.5)(X)
#X = Dense(1, activation='sigmoid')(X)

# Construct the model (attempt 1 - didn't work)
X = LSTM(128, return_sequences=False)(embeddings)
X = Dense(1, activation='sigmoid')(X)

# BTC Model

In [44]:
# Define the BTC model
model_btc = Model(inputs=sequence_input, outputs=X)

# Compile the BTC model
model_btc.summary()
model_btc.compile(loss='binary_crossentropy', 
                    optimizer='adam', 
                    metrics=['accuracy'])

#loss = mean square error for regression

# Select BTC labels for y
y_train = y_train_btc
y_test = y_test_btc

# Run the BTC model
model_btc.fit(x_train, 
              y_train, 
              batch_size=batch_size, 
              epochs=epochs, 
              validation_data=(x_test, y_test))

score, acc = model.evaluate(x_test, 
                            y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 78)                0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 78, 300)           11135400  
_________________________________________________________________
lstm_10 (LSTM)               (None, 128)               219648    
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 129       
Total params: 11,355,177
Trainable params: 219,777
Non-trainable params: 11,135,400
_________________________________________________________________
Train on 93405 samples, validate on 13554 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


NameError: name 'model' is not defined

# Continue running BTC model

In [46]:
# Run the BTC model
model_btc.fit(x_train, 
              y_train, 
              batch_size=batch_size, 
              epochs=5, 
              validation_data=(x_test, y_test))

score, acc = model_btc.evaluate(x_test, 
                            y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train on 93405 samples, validate on 13554 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 1.5704745515637892
Test accuracy: 0.5082632432966189
