In [134]:
import tensorflow as tf
import pickle
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional, Concatenate

In [135]:
vocabulary = 5000
dimension_of_embedding = 64
words_in_a_text = 50
truncation = 'post'
padding = 'post'
not_in_vocab = '<<none>>'
training_size = 0.8

In [136]:
file = open('processed_list_of_tweets.pkl', 'rb')
text_data = pickle.load(file)
file.close()

In [137]:
file = open('processed_list_of_classes.pkl', 'rb')
text_classes = pickle.load(file)
file.close()

In [138]:
x = (int)(len(text_data) * training_size)

training_text = text_data[0:x]
training_classes = text_classes[0:x]

test_text = text_data[x:]
test_classes = text_classes[x:]

In [139]:
tokenizer = Tokenizer(num_words = vocabulary, oov_token= not_in_vocab)
tokenizer.fit_on_texts(training_text)
word_index = tokenizer.word_index

In [140]:
word_index

{'<<none>>': 1,
 'im': 2,
 'go': 3,
 'get': 4,
 'feel': 5,
 'day': 6,
 'like': 7,
 'love': 8,
 'dont': 9,
 'good': 10,
 'work': 11,
 'one': 12,
 'got': 13,
 'time': 14,
 'want': 15,
 'today': 16,
 'know': 17,
 'cant': 18,
 'make': 19,
 'peopl': 20,
 'u': 21,
 'hate': 22,
 'see': 23,
 'bitch': 24,
 'think': 25,
 'thank': 26,
 'happi': 27,
 'realli': 28,
 'back': 29,
 'need': 30,
 'look': 31,
 'lol': 32,
 'miss': 33,
 'ã': 34,
 'rt': 35,
 'new': 36,
 'still': 37,
 'come': 38,
 'say': 39,
 'mother': 40,
 'friend': 41,
 'home': 42,
 'well': 43,
 'much': 44,
 'night': 45,
 'watch': 46,
 'na': 47,
 'á': 48,
 'hope': 49,
 'would': 50,
 'amp': 51,
 'fuck': 52,
 'tri': 53,
 'thing': 54,
 'take': 55,
 'right': 56,
 'last': 57,
 'way': 58,
 'even': 59,
 'oh': 60,
 'that': 61,
 'sad': 62,
 'great': 63,
 'twitter': 64,
 'nigga': 65,
 'didnt': 66,
 'never': 67,
 'us': 68,
 'week': 69,
 'morn': 70,
 'wish': 71,
 'let': 72,
 'bad': 73,
 'call': 74,
 'ill': 75,
 'your': 76,
 'could': 77,
 'use': 78,
 '

In [141]:
training_text_sequences = tokenizer.texts_to_sequences(training_text)

training_text_padded = pad_sequences(training_text_sequences, maxlen=words_in_a_text, padding=padding, truncating=truncation)

In [142]:
test_text_sequences = tokenizer.texts_to_sequences(test_text)

test_text_padded = pad_sequences(test_text_sequences, maxlen=words_in_a_text, padding=padding, truncating=truncation)

In [143]:
print(set(text_classes))

{'hate', 'neutral', 'sad', 'anger', 'happy'}


In [144]:
class_tokenizer = Tokenizer()
class_tokenizer.fit_on_texts(text_classes)

training_classes_sequences = np.array(class_tokenizer.texts_to_sequences(training_classes))
test_classes_sequences = np.array(class_tokenizer.texts_to_sequences(test_classes))

In [145]:
model = Sequential()
model.add(Embedding(vocabulary,dimension_of_embedding))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(dimension_of_embedding)))
model.add(Dense(6, activation='softmax'))

model.summary()

Model: "sequential_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, None, 64)          320000    
_________________________________________________________________
dropout_19 (Dropout)         (None, None, 64)          0         
_________________________________________________________________
bidirectional_23 (Bidirectio (None, 128)               66048     
_________________________________________________________________
dense_7 (Dense)              (None, 6)                 774       
Total params: 386,822
Trainable params: 386,822
Non-trainable params: 0
_________________________________________________________________


In [146]:
opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [147]:
history = model.fit(training_text_padded, training_classes_sequences, epochs=100, validation_data=(test_text_padded, test_classes_sequences), verbose=2)


Train on 50512 samples, validate on 12628 samples
Epoch 1/100
50512/50512 - 137s - loss: 1.0778 - acc: 0.5494 - val_loss: 0.9372 - val_acc: 0.6268
Epoch 2/100
50512/50512 - 150s - loss: 0.8843 - acc: 0.6492 - val_loss: 0.9245 - val_acc: 0.6309
Epoch 3/100
50512/50512 - 153s - loss: 0.8364 - acc: 0.6710 - val_loss: 0.9371 - val_acc: 0.6265
Epoch 4/100
50512/50512 - 145s - loss: 0.8024 - acc: 0.6863 - val_loss: 0.9492 - val_acc: 0.6240
Epoch 5/100
50512/50512 - 137s - loss: 0.7716 - acc: 0.6961 - val_loss: 0.9682 - val_acc: 0.6216
Epoch 6/100
50512/50512 - 136s - loss: 0.7460 - acc: 0.7084 - val_loss: 0.9743 - val_acc: 0.6224
Epoch 7/100
50512/50512 - 137s - loss: 0.7159 - acc: 0.7201 - val_loss: 1.0121 - val_acc: 0.6178
Epoch 8/100
50512/50512 - 135s - loss: 0.6891 - acc: 0.7302 - val_loss: 1.0400 - val_acc: 0.6129
Epoch 9/100
50512/50512 - 135s - loss: 0.6660 - acc: 0.7396 - val_loss: 1.0702 - val_acc: 0.6107
Epoch 10/100
50512/50512 - 139s - loss: 0.6439 - acc: 0.7478 - val_loss: 1.10

Epoch 85/100
50512/50512 - 254s - loss: 0.2347 - acc: 0.9126 - val_loss: 2.3006 - val_acc: 0.5697
Epoch 86/100


KeyboardInterrupt: 