In [204]:
import tensorflow as tf
import pickle
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional, Concatenate

In [205]:
# --- Text preprocessing settings ---
vocabulary = 5000            # given to the Tokenizer as num_words and to the Embedding as its input size
words_in_a_text = 50         # maxlen every sequence is padded/truncated to
padding = 'post'             # pad_sequences `padding` mode
truncation = 'post'          # pad_sequences `truncating` mode
not_in_vocab = '<<none>>'    # oov_token the Tokenizer substitutes for unknown words

# --- Model / data split settings ---
dimension_of_embedding = 64  # embedding vector size; also reused as the LSTM unit count
training_size = 0.8          # leading fraction of the samples used for training

In [206]:
# Load the pickled text samples (presumably the pre-processed tweets, going by the file name).
# The `with` block closes the handle even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('processed_list_of_tweets2.pkl', 'rb') as file:
    text_data = pickle.load(file)

In [207]:
# Load the pickled class labels that accompany the text samples.
# The `with` block closes the handle even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('processed_list_of_classes2.pkl', 'rb') as file:
    text_classes = pickle.load(file)

In [208]:
# Index of the first held-out sample: the leading `training_size` fraction goes to training.
# NOTE(review): the split is purely positional (no shuffling); confirm the data is not ordered by class.
x = int(len(text_data) * training_size)

# Texts and labels are cut at the same index so they stay aligned.
training_text, test_text = text_data[:x], text_data[x:]
training_classes, test_classes = text_classes[:x], text_classes[x:]

In [209]:
# Fit the word tokenizer on the training split only, so the test split stays unseen.
# `num_words` caps the vocabulary and `oov_token` is the placeholder for unknown words.
tokenizer = Tokenizer(
    num_words=vocabulary,
    oov_token=not_in_vocab,
)
tokenizer.fit_on_texts(training_text)

# word -> integer index mapping built during fitting
word_index = tokenizer.word_index

In [210]:
# Inspect the word -> index mapping learned by the tokenizer (the OOV token sits at index 1).
word_index

{'<<none>>': 1,
 'im': 2,
 'feel': 3,
 'go': 4,
 'day': 5,
 'get': 6,
 'like': 7,
 'love': 8,
 'dont': 9,
 'work': 10,
 'good': 11,
 'got': 12,
 'one': 13,
 'time': 14,
 'know': 15,
 'want': 16,
 'today': 17,
 'see': 18,
 'u': 19,
 'cant': 20,
 'bitch': 21,
 'hate': 22,
 'peopl': 23,
 'make': 24,
 'think': 25,
 'thank': 26,
 'realli': 27,
 'back': 28,
 'happi': 29,
 'lol': 30,
 'need': 31,
 'ã': 32,
 'look': 33,
 'miss': 34,
 'rt': 35,
 'new': 36,
 'still': 37,
 'mother': 38,
 'á': 39,
 'hope': 40,
 'come': 41,
 'friend': 42,
 'well': 43,
 'night': 44,
 'much': 45,
 'say': 46,
 'home': 47,
 'amp': 48,
 'would': 49,
 'watch': 50,
 'na': 51,
 'fuck': 52,
 'tri': 53,
 'thing': 54,
 'that': 55,
 'oh': 56,
 'right': 57,
 'last': 58,
 'take': 59,
 'way': 60,
 'nigga': 61,
 'great': 62,
 'twitter': 63,
 'even': 64,
 'wish': 65,
 'didnt': 66,
 'us': 67,
 'week': 68,
 'morn': 69,
 'ill': 70,
 'call': 71,
 'bad': 72,
 'let': 73,
 'show': 74,
 'never': 75,
 'sad': 76,
 'could': 77,
 'wait': 78,
 

In [211]:
# Encode every training text as a list of word indices.
training_text_sequences = tokenizer.texts_to_sequences(training_text)

# Pad or truncate each encoded text to exactly `words_in_a_text` entries.
training_text_padded = pad_sequences(
    training_text_sequences,
    maxlen=words_in_a_text,
    padding=padding,
    truncating=truncation,
)

In [212]:
# Encode the held-out texts with the tokenizer that was fitted on the training split.
test_text_sequences = tokenizer.texts_to_sequences(test_text)

# Same length normalisation as the training data: `words_in_a_text` entries per sample.
test_text_padded = pad_sequences(
    test_text_sequences,
    maxlen=words_in_a_text,
    padding=padding,
    truncating=truncation,
)

In [213]:
# Show the distinct class labels present in the full data set.
print({label for label in text_classes})

{'anger', 'happy', 'neutral', 'hate', 'sad'}


In [214]:
# Turn the class names into integer ids with a second tokenizer.
# Keras tokenizers number their entries from 1, so id 0 is never produced.
class_tokenizer = Tokenizer()
class_tokenizer.fit_on_texts(text_classes)

# texts_to_sequences returns one list of ids per label; np.array stacks them.
# Presumably shape (n_samples, 1), since each label looks like a single word.
training_classes_sequences = np.array(
    class_tokenizer.texts_to_sequences(training_classes)
)
test_classes_sequences = np.array(
    class_tokenizer.texts_to_sequences(test_classes)
)

In [232]:
# The label tokenizer numbers classes from 1, so the softmax needs one extra
# unit for the never-used id 0 (5 classes -> 6 outputs). Deriving the size from
# the tokenizer keeps the output layer correct if the label set changes.
number_of_outputs = len(class_tokenizer.word_index) + 1

# Embedding -> dropout -> bidirectional LSTM -> softmax classifier.
model = Sequential()
model.add(Embedding(vocabulary, dimension_of_embedding))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(dimension_of_embedding)))
model.add(Dense(number_of_outputs, activation='softmax'))

model.summary()

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, None, 64)          320000    
_________________________________________________________________
dropout_15 (Dropout)         (None, None, 64)          0         
_________________________________________________________________
bidirectional_21 (Bidirectio (None, 128)               66048     
_________________________________________________________________
dense_15 (Dense)             (None, 6)                 774       
Total params: 386,822
Trainable params: 386,822
Non-trainable params: 0
_________________________________________________________________


In [233]:
# Adam optimizer. `learning_rate` is the supported keyword; `lr` is a deprecated alias.
# NOTE(review): newer Keras optimizers no longer accept `decay`; confirm against the
# installed TensorFlow version before upgrading.
opt = tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-6)

# The sparse loss is used because the labels are integer class ids rather than one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [234]:
# Train for two epochs; the held-out split doubles as the validation set.
# verbose=2 logs one summary line per epoch.
history = model.fit(
    x=training_text_padded,
    y=training_classes_sequences,
    epochs=2,
    validation_data=(test_text_padded, test_classes_sequences),
    verbose=2,
)


Train on 47093 samples, validate on 11774 samples
Epoch 1/2
47093/47093 - 210s - loss: 1.0729 - acc: 0.5461 - val_loss: 0.9039 - val_acc: 0.6368
Epoch 2/2
47093/47093 - 203s - loss: 0.8621 - acc: 0.6519 - val_loss: 0.8773 - val_acc: 0.6491


In [222]:
 # Write the current `model` object to 'my_model.h5'.
 # NOTE(review): this cell's execution count (222) is lower than the build/compile/fit
 # cells (232-234), so the file on disk may hold an earlier model; re-run after training.
 model.save('my_model.h5')