In [8]:
# Read the raw training file as a list of lines (newlines are kept by readlines()).
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open('train.txt', encoding='utf-8') as f:
    raw_lines_train = f.readlines()

# Peek at the first three raw lines.
raw_lines_train[:3]

['i didnt feel humiliated;sadness\n',
 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake;sadness\n',
 'im grabbing a minute to post i feel greedy wrong;anger\n']

In [9]:
# Read the raw validation file as a list of lines (newlines are kept by readlines()).
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open('val.txt', encoding='utf-8') as f:
    raw_lines_val = f.readlines()

# Peek at the first three raw lines.
raw_lines_val[:3]

['im feeling quite sad and sorry for myself but ill snap out of it soon;sadness\n',
 'i feel like i am still looking at a blank canvas blank pieces of paper;sadness\n',
 'i feel like a faithful servant;love\n']

In [25]:
def replace_text_and_labels(array):
    """Split raw ``text;label`` lines into parallel lists of texts and labels.

    Parameters
    ----------
    array : iterable of str
        Raw lines of the form ``"<text>;<label>"``, each with an optional
        trailing newline. Blank lines are ignored.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(texts, labels)`` where ``texts[i]`` and ``labels[i]`` come from the
        same input line.

    Raises
    ------
    IndexError
        If a non-blank line contains no ``';'`` separator.
    """
    line_text = []
    line_label = []

    for line in array:
        # Drop only the line terminator; the text itself is left untouched.
        line = line.rstrip('\r\n')

        # A blank line (e.g. at the end of the file) carries no example.
        if not line.strip():
            continue

        # The label is whatever follows the LAST ';', so a ';' inside the
        # sentence stays part of the text instead of shifting the label.
        parts = line.rsplit(';', 1)
        text, label = parts[0], parts[1]

        line_text.append(text)
        line_label.append(label)

    return line_text, line_label

# Separate the sentences from their labels for both splits.
train_text, train_label = replace_text_and_labels(raw_lines_train)
val_text, val_label = replace_text_and_labels(raw_lines_val)

# Sanity check: show one "text : label" pair from each split.
print(train_text[0], train_label[0], sep=' : ')
print(val_text[7], val_label[7], sep=' : ')

i didnt feel humiliated : sadness
i feel incredibly lucky just to be able to talk to her : joy


In [44]:
# Length of every training sentence; len() of a str counts characters, not words.
len_array = [len(line) for line in train_text]

# Mean sentence length in characters.
sum(len_array) / len(len_array)

96.8458125

In [53]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Cap on the vectorizer's vocabulary size (original note: 15214 max for this data).
max_vocab = 10000
# Fixed length that every vectorized sequence is given.
max_seq_length = 100

# Lower-cases the text and strips punctuation, then emits integer token ids.
text_vectorizer = TextVectorization(
    max_tokens=max_vocab,
    standardize="lower_and_strip_punctuation",
    output_mode="int",
    output_sequence_length=max_seq_length,
)

# Build the vocabulary from the training sentences only.
text_vectorizer.adapt(train_text)

In [54]:
# Turn both splits into fixed-length integer sequences with the adapted vectorizer.
train_token = text_vectorizer(train_text)
val_token = text_vectorizer(val_text)

# One shape per line, training split first.
print(train_token.shape, val_token.shape, sep='\n')

(16000, 100)
(2000, 100)


In [57]:
# Inspect the token ids of a single vectorized training example.
train_token[5126]

<tf.Tensor: shape=(100,), dtype=int64, numpy=
array([   2,   21,   32,   48, 5340,  128, 2818, 5040,    4,  259,   27,
         23,   61,    6,  179,   52,   12,    7,  187,    8,    2,   93,
        547,   36,  143,    1, 2304, 5651, 3162,    1,   90,  153,    2,
         21,  433,   15,   86,   52,   25,   11,  173,  151,   18,    3,
        284,  618,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int64)>

In [60]:
# Collect the distinct label strings once and reuse them for both prints.
unique_labels = list(set(train_label))

print(f"{len(unique_labels)} unique values in label")
print(unique_labels)

6 unique values in label
['surprise', 'sadness', 'anger', 'joy', 'fear', 'love']


In [63]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit a tokenizer on the label strings so every distinct label gets an integer id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_label)

# Encode both label lists with the same fitted tokenizer.
token_train_label, token_val_label = map(
    tokenizer.texts_to_sequences, (train_label, val_label)
)

# Show the label -> id mapping.
tokenizer.word_index

{'joy': 1, 'sadness': 2, 'anger': 3, 'fear': 4, 'love': 5, 'surprise': 6}

In [82]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer label ids of both splits.
cat_train_labels, cat_val_labels = (
    to_categorical(ids) for ids in (token_train_label, token_val_label)
)

# Column 0 of the one-hot arrays is always zero because the tokenizer's word
# index has no entry for id 0. The column is kept here, so the arrays have one
# more column than there are labels.

print(cat_train_labels.shape, cat_val_labels.shape, sep='\n')

print(cat_train_labels[:5])

(16000, 7)
(2000, 7)
[[0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0.]]


In [83]:
import tensorflow as tf
from tensorflow.keras import layers

# Width of the output layer, taken from the one-hot labels rather than
# hard-coded, so the model always matches the label encoding it is trained on.
num_classes = cat_train_labels.shape[1]

model = tf.keras.models.Sequential([

    # Learn a 320-dimensional vector per token id.
    # mask_zero=True treats id 0 (the padding value) as masked, so the LSTM
    # below skips padded timesteps instead of processing them as real tokens.
    layers.Embedding(
        input_dim = max_vocab,
        output_dim = 320,
        input_length = max_seq_length,
        mask_zero = True
    ),

    # Read the sequence in both directions; outputs are concatenated (2 x 128).
    layers.Bidirectional( layers.LSTM(128) ),

    layers.Dense(64, activation='relu'),

    # One probability per column of the one-hot labels.
    layers.Dense(num_classes, activation='softmax')

])

model.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy']
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 320)          3200000   
                                                                 
 bidirectional (Bidirectiona  (None, 256)              459776    
 l)                                                              
                                                                 
 dense (Dense)               (None, 64)                16448     
                                                                 
 dense_1 (Dense)             (None, 7)                 455       
                                                                 
Total params: 3,676,679
Trainable params: 3,676,679
Non-trainable params: 0
_________________________________________________________________


In [86]:
# Train on a slice of the data only: the first 2000 training rows,
# validated against the first 500 validation rows.
train_inputs, train_targets = train_token[:2000], cat_train_labels[:2000]
val_subset = (val_token[:500], cat_val_labels[:500])

history = model.fit(
    train_inputs,
    train_targets,
    epochs=5,
    validation_data=val_subset,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [103]:
# A single hand-written sentence to try the model on.
test = ["i dont want to lose him"]
# Vectorize it with the same vectorizer that was applied to the training text.
vect_test = text_vectorizer(test)

# Display the resulting token ids.
vect_test

<tf.Tensor: shape=(1, 100), dtype=int64, numpy=
array([[  2,  88,  67,   5, 811,  74,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0]], dtype=int64)>

In [104]:
# Class probabilities for the single test sentence (one row).
prediction = model.predict(vect_test)

# Take the label names from the fitted tokenizer so they cannot drift from the
# encoding used for the training labels. Output positions with no tokenizer
# entry (position 0, since label ids start at 1) are shown as 'nan'.
test_labels = [
    tokenizer.index_word.get(i, 'nan') for i in range(len(prediction[0]))
]

# Print each class probability as a floored percentage.
for label, prob in zip(test_labels, prediction[0]):
    print(f"{(prob*100)//1.00} % its {label}")

0.0 % its nan
27.0 % its joy
0.0 % its sadness
0.0 % its anger
0.0 % its fear
69.0 % its love
2.0 % its surprise
