## Import the libraries
Importing necessary libraries

In [None]:
import json

import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

## Import the dataset

Download the training file, load it into the workspace, and collect the sentences and labels into lists.

In [None]:
!sudo apt install jq
!rm data.json data.jsonl data.jsonl.gz
!wget https://huggingface.co/datasets/dair-ai/emotion/resolve/main/data/data.jsonl.gz
!gunzip data.jsonl.gz
!jq --slurp . < data.jsonl > data.json
!rm data.jsonl

Reading package lists... Done
Building dependency tree       
Reading state information... Done
jq is already the newest version (1.6-1ubuntu0.20.04.1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
rm: cannot remove 'data.jsonl': No such file or directory
rm: cannot remove 'data.jsonl.gz': No such file or directory
--2023-05-31 05:37:13--  https://huggingface.co/datasets/dair-ai/emotion/resolve/main/data/data.jsonl.gz
Resolving huggingface.co (huggingface.co)... 99.84.191.118, 99.84.191.107, 99.84.191.66, ...
Connecting to huggingface.co (huggingface.co)|99.84.191.118|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/datasets/emotion/8944e6b35cb42294769ac30cf17bd006231545b2eeecfa59324246e192564d1f?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27data.jsonl.gz%3B+filename%3D%22data.jsonl.gz%22%3B&response-content-type=application%2Fgzip&Expires=1685770633&Policy=eyJTdGF0ZW1lbnQiOlt7IlJlc291cm

In [None]:
# Load the JSON array produced by the download cell. JSON text is UTF-8, so the
# encoding is pinned instead of relying on the platform default.
with open("data.json", 'r', encoding='utf-8') as f:
  datastore = json.load(f)

# Collect sentences and labels into parallel lists (same order as the file).
# This happens after the file is closed: only json.load needs the handle.
sentences = [item['text'] for item in datastore]
labels = [item['label'] for item in datastore]

In [None]:
# Hyperparameters shared by the text vectorizer and the embedding layer.
vocab_size = 100_000  # Upper bound on the vocabulary size.
max_len = 32          # Length every vectorized sequence is padded to.
embedding_dim = 32    # Size of each token's embedding vector.

In [None]:
# Hold out 20% of the examples; the fixed random_state keeps the split reproducible.
(training_sentences, testing_sentences,
 training_labels, testing_labels) = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=42,
)

## Model

Establish the text vectorization

In [None]:
# Preprocessing layer that lowercases raw strings and turns them into
# fixed-length sequences of integer token ids.
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize='lower',
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=max_len,
)

# Build the vocabulary from the training sentences only.
vectorize_layer.adapt(training_sentences)

Remove English stop words from the vectorizer's vocabulary

In [None]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

stop_words = stopwords.words('english')
# Set copy for fast membership tests; it also makes duplicate entries in the
# stop-word list harmless (list.remove would raise on a second removal).
stop_word_set = set(stop_words)

# Read the learned vocabulary once (each get_vocabulary() call rebuilds the
# whole list) and keep every token that is not a stop word, preserving order.
vocabs = [token for token in vectorize_layer.get_vocabulary() if token not in stop_word_set]

# NOTE(review): dropping a token from the vocabulary presumably makes the layer
# emit the out-of-vocabulary id for it rather than removing it from the
# sequence — confirm this is the intended effect.
vectorize_layer.set_vocabulary(vocabs)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Build the Model

In [None]:
# Assemble the classifier layer by layer; raw strings go in, class probabilities come out.
model = tf.keras.models.Sequential()

# Input: one raw string per example, vectorized inside the model itself.
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)

# Token ids -> dense vectors, with dropout applied to whole embedding channels.
model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len))
model.add(tf.keras.layers.SpatialDropout1D(rate=0.5))

# Sequence encoder: BiLSTM keeps the per-token outputs so the convolution can
# slide over them; max pooling then collapses the time axis.
model.add(tf.keras.layers.Bidirectional(layer=tf.keras.layers.LSTM(units=32, return_sequences=True)))
model.add(tf.keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu'))
model.add(tf.keras.layers.GlobalMaxPooling1D())

# Classification head: small L2-regularized hidden layer, then a 6-way softmax.
model.add(tf.keras.layers.Dense(units=16, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.001)))
model.add(tf.keras.layers.Dense(units=6, activation='softmax'))

# Summary of Model
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 32)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 32, 32)            3200000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 32, 32)           0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, 32, 64)           16640     
 l)                                                              
                                                                 
 conv1d (Conv1D)             (None, 30, 32)            6176      
                                                        

In [None]:
# Sparse categorical cross-entropy takes integer class ids as targets (no one-hot encoding).
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
num_epochs = 30
# Stop once val_loss has failed to improve for 3 consecutive epochs, and roll the
# model back to the best epoch's weights. Without restore_best_weights the model
# would keep the weights of the last epoch, which is worse than the best one.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Train the model
# NOTE(review): the held-out test split doubles as the validation set here, so
# early stopping is tuned on the same data later used for the preview — consider
# a separate validation split.
history = model.fit(
    training_sentences,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_sentences, testing_labels),
    callbacks=[early_stopping],
    verbose=2,
)

Epoch 1/30
10421/10421 - 863s - loss: 0.2902 - accuracy: 0.8819 - val_loss: 0.1482 - val_accuracy: 0.9238 - 863s/epoch - 83ms/step
Epoch 2/30
10421/10421 - 852s - loss: 0.1518 - accuracy: 0.9239 - val_loss: 0.1382 - val_accuracy: 0.9235 - 852s/epoch - 82ms/step
Epoch 3/30
10421/10421 - 847s - loss: 0.1388 - accuracy: 0.9259 - val_loss: 0.1327 - val_accuracy: 0.9233 - 847s/epoch - 81ms/step
Epoch 4/30
10421/10421 - 848s - loss: 0.1325 - accuracy: 0.9266 - val_loss: 0.1302 - val_accuracy: 0.9259 - 848s/epoch - 81ms/step
Epoch 5/30
10421/10421 - 881s - loss: 0.1285 - accuracy: 0.9275 - val_loss: 0.1293 - val_accuracy: 0.9256 - 881s/epoch - 85ms/step
Epoch 6/30
10421/10421 - 855s - loss: 0.1253 - accuracy: 0.9289 - val_loss: 0.1297 - val_accuracy: 0.9254 - 855s/epoch - 82ms/step
Epoch 7/30
10421/10421 - 864s - loss: 0.1224 - accuracy: 0.9299 - val_loss: 0.1310 - val_accuracy: 0.9251 - 864s/epoch - 83ms/step
Epoch 8/30


## Testing preview

In [None]:
import numpy as np

# Display names for the classes; position i is shown for predicted class id i.
# NOTE(review): the order must match the dataset's integer label ids — confirm against the dataset card.
emotion_labels = ['Sadness', 'Joy', 'Love', 'Anger', 'Fear', 'Surprise']

user_inputs = testing_sentences[:10]
# 0: print only the most likely emotion; 1: print the percentage of every emotion.
result_detail = 0

# Predict emotions for user inputs (one row of class probabilities per sentence)
predictions = model.predict(user_inputs)

# Print the predicted emotions and their percentages for each user input.
# Pairing sentences with their prediction rows avoids re-deriving the whole
# label list for every sentence.
for text, prediction in zip(user_inputs, predictions):
    print(f"Sentence: {text}")
    if result_detail == 1:
        print("Emotion Percentages:")
        for emotion, probability in zip(emotion_labels, prediction):
            print(f"{emotion}: {probability * 100:.2f}%")
    if result_detail == 0:
        print(f"Predicted Emotion: {emotion_labels[int(np.argmax(prediction))]}")
    print()

## Convert into TensorFlow Lite
Convert the model to TensorFlow Lite format, then export it.

In [None]:
# Build a TensorFlow Lite converter from the trained Keras model.
converter = tf.lite.TFLiteConverter.from_keras_model(model)

# Converter settings: default optimizations, the new converter backend, and
# both the TFLite builtin op set and the select-TF-ops set as allowed targets.
converter.experimental_new_converter = True
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,
    tf.lite.OpsSet.SELECT_TF_OPS,
]

tflite_model = converter.convert()

# Persist the converted model; convert() returns bytes, hence binary mode.
tflite_model_path = "./tflite_model_v3.2.tflite"
with open(tflite_model_path, 'wb') as model_file:
    model_file.write(tflite_model)

Export the labels to a JSON file

In [None]:
# Serialize the emotion label list to JSON and write it to ./labels.json
with open('./labels.json', 'w') as labels_file:
  labels_file.write(json.dumps(emotion_labels))

Copy the model from local storage to Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

# Write the converted model's bytes to Drive. The previous version opened the
# target in text mode and wrote a status message into it, leaving a .tflite
# file on Drive that contained no model at all.
with open('/content/gdrive/MyDrive/Course/Bangkit/Capstone/TFLite Models/tflite_model_v3.2.tflite', 'wb') as createdModel:
  createdModel.write(tflite_model)

print('Model saved to drive')