In [1]:
import re
import os
import keras
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from pathlib import Path

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/haiduong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/haiduong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
PROJECT_DIR = str(Path.cwd().parent) 

# Load dataset
There are two main functions in this part:
- A function to read [train, test, dev] dataset file.
- Another function to map label.

In [8]:
import s3fs
import h5py
import tensorflow as tf

def load_ds(dataset_type: str, key: str, secret: str, endpoint_url: str):
    s3 = s3fs.S3FileSystem(
        anon=False, 
        key=key, 
        secret=secret, 
        endpoint_url=endpoint_url
    )

    features = []

    with s3.open(f's3://emotiai/goemotion/{dataset_type}.h5', 'rb') as f:
        h5_file = h5py.File(f, 'r')

        # Stack all tensors into a single tensor (if they have the same shape)
        features = h5_file["features"]
        tensored_features = tf.convert_to_tensor(features)

        labels_dataset = h5_file['labels']
        tensored_labels = tf.convert_to_tensor(labels_dataset[:])  
 
    return tensored_features, tensored_labels

In [9]:
import tensorflow as tf
import keras
from keras.api.models import Model

class MLPConfig:
    MAX_TOKEN=20000
    SEQUENCE_LENGTH=13
    EMBEDDING_SIZE=128
    NUM_CLASSES=28
    INITIALIZER="uniform"
    OUTPUT_MODE="int"

class MLP(Model):
    def __init__(self,
                 vectorizer,
                 max_token=MLPConfig.MAX_TOKEN,
                 sequence_length=MLPConfig.SEQUENCE_LENGTH,
                 embedding_size=MLPConfig.EMBEDDING_SIZE, 
                 num_classes=MLPConfig.NUM_CLASSES,
                 initializer=MLPConfig.INITIALIZER,
                 **kwargs):
        super().__init__(**kwargs)
        self.max_token = max_token
        self.sequence_length = sequence_length
        self.embedding_size = embedding_size
        self.initializer = initializer
        self.num_classes = num_classes

        self.vectorizer = vectorizer
        
        self.embedding = keras.layers.Embedding(input_dim=self.max_token, # set input shape
                                            output_dim=self.embedding_size, # set size of embedding vector
                                            embeddings_initializer=self.initializer, # default, intialize randomly
                                            input_length=self.sequence_length, # how long is each input
                                            name="embedding_1") 
        self.dense_layer_1 = keras.layers.Dense(512, activation="relu")
        self.dense_layer_2 = keras.layers.Dense(256, activation="relu")
        self.global_average_1d = keras.layers.GlobalAveragePooling1D()
        self.output_layer = keras.layers.Dense(self.num_classes, activation="sigmoid")
        self.input_shape = (1,)  

    def call(self, inputs):
        tf.random.set_seed(42)

        x = self.vectorizer(inputs)
        x = self.embedding(x)
        x = self.dense_layer_1(x)
        x = self.dense_layer_2(x)
        x = self.global_average_1d(x)
        outputs = self.output_layer(x)
        
        return outputs

    def get_config(self):
        # Return only custom parameters
        return {
            'vectorizer': self.vectorizer,
            'max_token': self.max_token,
            'sequence_length': self.sequence_length,
            'embedding_size': self.embedding_size,
            'num_classes': self.num_classes,
            'initializer': self.initializer,
        }

    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [10]:
from keras.api.metrics import F1Score, Precision, Recall

def classification_metrics(average: str = None):
    f1_name = f'_{average}'
    if average == None:
        f1_name = ''

    return [F1Score(
        name=f'f1_{f1_name}',
        average=average,
    ), 'binary_accuracy', Precision(name="precision"), Recall(name="recall")]
    

In [14]:
import tensorflow as tf

class MLPConfig:
    MAX_TOKEN=20000
    SEQUENCE_LENGTH=13
    EMBEDDING_SIZE=128
    NUM_CLASSES=28
    INITIALIZER="uniform"
    OUTPUT_MODE="int"

X_train, y_train = load_ds("train", key="minio_access_key", secret="minio_secret_key", endpoint_url="http://localhost:9000")
print(X_train)
X_dev, y_dev = load_ds("dev", key="minio_access_key", secret="minio_secret_key", endpoint_url="http://localhost:9000")

# vectorizer = keras.layers.TextVectorization(
#     max_tokens=MLPConfig.MAX_TOKEN,
#     output_mode=MLPConfig.OUTPUT_MODE,
#     output_sequence_length=int(MLPConfig.SEQUENCE_LENGTH),
#     standardize=None
# )
# vectorizer.adapt(X_train)
# mlp = MLP(vectorizer=vectorizer)

# metrics = classification_metrics("macro")
# mlp.compile(
#         loss="binary_crossentropy",
#         optimizer="adam",
#         metrics=metrics
#     )

# mlp.summary()
# mlp.fit(X_train, y_train, epochs=1, validation_data=(X_dev, y_dev))

tf.Tensor(
[b'My favourite food anything I cook myself.'
 b'Now himself, everyone think he laugh screwing people instead actually dead'
 b'WHY THE FUCK IS BAYLESS ISOING' ...
 b'What talking about? Anything bad happened [NAME] fault - good thing [NAME] doing!'
 b'More like baptism, sexy results!' b'Enjoy ride!'], shape=(43410,), dtype=string)


In [13]:
mlp.summary()

In [4]:
path = PROJECT_DIR + "/data/original"

ds = read_ds_file(path)
label = read_label_file(path)

# Data preprocessing
In this part, i will create a function to handle the following preprocessing step:
- Remove extra space
- Word tokenization
- Remove stop words
- Lemmatize word

In [5]:
def remove_extra_space(ds: dict):
    for _, value in ds.items():
        for x in value:
            x["text"] = " ".join(x["text"].split())

def tokenize_sequence(ds: dict):
    for _, value in ds.items():
        for x in value:
            x["tokenized"] = x["text"].split(" ")

def lemmatize(ds: dict):
    stopword = set(stopwords.words('english')) 
    lemmatizer = nltk.WordNetLemmatizer()
    for _, value in ds.items():
        for x in value:
            x["tokenized"] = [lemmatizer.lemmatize(token) for token in x["tokenized"] if token not in stopword]

def preprocess_data(ds: dict):
    remove_extra_space(ds)
    tokenize_sequence(ds)
    lemmatize(ds)

    for _, value in ds.items():
        for x in value:
            x["preprocessed_text"] = ' '.join(x["tokenized"])

preprocess_data(ds)


In [6]:
ds['train'][:5]

[{'text': "My favourite food is anything I didn't have to cook myself.",
  'label': [27],
  'tokenized': ['My', 'favourite', 'food', 'anything', 'I', 'cook', 'myself.'],
  'preprocessed_text': 'My favourite food anything I cook myself.'},
 {'text': 'Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead',
  'label': [27],
  'tokenized': ['Now',
   'himself,',
   'everyone',
   'think',
   'he',
   'laugh',
   'screwing',
   'people',
   'instead',
   'actually',
   'dead'],
  'preprocessed_text': 'Now himself, everyone think he laugh screwing people instead actually dead'},
 {'text': 'WHY THE FUCK IS BAYLESS ISOING',
  'label': [2],
  'tokenized': ['WHY', 'THE', 'FUCK', 'IS', 'BAYLESS', 'ISOING'],
  'preprocessed_text': 'WHY THE FUCK IS BAYLESS ISOING'},
 {'text': 'To make her feel threatened',
  'label': [14],
  'tokenized': ['To', 'make', 'feel', 'threatened'],
  'preprocessed_text': 'To make feel threatened'},
 {'text': 'Dirt

# Prepare data
- This is the final step in data processing step where you have to vectorize tokenized words. There are two main parameters that you need to know: 
    - max_token: The maximum number of vocabulary
    - output_mode: "tf_idf"
    - sequence_length: The sample with the number of tokens smaller than this param will be padded with 0 to match the length. You can take the median or the mean from amount of tokens in all samples. For me, i often use the 95% percentile on the train dataset.
- Transform label using MultiLabelBinarizer() from sklearn

In [7]:
class DatasetConfig:
    MAX_TOKEN=20000
    EMBEDDING_DIM = 128


In [8]:
def find_95_percentile(ds: list):
    return np.percentile([len(x['tokenized']) for x in ds], 95)

In [9]:
def prepare_data(ds, output_mode):
    prepared_data = {x: {} for x in ["train", "test", "dev"]}

    sequence_length = find_95_percentile(ds["train"])

    if output_mode == "int":
        vectorizer = keras.layers.TextVectorization(
            max_tokens=DatasetConfig.MAX_TOKEN,
            output_mode=output_mode,
            output_sequence_length=int(sequence_length),
            standardize=None
        )
    else:
        vectorizer = keras.layers.TextVectorization(
            max_tokens=DatasetConfig.MAX_TOKEN,
            output_mode=output_mode,
            standardize=None
        )
        
    mlb = MultiLabelBinarizer()

    for key, value in ds.items():
        if key == "train":
            vectorizer.adapt([x["preprocessed_text"] for x in value])
            vocab = vectorizer.get_vocabulary()

            labels = mlb.fit_transform([x["label"] for x in value])
        else:
            labels = mlb.transform([x["label"] for x in value])

        features = vectorizer([x["preprocessed_text"] for x in value])  

        prepared_data[key]["features"] = features
        prepared_data[key]["labels"] = labels

    return vocab, prepared_data

In [10]:
vocab, prepared_data = prepare_data(ds, output_mode="tf_idf") # You can change the output_mode to either "tf_idf" or "int"

# Build model
- Here are the list of experiment that I will perform in this notebook:
    - Random Forest
    - MLP
    - 1D Convolution
    - Feature extraction + Bert

In [11]:
X_train, y_train = prepared_data["train"]["features"], prepared_data["train"]["labels"]
X_test, y_test = prepared_data["test"]["features"], prepared_data["test"]["labels"]
X_dev, y_dev = prepared_data["dev"]["features"], prepared_data["dev"]["labels"]

In [136]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

- MLP

In [152]:
vocab, int_prepared_data = prepare_data(ds, output_mode="int")

In [155]:
X_train_int, y_train_int = int_prepared_data["train"]["features"], int_prepared_data["train"]["labels"]
X_test_int, y_test_int = int_prepared_data["test"]["features"], int_prepared_data["test"]["labels"]
X_dev_int, y_dev_int = int_prepared_data["dev"]["features"], int_prepared_data["dev"]["labels"]

In [156]:
mlp = MLP(sequence_length=len(X_train_int[0]))



In [157]:
metrics = classification_metrics("macro")
mlp.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=metrics
    )

mlp.fit(X_train_int, y_train_int, epochs=40, validation_data=(X_dev_int, y_dev_int))

Epoch 1/40
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 19ms/step - binary_accuracy: 0.9543 - f1__macro: 0.0526 - loss: 0.1723 - precision: 0.3737 - recall: 0.0610 - val_binary_accuracy: 0.9642 - val_f1__macro: 0.1743 - val_loss: 0.1164 - val_precision: 0.6366 - val_recall: 0.3426
Epoch 2/40
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 19ms/step - binary_accuracy: 0.9672 - f1__macro: 0.2477 - loss: 0.1051 - precision: 0.7223 - recall: 0.3571 - val_binary_accuracy: 0.9650 - val_f1__macro: 0.2914 - val_loss: 0.1098 - val_precision: 0.6715 - val_recall: 0.3274
Epoch 3/40
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 18ms/step - binary_accuracy: 0.9716 - f1__macro: 0.3946 - loss: 0.0846 - precision: 0.7709 - recall: 0.4648 - val_binary_accuracy: 0.9644 - val_f1__macro: 0.3297 - val_loss: 0.1122 - val_precision: 0.6261 - val_recall: 0.3806
Epoch 4/40
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 18

<keras.src.callbacks.history.History at 0x3146869d0>

# Evaluation

In [161]:
y_dev_pred = clf.predict(X_dev)

In [158]:
mlp.evaluate(X_test_int, y_test_int)

[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - binary_accuracy: 0.9457 - f1__macro: 0.2639 - loss: 0.4909 - precision: 0.3510 - recall: 0.3573


[0.4936329424381256,
 0.265544056892395,
 0.9453725814819336,
 0.347980260848999,
 0.3566124141216278]

In [162]:
print(classification_report(y_dev, y_dev_pred))

              precision    recall  f1-score   support

           0       0.72      0.37      0.49       488
           1       0.79      0.41      0.54       303
           2       0.60      0.19      0.29       195
           3       0.17      0.01      0.01       303
           4       0.65      0.06      0.10       397
           5       0.50      0.01      0.03       153
           6       0.71      0.08      0.14       152
           7       0.59      0.09      0.16       248
           8       0.76      0.29      0.42        77
           9       0.50      0.01      0.01       163
          10       0.20      0.01      0.02       292
          11       0.44      0.12      0.19        97
          12       0.50      0.03      0.05        35
          13       0.28      0.05      0.09        96
          14       0.73      0.12      0.21        90
          15       0.97      0.78      0.86       358
          16       0.00      0.00      0.00        13
          17       0.59    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Make prediction

In [3]:
import mlflow
import pandas as pd
import s3fs
import os

In [5]:
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
model = mlflow.pyfunc.load_model(os.getenv("MLFLOW_REGISTERED_MODEL"))
s3 = s3fs.S3FileSystem(
    anon=False, 
    key=os.getenv("MINIO_ACCESS_KEY"), 
    secret=os.getenv("MINIO_SECRET_KEY"), 
    endpoint_url=os.getenv("MINIO_ENDPOINT_URL")
)

with s3.open(f's3://emotiai/goemotion/vocabulary.parquet', 'rb') as f:
    df = pd.read_parquet(f, engine="pyarrow")

# Stack all tensors into a single tensor (if they have the same shape)
vocabulary = df["vocabulary"]

MlflowException: API request to http://172.18.0.1:5000/api/2.0/mlflow/model-versions/get-download-uri failed with timeout exception HTTPConnectionPool(host='172.18.0.1', port=5000): Max retries exceeded with url: /api/2.0/mlflow/model-versions/get-download-uri?name=goemotion_MLP&version=8 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x12dc83250>, 'Connection to 172.18.0.1 timed out. (connect timeout=120)')). To increase the timeout, set the environment variable MLFLOW_HTTP_REQUEST_TIMEOUT (default: 120, type: int) to a larger value.