# Overview

We will use a Recurrent Neural Network (LSTM) with a softmax output layer to produce a probability for each label.

Requirements:

- Output file from 3-merge-data.ipynb (`data/output/3-merge-data.csv`)

# Install Dependencies

Our environment needs several ML packages to be installed before they can be imported.

## PIP Packages (Optional)

In [2]:
pip install tensorflow numpy pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


## Required Packages

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

2024-07-11 03:00:31.921972: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-11 03:00:31.941371: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-11 03:00:31.941436: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-11 03:00:31.970607: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load Data and Hyperparameters

In [9]:
# --- Tokenizer / sequence settings ---
vocab_size = 10000  # passed to the Tokenizer as num_words
max_len = 100       # every tokenized message is padded/truncated to this length

# --- Training settings ---
epochsCount = 4           # number of epochs passed to model.fit
epochsShuffleData = True  # passed to model.fit as the shuffle flag

# --- Input data ---
# Merged dataset written by 3-merge-data.ipynb
file_path = 'data/output/3-merge-data.csv'
df = pd.read_csv(file_path)

# Split, Tokenize, and Encode Train and Test Data

In [5]:
# Handle NaN values.
# Drop rows missing either the message text or its label: a missing message
# cannot be tokenized, and a missing label would otherwise reach the
# LabelEncoder below as NaN instead of a real class.
df = df.dropna(subset=['singleMessage', 'reason'])

# Extract features and target.
# Cast messages to str so that a cell pandas parsed as a number is still
# tokenized as text instead of breaking the Tokenizer.
X = df['singleMessage'].astype(str)
y = df['reason']

# Split the dataset into training and testing sets (70% training, 30% testing).
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Build the vocabulary from the training messages only, so the test messages
# stay unseen; words without a vocabulary index are encoded as '<OOV>'.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Convert each message into a sequence of integer word indices.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad/truncate every sequence to exactly max_len entries.
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len)

# Encode the target labels.
# The encoder is fitted on every label (train + test) so that transforming the
# test labels cannot fail on a class that is absent from the training split.
label_encoder = LabelEncoder()
label_encoder.fit(y)  # Fit on the entire dataset

y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# One output unit per distinct label.
num_classes = len(label_encoder.classes_)

# One-hot encode the integer labels to match the categorical_crossentropy loss.
y_train_categorical = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_categorical = to_categorical(y_test_encoded, num_classes=num_classes)


# Train

We shuffle the training data on every epoch. Because we want a probability for each label, the output layer uses softmax activation.

In [6]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Seed Python, NumPy and TensorFlow so weight initialisation and shuffling are
# repeatable across "Restart & Run All" (some GPU kernels may still introduce
# small run-to-run differences).
tf.keras.utils.set_random_seed(42)

# Create the RNN model
embedding_dim = 128  # size of each learned word vector

model = Sequential([
    # `input_length` is deprecated in Keras 3 and has been dropped here; the
    # sequence length is inferred from the padded input at fit time.
    Embedding(vocab_size, embedding_dim),
    # return_sequences=True emits the full sequence so the next LSTM can consume it.
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(32, activation='relu'),
    # One unit per label; softmax turns the outputs into class probabilities.
    Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model.
# NOTE(review): the test split doubles as validation data here, so the reported
# val_* metrics are not from a separately held-out set.
# The History object is kept in `history` (instead of leaking as the cell's
# repr) so the per-epoch metrics can be inspected afterwards.
history = model.fit(X_train_padded, y_train_categorical, epochs=epochsCount, validation_data=(X_test_padded, y_test_categorical), shuffle=epochsShuffleData)


Epoch 1/4




[1m2872/2872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 69ms/step - accuracy: 0.9002 - loss: 0.4427 - val_accuracy: 0.9481 - val_loss: 0.2017
Epoch 2/4
[1m2872/2872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 70ms/step - accuracy: 0.9583 - loss: 0.1571 - val_accuracy: 0.9528 - val_loss: 0.1561
Epoch 3/4
[1m2872/2872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 72ms/step - accuracy: 0.9696 - loss: 0.1047 - val_accuracy: 0.9530 - val_loss: 0.1610
Epoch 4/4
[1m2872/2872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 73ms/step - accuracy: 0.9732 - loss: 0.0910 - val_accuracy: 0.9508 - val_loss: 0.1741


<keras.src.callbacks.history.History at 0x7ff5ed4b4a30>

# Save Model


In [8]:
import os
import pickle

model_path = 'models/labeler_model.keras'
objects_path = 'objects/labeler_objects.pkl'

# Make sure the target directories exist: open(..., 'wb') does not create
# missing parent directories, so a fresh checkout would otherwise fail here.
for path in (model_path, objects_path):
    os.makedirs(os.path.dirname(path), exist_ok=True)

# Save the trained model
model.save(model_path)

# Save the fitted preprocessing objects (tokenizer and label encoder) together.
# Pickle files can execute code when loaded, so only load this file from a
# trusted location.
with open(objects_path, 'wb') as f:
    pickle.dump((tokenizer, label_encoder), f)