## Import packages

In [61]:
import os
import pickle
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

## Load the file

In [62]:

# The processed data lives in <parent of the current working directory>/data/processed,
# so this cell depends on the directory the kernel was started from.
directory = os.path.join(
    os.path.dirname(os.getcwd()),
    'data',
    'processed',
)
preprocessed_data_path = os.path.join(directory, 'preprocessed_data.pkl')

# NOTE: unpickling can execute arbitrary code; only load a file produced by a trusted pipeline.
with open(preprocessed_data_path, mode='rb') as file:
    preprocessed_data = pickle.load(file)

In [63]:


# Pull the individual artefacts out of the loaded dictionary in one unpacking step.
# A missing key raises KeyError, exactly as separate lookups would.
padded_sequences, input_shape, num_classes, vocab_size, preprocessed_df = (
    preprocessed_data[key]
    for key in (
        'padded_sequences',
        'input_shape',
        'num_classes',
        'vocab_size',
        'preprocessed_df',
    )
)

In [64]:

# Build a fresh Tokenizer and fit it on the 'description' column only
# (the labels are encoded separately with a LabelEncoder further down).
#
# num_words=vocab_size caps the ids produced by texts_to_sequences() to values
# below vocab_size, so they always fit the Embedding layer, whose input
# dimension is the vocab_size loaded from the pickle. Without the cap, a
# tokenizer rebuilt here could emit ids the embedding table has no row for.
# NOTE(review): the tokenizer is fitted on every row, including those that
# later end up in the test split -- confirm that this is intended.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(preprocessed_df['description'])  # Assuming 'description' is the column containing text data

In [65]:
# Hold out 20% of the rows for testing. Both columns go through the same call,
# so descriptions and categories stay aligned row for row; random_state pins the shuffle.
# At this point train_sequences / test_sequences still hold the raw 'description' values.
train_sequences, test_sequences, train_labels, test_labels = train_test_split(
    *(preprocessed_df[column] for column in ('description', 'category')),
    random_state=42,
    test_size=0.2,
)


In [66]:

# Convert the target labels to integer encoding.
# The encoder is fitted on the complete 'category' column rather than on the
# training labels alone: LabelEncoder.transform() raises ValueError for a label
# it has not seen, which would happen whenever a category lands only in the
# test split. Fitting on all labels also keeps the class ids independent of
# how the split happened to fall.
label_encoder = LabelEncoder()
label_encoder.fit(preprocessed_df['category'])
train_labels_encoded = label_encoder.transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)

# Get the padded sequences for the training and testing data.
# NOTE: these two assignments overwrite the raw 'description' values produced by
# the split cell with integer arrays, so this cell is not safe to re-run on its
# own -- re-run the split cell first.
train_sequences = pad_sequences(tokenizer.texts_to_sequences(train_sequences), maxlen=input_shape[0])
test_sequences = pad_sequences(tokenizer.texts_to_sequences(test_sequences), maxlen=input_shape[0])

# Define early stopping: stop after 5 epochs without val_loss improvement and
# roll the model back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [67]:
# Define the model: embedding -> average over the sequence axis -> dense -> softmax.
# The sequence length is declared with an explicit Input instead of the
# Embedding `input_length` argument, which is deprecated in Keras 3.
# NOTE(review): pad_sequences pads with 0 by default and those positions are
# averaged together with real tokens here; consider mask_zero=True on the Embedding.
model = Sequential([
    tf.keras.Input(shape=(input_shape[0],), dtype='int32'),
    Embedding(vocab_size, 64),
    tf.keras.layers.GlobalAveragePooling1D(),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax')  # Ensure the output layer matches the number of classes
])



In [68]:
# Configure training: Adam optimiser, sparse categorical cross-entropy
# (the targets are the integer-encoded labels), accuracy as the reported metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Sanity check before fitting: one encoded label per padded training sequence.
assert len(train_labels_encoded) == len(train_sequences), (
    "Number of samples in input data and target labels must be the same"
)

In [69]:

# Fit for at most 50 epochs in batches of 32; early stopping may end training sooner.
# The held-out test split doubles as the validation set here, so val_loss (and
# therefore the early-stopping decision) is computed on the test data.
history = model.fit(
    x=train_sequences,
    y=train_labels_encoded,
    batch_size=32,
    epochs=50,
    verbose=2,
    callbacks=[early_stopping],
    validation_data=(test_sequences, test_labels_encoded),
)


Epoch 1/50
3/3 - 2s - 526ms/step - accuracy: 0.3750 - loss: 1.3839 - val_accuracy: 1.0000 - val_loss: 1.3757
Epoch 2/50
3/3 - 0s - 22ms/step - accuracy: 1.0000 - loss: 1.3728 - val_accuracy: 1.0000 - val_loss: 1.3676
Epoch 3/50
3/3 - 0s - 22ms/step - accuracy: 1.0000 - loss: 1.3633 - val_accuracy: 1.0000 - val_loss: 1.3582
Epoch 4/50
3/3 - 0s - 22ms/step - accuracy: 1.0000 - loss: 1.3530 - val_accuracy: 1.0000 - val_loss: 1.3475
Epoch 5/50
3/3 - 0s - 25ms/step - accuracy: 1.0000 - loss: 1.3412 - val_accuracy: 1.0000 - val_loss: 1.3347
Epoch 6/50
3/3 - 0s - 24ms/step - accuracy: 1.0000 - loss: 1.3276 - val_accuracy: 1.0000 - val_loss: 1.3203
Epoch 7/50
3/3 - 0s - 21ms/step - accuracy: 1.0000 - loss: 1.3122 - val_accuracy: 1.0000 - val_loss: 1.3031
Epoch 8/50
3/3 - 0s - 24ms/step - accuracy: 1.0000 - loss: 1.2939 - val_accuracy: 1.0000 - val_loss: 1.2829
Epoch 9/50
3/3 - 0s - 21ms/step - accuracy: 1.0000 - loss: 1.2730 - val_accuracy: 1.0000 - val_loss: 1.2597
Epoch 10/50
3/3 - 0s - 23ms