# Overview

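We will fine-tune a BERT model (`bert-base-uncased`) for sequence classification; this experiment is referred to as "Bert 3", and its artifacts are written to `bert3` directories.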

Requirements:

- The output file of `3-merge-data.ipynb` (`data/output/3-merge-data.csv`)

# Install Dependencies

Our environment needs several ML packages installed before the imports below will work.

## PIP Packages (Optional)

In [1]:
pip install tensorflow transformers torch scikit-learn tf-keras transformers[torch]

Note: you may need to restart the kernel to use updated packages.


## Required Packages

In [None]:
import pandas as pd

# Hyper Parameters

In [None]:
# NOTE(review): this cell defines no hyperparameters yet; it only holds a
# disabled copy of the data-loading call.
# Load the data
#df = pd.read_csv('data/output/3-merge-data.csv')


# Load and Prepare Data

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the data
df = pd.read_csv('data/output/3-merge-data.csv')

# Drop rows with a missing message or label BEFORE casting to str: astype(str)
# would otherwise turn a missing message into the literal text "nan", which
# would then be tokenized and trained on as if it were a real message.
# .copy() gives an independent frame so the column assignments below are safe.
df = df.dropna(subset=['singleMessage', 'reason']).copy()

# Ensure all values in 'singleMessage' column are strings
df['singleMessage'] = df['singleMessage'].astype(str)

# Encode the 'reason' labels as integer class ids; the fitted encoder is kept
# so predictions can be mapped back to the original label values.
label_encoder = LabelEncoder()
df['reason_encoded'] = label_encoder.fit_transform(df['reason'])

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['singleMessage'].values,
    df['reason_encoded'].values,
    test_size=0.2,
    random_state=42,
)


# Tokenize the Data

In [3]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Convert the text arrays to plain Python lists for the tokenizer.
# list() is used instead of .tolist() because it also accepts a list, so
# re-running this cell (when the names already hold lists) does not fail.
train_texts = list(train_texts)
val_texts = list(val_texts)

# Tokenize both splits with identical settings: sequences longer than
# 128 tokens are truncated and shorter ones are padded.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)


# Create a Dataset Class

In [4]:
import torch

class MessageDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable sequence holding
            one entry per example.
        labels: Indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is added last, under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a MessageDataset.
train_dataset = MessageDataset(encodings=train_encodings, labels=train_labels)
val_dataset = MessageDataset(encodings=val_encodings, labels=val_labels)


# Fine-Tune the BERT Model

In [5]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load the pretrained encoder with a classification head that has one output
# per class known to the label encoder.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
)

training_args = TrainingArguments(
    output_dir='./output/bert3',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./output/bert3/logs',
    logging_steps=10,
    # Evaluate on the validation set after every epoch.
    # (Older transformers releases call this argument `evaluation_strategy`.)
    eval_strategy="epoch",
    # Checkpoint once per epoch (must match eval_strategy) and, when training
    # ends, reload the checkpoint with the lowest validation loss. Without
    # this the last-epoch weights are kept even when validation loss has
    # started to rise, as it did in epoch 3 of a previously recorded run.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()


2024-07-17 17:53:40.070949: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-17 17:53:40.090219: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-17 17:53:40.095062: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-17 17:53:40.108046: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of BertForSequenceClassification were no

Epoch,Training Loss,Validation Loss
1,0.0778,0.166572
2,0.0791,0.150667
3,0.0404,0.181121


TrainOutput(global_step=19692, training_loss=0.13316947230866077, metrics={'train_runtime': 308132.6558, 'train_samples_per_second': 1.022, 'train_steps_per_second': 0.064, 'total_flos': 2.072969918673869e+16, 'train_loss': 0.13316947230866077, 'epoch': 3.0})

# Save the Model and Tokenizer

In [8]:
import pickle

# Destination folder shared by the model, the tokenizer and the label encoder
save_directory = './models/bert3'
label_encoder_path = f'{save_directory}/label_encoder.pkl'

# Write the model first, then the tokenizer, into the same folder
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# Pickle the fitted label encoder next to them so class ids can be mapped
# back to the original labels
with open(label_encoder_path, 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)
