# Overview

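This notebook fine-tunes a pretrained BERT model (`bert-base-uncased`) as a sequence classifier; its outputs are saved under the name `bert3`.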

Requirements:

- Output file from 3-merge-data.ipynb

# Install Dependencies

The environment needs several ML packages installed before the imports in this notebook will work.

## PIP Packages (Optional)

In [1]:
!pip install protobuf==3.20.*

#!pip uninstall tensorflow -y
#!pip install tensorflow

!pip install pandas transformers torch scikit-learn tf-keras transformers

[0m

## Required Packages

In [3]:
!pip install pandas
import pandas as pd
import json

# Read config.json
with open('../config.json', 'r') as config_file:
    config = json.load(config_file)

[0m

ModuleNotFoundError: No module named 'pandas'

# Hyper Parameters

In [None]:
# Root directory for this notebook's outputs, taken from config.json.
# Kept as the plain config value: wrapping it in braces would build a
# one-element set, and f-string paths derived from it would render as
# "{'...'}/output/..." instead of a usable path.
SAVE_DIRECTORY = config['save_directory']


# Load and Prepare Data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the merged CSV (the output of 3-merge-data.ipynb) from the configured directory
merged_csv_path = f"{config['save_directory']}/output/{config['model_version']}-3-merge-data.csv"
df = pd.read_csv(merged_csv_path)

# Cast every 'singleMessage' value to str
df['singleMessage'] = df['singleMessage'].astype(str)

# Turn each 'reason' value into an integer class id
label_encoder = LabelEncoder()
df['reason_encoded'] = label_encoder.fit_transform(df['reason'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
messages = df['singleMessage'].values
reason_ids = df['reason_encoded'].values
train_texts, val_texts, train_labels, val_labels = train_test_split(
    messages,
    reason_ids,
    test_size=0.2,
    random_state=42,
)


# Tokenize the Data

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Convert the text collections to plain lists of str.
# A comprehension is used instead of `.tolist()` so this cell can be re-run:
# after the first run the names are bound to lists, which have no `.tolist()`.
train_texts = [str(text) for text in train_texts]
val_texts = [str(text) for text in val_texts]

# Tokenize the texts: truncate to at most 128 tokens and pad each set
tokenizer_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_texts, **tokenizer_options)
val_encodings = tokenizer(val_texts, **tokenizer_options)


# Create a Dataset Class

In [None]:
import torch

class MessageDataset(torch.utils.data.Dataset):
    """Dataset pairing per-example encodings with their labels.

    Args:
        encodings: Mapping from feature name to a collection that can be
            indexed by example position.
        labels: Collection holding one label per example, indexable by
            example position.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under 'labels'."""
        sample = {}
        for feature_name, values in self.encodings.items():
            sample[feature_name] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized training and validation splits, each with its own labels
train_dataset = MessageDataset(encodings=train_encodings, labels=train_labels)
val_dataset = MessageDataset(encodings=val_encodings, labels=val_labels)


In [None]:
# Report whether PyTorch can see a CUDA device
cuda_available = torch.cuda.is_available()
print(cuda_available)

# Fine-Tune the BERT Model

In [None]:
import dataclasses

from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Classification head sized to the number of classes the label encoder has seen
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old name was later removed. Pick whichever keyword the
# installed version declares so this cell runs on both old and new installs.
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_kwarg = 'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'

training_args = TrainingArguments(
    output_dir=f'{SAVE_DIRECTORY}/output/bert3',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f'{SAVE_DIRECTORY}/output/bert3/logs',
    logging_steps=10,
    # Evaluate on the validation set at the end of every epoch
    **{_eval_strategy_kwarg: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()


# Save the Model and Tokenizer

In [None]:
import pickle

# Destination folder for the saved artifacts
MODEL_DIRECTORY = f'{SAVE_DIRECTORY}/models/bert3'

# Persist the model first, then the tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained(MODEL_DIRECTORY)

# Pickle the fitted label encoder into that directory as well
label_encoder_path = f'{MODEL_DIRECTORY}/label_encoder.pkl'
with open(label_encoder_path, 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)
