#### Imports

In [2]:
import os
import torch
import warnings
import pandas as pd
from transformers import AutoTokenizer
from utils.utils import *
from utils.data import TicketDataset, to_dataloader
from utils.models import ConvNet, EncoderTransformer
from utils.model_utils import get_class_weights

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Sets TOKENIZERS_PARALLELISM=false for this process — presumably to switch off
# the tokenizer library's internal parallelism; TODO confirm the motivation.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Relative path to the pickled, preprocessed tickets table read by the loading cell.
support_tickets_file = '../data/preprocessed_labeled_complaints.pkl'
# Toggle intended for the "Saving Models" section; it is not read by any cell shown here.
save_models = True

#### Loading the Preprocessed Data
***
* For simplicity, I preprocessed the data using my own implementations of the methods used by the author of the dataset and saved the results into a new file so that the data is ready to use.
* This dataset is pulled from Kaggle, where users upload **open-source** datasets for almost any topic.
* The author of the Automatic Ticket Classification Dataset: https://www.kaggle.com/venkatasubramanian

In [3]:
# Load the preprocessed tickets table from the pickle at `support_tickets_file`.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that come from a trusted source.
tickets = pd.read_pickle(support_tickets_file)
# Show five randomly sampled rows (no random_state is set, so the rows differ per run).
tickets.sample(5)

Unnamed: 0,complaint,complaint_lemma,complaint_nouns,topic,label
26063,i booked a rental car to be picked up in and...,I book a rental car to be pick up in and re...,book car pay ink business credit fee advance f...,Theft / Dispute reporting,3
21143,and submit these letters of negotiation on s...,and submit these letter of negotiation on s...,letter negotiation credit card consumer protec...,Credit card / Prepaid card,1
3745,my account was closed for no reason i am a vic...,my account be close for no reason I be a victi...,account reason victim bank discrimination prac...,Bank account services,0
22953,i was on a lunch break and went through a driv...,I be on a lunch break and go through a drive t...,lunch break drive thru card online banking acc...,Credit card / Prepaid card,1
40413,the ssi deposited money by error in my chase b...,the ssi deposit money by error in my chase ban...,deposit money error bank account money pende b...,Bank account services,0


In [4]:
# Tokenizer of the pretrained 'bert-base-uncased' checkpoint; the length of its
# vocabulary is read back later through `dataset.tokenizer.vocab`.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# Wrap the tickets table and the tokenizer in the project's TicketDataset.
dataset = TicketDataset(tickets, tokenizer)
# Build the train/test loaders with batch_size=16; split=0.8 is presumably the
# training fraction — TODO confirm in utils.data.to_dataloader.
trainset, testset = to_dataloader(dataset, batch_size=16, split=0.8)

In [6]:
# Preview a few complaints for one topic instead of dumping every matching row
# into the notebook output (the full array is very large and buries the narrative).
tickets.loc[tickets.topic == 'Theft / Dispute reporting', 'complaint'].head(3).values

array(['on   i made a  payment to an online retailer using chase quick pay with  on the chase website i realized that this was a scam website after no confirmation product or response to any inquiries regarding the order and contacted chase to get more information about the transaction to attempt getting a refund through the retailers bank \n\ni contacted chase via a secured message on  explaining what happened and asked  is there a way to reverse this transaction or do you have a contact at  that can give me more information about the recipient  that  my message was reviewed and i received a response restating my original email and informing me that  the transfer has been completed however as you mentioned that the website is a scam while we can handle most inquiries by email some require the expertise of another team in order to assist you better in regards to the refund we request you to call our consumer online technical support team  i called the number listed in the email and exp

#### Initializing Models & Hyper-Parameters
***
* **EncoderTransformer** is my implementation of a transformer ticket classifier.
* **ConvNet** is my approximation of the neural network described in the paper: *Hyperparameter Black-Box Optimization to Improve the Automatic Classification of Support Tickets*, which I am trying to improve upon with my EncoderTransformer.

##### Hyper-Parameters

In [4]:
# Shared settings for the models in this notebook.
# `epochs` is not read by any cell shown here (the search below uses the
# estimator's own `num_epochs`).
epochs = 5
# Third constructor argument of EncoderTransformer in the commented-out model cell;
# presumably the maximum sequence length — TODO confirm.
block_size = 200
# ConvNet settings: passed (in this order) after the vocabulary size as
# embedding_dim, num_filters, filter_sizes and the number of classes.
num_filters = 512
embedding_dim = 300
num_ticket_classes = 5
filter_sizes = [5, 4, 3]
# Number of entries in the tokenizer's vocabulary.
vocabulary_size = len(dataset.tokenizer.vocab)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Class weights computed from every label in `tickets`; the meaning of mode=2 is
# defined in utils.model_utils.get_class_weights — TODO document the modes here.
class_weights = get_class_weights(torch.tensor(tickets['label'].tolist()), num_ticket_classes, mode=2)

##### Models

In [5]:
# NOTE(review): model construction is currently disabled, so this cell defines
# nothing. The hyper-parameter search further down builds its own ConvNet through
# PyTorchEstimator.build_cnn. Uncomment the lines below to instantiate the two
# models directly.
# transformer = EncoderTransformer(
#     vocabulary_size, embedding_dim, block_size, num_ticket_classes
# ).to(device)
# cnn = ConvNet(
#     vocabulary_size, embedding_dim, num_filters, filter_sizes, num_ticket_classes
# ).to(device)

#### Train & Evaluate the Models

In [44]:
import torch.nn as nn
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

# Custom PyTorch estimator for scikit-learn
class PyTorchEstimator:
    """Minimal scikit-learn-style wrapper around the ``ConvNet`` ticket classifier.

    Exposes ``get_params`` / ``set_params`` / ``fit`` / ``predict`` / ``score`` so
    the network can be driven by search utilities such as ``HalvingGridSearchCV``.

    Parameters
    ----------
    vocab_size : int or None
        Vocabulary size handed to ``ConvNet``; must be set before a model is built.
    lr : float
        Learning rate given to the optimizer.
    embedding_dim, num_filters, filter_sizes, num_classes
        Forwarded to ``ConvNet`` (``filter_sizes`` is converted to a list first).
    activation_conv, activation_dense : str
        Keys into the activation table in :meth:`build_cnn`.
    mode : int
        Forwarded to ``get_class_weights`` by :meth:`compute_class_weights`.
        ``fit`` does not read it; pass ``class_weights`` to ``fit`` explicitly.
    optimizer_func : str
        Key into the optimizer table in :meth:`build_cnn`.
    batch_size : int
        Mini-batch size used when ``fit`` / ``predict`` / ``score`` receive arrays.
    num_epochs : int
        Number of passes over the training data in ``fit``.
    device : torch.device
        Device the model and every batch are moved to.
    """

    def __init__(
        self,
        vocab_size=None,
        lr=1e-2,
        embedding_dim=300,
        activation_conv='relu',
        activation_dense='softmax',
        mode=2,
        num_filters=512,
        filter_sizes=(3, 4, 5),  # immutable default: a list here would be shared by all instances
        optimizer_func='adam',
        batch_size=16,
        num_epochs=5,
        num_classes=5,
        device=torch.device('cpu'),
    ):
        # Parameters are stored exactly as given (no copies, no validation),
        # following the scikit-learn estimator convention.
        self.vocab_size = vocab_size
        self.lr = lr
        self.embedding_dim = embedding_dim
        self.activation_conv = activation_conv
        self.activation_dense = activation_dense
        self.optimizer_func = optimizer_func
        self.num_filters = num_filters
        self.filter_sizes = filter_sizes
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.device = device
        self.mode = mode
        self.num_classes = num_classes
        # Built by set_vocab_size / set_params, or lazily on the first fit().
        self.model = None
        self.optimizer = None

    def set_vocab_size(self, vocab_size):
        """Set the vocabulary size and (re)build the model and optimizer."""
        self.vocab_size = vocab_size
        self.model, self.optimizer = self.build_cnn()

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (``deep`` is accepted but unused)."""
        return {
            'vocab_size': self.vocab_size,
            'lr': self.lr,
            'embedding_dim': self.embedding_dim,
            'activation_conv': self.activation_conv,
            'activation_dense': self.activation_dense,
            'mode': self.mode,
            'num_filters': self.num_filters,
            'filter_sizes': self.filter_sizes,
            'optimizer_func': self.optimizer_func,
            'batch_size': self.batch_size,
            'num_epochs': self.num_epochs,
            'num_classes': self.num_classes,
            'device': self.device
        }

    def set_params(self, **params):
        """Assign the given attributes and rebuild the model; returns ``self``.

        When ``vocab_size`` is still unset the rebuild is deferred to ``fit``
        instead of raising, so parameters can be set in any order.
        """
        for key, value in params.items():
            setattr(self, key, value)
        if self.vocab_size is None:
            # Drop any stale model so it cannot be trained with outdated settings.
            self.model, self.optimizer = None, None
        else:
            self.model, self.optimizer = self.build_cnn()
        return self

    def build_cnn(self):
        """Create a fresh ``ConvNet`` on ``self.device`` plus its optimizer.

        Returns
        -------
        (model, optimizer)

        Raises
        ------
        ValueError
            If ``vocab_size`` has not been set.
        KeyError
            If an activation or optimizer name is not in the lookup tables.
        """
        if self.vocab_size is None:
            raise ValueError("Vocabulary size is None.")

        activation_map = {
            "relu": nn.ReLU,
            "tanh": nn.Tanh,
            "sigmoid": nn.Sigmoid,
            "elu": nn.ELU,
            "softsign": nn.Softsign,
            # Explicit dim: the implicit-dim form is deprecated and warns on every call.
            "softmax": lambda: nn.Softmax(dim=-1),
        }
        optimizer_map = {
            "adam": torch.optim.Adam,
            "adamax": torch.optim.Adamax,
            "rmsprop": torch.optim.RMSprop,
            "sgd": torch.optim.SGD
        }
        conv_activation_layer = activation_map[self.activation_conv]()
        dense_activation_layer = activation_map[self.activation_dense]()
        model = ConvNet(
            self.vocab_size,
            self.embedding_dim,
            self.num_filters,
            list(self.filter_sizes),
            self.num_classes,
            conv_activation_layer=conv_activation_layer,
            dense_activation_layer=dense_activation_layer
        ).to(self.device)
        optimizer = optimizer_map[self.optimizer_func](model.parameters(), lr=self.lr)

        return model, optimizer

    def compute_class_weights(self, data):
        """Return ``get_class_weights`` for ``data['label']`` using ``self.mode``, on ``self.device``."""
        return get_class_weights(
            torch.tensor(data['label'].tolist()),
            self.num_classes,
            mode=self.mode
        ).to(self.device)

    def _iter_batches(self, data, targets=None):
        """Yield ``(inputs, labels_or_None)`` pairs moved to ``self.device``.

        ``data`` is either an array/tensor of token ids (sliced into chunks of
        ``batch_size`` and paired with ``targets``) or an iterable of batches, where
        a batch is a dict with ``'input_ids'`` (and optionally ``'labels'``), an
        ``(inputs, labels)`` pair, or a bare inputs tensor.
        """
        if isinstance(data, torch.Tensor) or hasattr(data, '__array__'):
            inputs = torch.as_tensor(data, dtype=torch.long)
            labels = None if targets is None else torch.as_tensor(targets, dtype=torch.long)
            if labels is not None and len(labels) != len(inputs):
                raise ValueError("X and y must contain the same number of samples.")
            step = max(int(self.batch_size), 1)
            for start in range(0, len(inputs), step):
                chunk = inputs[start:start + step].to(self.device)
                chunk_labels = None if labels is None else labels[start:start + step].to(self.device)
                yield chunk, chunk_labels
            return

        for batch in data:
            if isinstance(batch, dict):
                inputs, labels = batch['input_ids'], batch.get('labels')
            elif isinstance(batch, (tuple, list)):
                inputs = batch[0]
                labels = batch[1] if len(batch) > 1 else None
            else:
                inputs, labels = batch, None
            yield inputs.to(self.device), (None if labels is None else labels.to(self.device))

    def _run_inference(self, data, targets=None):
        """Single pass over ``data``; returns ``(predictions, labels)`` as Python lists."""
        if self.model is None:
            raise RuntimeError("No model has been built; call fit() or set_vocab_size() first.")
        self.model.eval()
        predictions, labels_seen = [], []
        with torch.no_grad():
            for inputs, labels in self._iter_batches(data, targets):
                outputs = self.model(inputs)
                _, preds = torch.max(outputs, 1)
                predictions.extend(preds.cpu().tolist())
                if labels is not None:
                    labels_seen.extend(labels.cpu().tolist())
        return predictions, labels_seen

    def fit(self, X, y, class_weights=None):
        """Train the model on ``X`` / ``y`` in mini-batches of ``batch_size``.

        Parameters
        ----------
        X : array-like of token ids, one row per sample.
        y : array-like of integer class labels.
        class_weights : optional per-class weights for the cross-entropy loss
            (e.g. the result of :meth:`compute_class_weights`). Defaults to an
            unweighted loss.

        Returns
        -------
        self
        """
        if self.model is None or self.optimizer is None:
            self.model, self.optimizer = self.build_cnn()

        inputs_all = torch.as_tensor(X, dtype=torch.long)
        labels_all = torch.as_tensor(y, dtype=torch.long)
        if class_weights is not None:
            class_weights = torch.as_tensor(class_weights, dtype=torch.float32).to(self.device)
        # CrossEntropyLoss takes class indices directly, so no one-hot encoding is needed.
        criterion = nn.CrossEntropyLoss(weight=class_weights)

        # predict()/score() leave the model in eval mode; switch back before training.
        self.model.train()
        for epoch in range(self.num_epochs):
            loss_sum, seen = 0.0, 0
            for inputs, labels in self._iter_batches(inputs_all, labels_all):
                outputs = self.model(inputs)
                loss = criterion(outputs, labels)

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                loss_sum += loss.item() * len(labels)
                seen += len(labels)

            if epoch % 10 == 0 and seen:
                print(f'Epoch: {epoch} - Loss: {loss_sum / seen}')

        return self

    def predict(self, testloader):
        """Return the predicted class index for every sample as a list of ints.

        ``testloader`` may be an array/tensor of token ids or an iterable of batches
        (see :meth:`_iter_batches` for the accepted batch shapes).
        """
        predictions, _ = self._run_inference(testloader)
        return predictions

    def score(self, testloader, y=None):
        """Return the accuracy of the model.

        With ``y`` given, ``testloader`` is treated as the inputs and ``y`` as the
        true labels. Without ``y``, labels are taken from the batches of
        ``testloader`` in the same pass that produces the predictions, so a
        shuffling loader cannot misalign the two.
        """
        y_pred, y_true = self._run_inference(testloader, y)
        if y is not None:
            y_true = torch.as_tensor(y).reshape(-1).tolist()
        elif len(y_true) != len(y_pred):
            raise ValueError("score() needs labels: pass y or use batches that carry labels.")
        return accuracy_score(y_true, y_pred)

def custom_scorer(estimator, X, y=None):
    """Scorer callable that delegates to the estimator's own ``score`` method.

    Calls ``estimator.score(X)`` when no labels are supplied and
    ``estimator.score(X, y)`` otherwise, returning whatever ``score`` returns.
    """
    if y is None:
        return estimator.score(X)
    return estimator.score(X, y)

In [42]:
def dataloader_to_numpy(train_loader):
    """Collect every batch of a loader into two NumPy arrays.

    Parameters
    ----------
    train_loader : iterable of dict
        Each batch must provide tensors under the keys ``'input_ids'`` and
        ``'labels'``.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        The ``'input_ids'`` and ``'labels'`` tensors of all batches, concatenated
        along the first axis in iteration order.

    Raises
    ------
    ValueError
        If the loader yields no batches.
    """
    # Local import: no visible cell imports numpy explicitly, so this keeps the
    # function working on a fresh kernel.
    import numpy as np

    features, labels = [], []
    for batch in train_loader:
        # detach().cpu() makes the conversion safe for GPU or grad-tracking tensors.
        features.append(batch['input_ids'].detach().cpu().numpy())
        labels.append(batch['labels'].detach().cpu().numpy())

    if not features:
        raise ValueError("train_loader yielded no batches; nothing to convert.")

    return np.concatenate(features), np.concatenate(labels)

In [45]:
# Search space for the ConvNet wrapper. The full grid has
# 3 * 4 * 5 * 3 * 4 * 4 * 3 = 8640 combinations, each trained under cross-validation,
# so this cell is very expensive (the recorded run ends in a KeyboardInterrupt).
# NOTE(review): as written, the search never passes class weights into fit(), so
# `mode` does not influence training and only triples the grid — confirm intent.
hyperparameters = {
    "lr": [1e-1, 1e-2, 1e-3],
    "embedding_dim": [200, 300, 400, 500],
    "activation_conv": ["relu", "elu", "tanh", "sigmoid", "softsign"],
    "activation_dense": ["relu", "softmax", "sigmoid"],
    "optimizer_func": ["adam", "adamax", "rmsprop", "sgd"],
    "filter_sizes": [[3,4,5], [5,4,3], [3,5,7], [7,5,3]],
    "mode": [1, 2, 3]
}

# Perform the halving grid search
# Base estimator; set_vocab_size also builds its initial model and optimizer.
estimator = PyTorchEstimator(device=device)
estimator.set_vocab_size(vocabulary_size)

# Flatten the training loader into arrays, the input form the search's fit() is given here.
X, y = dataloader_to_numpy(trainset)

# Successive halving with the number of training epochs as the budgeted resource,
# capped at 5 epochs per candidate.
halving_grid_search = HalvingGridSearchCV(
    estimator, 
    hyperparameters, 
    scoring=custom_scorer, 
    resource='num_epochs', 
    max_resources=5,
    aggressive_elimination=True
)
halving_grid_search.fit(X, y)

# Best hyperparameters
best_params = halving_grid_search.best_params_
print("Best hyperparameters:", best_params)

# Evaluate on test set
# The held-out loader is passed directly; labels are expected to come from its batches.
best_estimator = halving_grid_search.best_estimator_
test_accuracy = best_estimator.score(testset)
print("Test accuracy:", test_accuracy)


KeyboardInterrupt: 

#### Saving Models
***
If you wish to save the trained models, please uncomment the code below.