In [2]:
import fasttext
import pandas as pd

In [3]:
df = pd.read_csv('customer_support_tickets.csv')
df['fasttext_data'] = df['Ticket Type'] + ' ' + df['Ticket Subject'] + ' ' + df['Ticket Description'] + ' __label__' + df['Ticket Priority']
df['fasttext_data'].to_csv('fasttext_data.txt', index=False, header=False)

In [4]:
model = fasttext.train_supervised('fasttext_data.txt')

Read 0M words
Number of words:  14979
Number of labels: 8
Progress: 100.0% words/sec/thread: 3283868 lr:  0.000000 avg.loss:  1.518065 ETA:   0h 0m 0s


In [5]:
model.save_model("model.bin")

In [6]:
model = fasttext.load_model("model.bin")

In [7]:
x = input("Enter a ticket subject: ")
priority = model.predict(x)

In [8]:
priority[0][0].replace("__label__", "").replace("\"", "")

'Critical'

## FASTTEXT NOT ON WINDOWS
this means i need to pivot to another model. 
i was reading up on hugginface 

In [9]:
# preparing the data for the model


new_df = pd.DataFrame()

new_df['text'] = df['Ticket Type'] + " " + df['Ticket Subject'] + " " + df['Ticket Description']
new_df.head()

Unnamed: 0,text
0,Technical issue Product setup I'm having an is...
1,Technical issue Peripheral compatibility I'm h...
2,Technical issue Network problem I'm facing a p...
3,Billing inquiry Account access I'm having an i...
4,Billing inquiry Data loss I'm having an issue ...


In [10]:
import numpy as np

# Define the conditions
conditions = [
    (df['Ticket Priority'] == 'Low'),
    (df['Ticket Priority'] == 'Medium'),
    (df['Ticket Priority'] == 'High'),
    (df['Ticket Priority'] == 'Critical')
]

# Define the corresponding values
values = [0, 1, 2, 3]

# Create the new column
new_df['label'] = np.select(conditions, values)

In [11]:
new_df.head()

Unnamed: 0,text,label
0,Technical issue Product setup I'm having an is...,3
1,Technical issue Peripheral compatibility I'm h...,3
2,Technical issue Network problem I'm facing a p...,0
3,Billing inquiry Account access I'm having an i...,0
4,Billing inquiry Data loss I'm having an issue ...,0


In [42]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
)

In [44]:
# Convert the labels to torch tensors
labels = torch.tensor(new_df['label'].values)

# Tokenize the text data
input_ids = []
attention_masks = []

In [46]:
for text in new_df['text']:
    encoded_dict = tokenizer.encode_plus(
        text,                      # Sentence to encode
        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
        max_length = 64,           # Pad & truncate all sentences
        pad_to_max_length = True,
        return_attention_mask = True,   # Construct attention masks
        return_tensors = 'pt',     # Return pytorch tensors
    )

    # Add the encoded sentence to the list
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding)
    attention_masks.append(encoded_dict['attention_mask'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [47]:
# Convert the lists into tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

In [48]:
# Split the data into training and validation sets
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2023, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
                                             random_state=2023, test_size=0.1)

In [49]:
# Convert all inputs and labels into torch tensors
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

  train_inputs = torch.tensor(train_inputs)
  validation_inputs = torch.tensor(validation_inputs)
  train_labels = torch.tensor(train_labels)
  validation_labels = torch.tensor(validation_labels)
  train_masks = torch.tensor(train_masks)
  validation_masks = torch.tensor(validation_masks)


In [50]:
# The DataLoader needs to know our batch size for training, so we specify it here
batch_size = 32

In [51]:
# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [52]:
# Create the DataLoader for our validation set
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [53]:
# code taken from the blog: https://towardsdatascience.com/fine-tuning-bert-for-text-classification-54e7df642894

from transformers import AdamW, get_linear_schedule_with_warmup

# Load the pre-trained model for classification, BERT base uncased
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 4, # The number of output labels--2 for binary classification
    output_attentions = False, # Whether the model returns attentions weights
    output_hidden_states = False, # Whether the model returns all hidden-states
)

# Tell pytorch to run this model on the GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Set up the optimizer and the learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

epochs = 4
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

# Define the accuracy measurement function
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Begin training
for epoch_i in range(0, epochs):
    
    # Training
    total_train_loss = 0
    model.train()
    
    for step, batch in enumerate(train_dataloader):
        b_input_ids, b_input_mask, b_labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        
        model.zero_grad()        
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        
        loss = outputs[0]
        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)            
    print("Average training loss: {0:.2f}".format(avg_train_loss))

    # Validation
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        b_input_ids, b_input_mask, b_labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        
        with torch.no_grad():        
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask,labels=b_labels)
            
        loss = outputs[0]
        logits = outputs[1]
        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)
        
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    print("Validation Loss: {0:.2f}".format(avg_val_loss))


Downloading model.safetensors: 100%|██████████| 440M/440M [00:26<00:00, 16.8MB/s] 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification 

KeyboardInterrupt: 

In [56]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import torch

# Load DistilBertTokenizer and DistilBertForSequenceClassification, set the device to GPU (if available)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=4)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

for text in new_df['text']:
    encoded_dict = tokenizer.encode_plus(
                        text,                      
                        add_special_tokens = True, 
                        max_length = 64,          # Truncate all sentences (reduce the sequence length)
                        pad_to_max_length = True,
                        return_attention_mask = True,  
                        return_tensors = 'pt', 
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(new_df['label'].values)


# Split the dataset into train and validation datasets
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


# Create the DataLoaders
batch_size = 32

train_dataloader = DataLoader(
            train_dataset, 
            sampler = RandomSampler(train_dataset), 
            batch_size = batch_size 
        )

validation_dataloader = DataLoader(
            val_dataset, 
            sampler = SequentialSampler(val_dataset), 
            batch_size = batch_size 
        )
        



Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 530kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 155kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 483/483 [00:00<00:00, 994kB/s]
Downloading model.safetensors: 100%|██████████| 268M/268M [00:14<00:00, 17.9MB/s] 
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly 

In [55]:
from transformers import AdamW
from tqdm import tqdm

# Move the model to the GPU
model = model.to(device)

# Set up the optimizer
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # Learning Rate - Default is 5e-5
                  eps = 1e-8 # Adam Epsilon  - Default is 1e-8.
                )

# Number of training epochs (authors recommend between 2 and 4)
epochs = 2

# Total number of training steps is [number of batches] x [number of epochs]
total_steps = len(train_dataloader) * epochs

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # Training

    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        

        outputs = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)
        
        loss = outputs[0]

        total_train_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
        
    # Validation

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        with torch.no_grad():        

            outputs = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            
        loss = outputs[0]

        total_eval_loss += loss.item()

        logits = outputs[1]

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)
        

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")



(8469, 2)

In [14]:
new_df.to_csv('ticketClassification.csv')