# Initializtion

In [1]:
# Import necessary libraries
import pandas as pd
import torch

from transformers import BertTokenizer, BertForSequenceClassification

# Read in the data
df = pd.read_csv('C://Offline_Storage//allTitles_forBERT.csv')

# Optional: Data cleaning
df['organizationalPerson.title'] = df['organizationalPerson.title'].str.lower().str.replace('[^a-z0-9\s]', '')

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', use_pt=True)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

non_string_rows = df[df['organizationalPerson.title'].apply(lambda x: not isinstance(x, str))]
print(non_string_rows)

print(df[df['organizationalPerson.title'].isna()])

df = df.dropna(subset=['organizationalPerson.title'])

2.0.1+cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Empty DataFrame
Columns: [client, organizationalPerson.title, class, class_title]
Index: []
Empty DataFrame
Columns: [client, organizationalPerson.title, class, class_title]
Index: []


In [2]:
print("Assessing 'class' value type(s)")

print(df['class'].dtype)

Assessing 'class' value type(s)
int64


In [3]:
# Tokenize the titles (this is just a demonstration; there are more steps involved like attention masks, padding, etc.)
input_ids = tokenizer(df['organizationalPerson.title'].tolist(), padding=True, truncation=True, return_tensors="pt")

labels = torch.tensor(df['class'].values)

In [4]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32  # You can adjust this value based on your computer's capabilities

# Create a TensorDataset
data = TensorDataset(input_ids['input_ids'], input_ids['attention_mask'], labels)

# Use RandomSampler for training data and SequentialSampler for validation (if you split your data)
dataloader = DataLoader(data, sampler=RandomSampler(data), batch_size=batch_size)


In [6]:
from torch.optim import AdamW

# Define the optimizer. AdamW is the Adam optimizer with weight decay fix.
optimizer = AdamW(model.parameters(), lr=2e-5)  # You can adjust the learning rate


In [7]:
# Function to print current activity and time
from datetime import datetime

def print_activity(activity):
    current_time = datetime.now().strftime('%I:%M %p') # This will give you time in the format '01:37 PM'
    print(f"Starting {activity} at {current_time}...")



# Load and Set to Evaluation Mode

In [14]:
# Load the model
model = torch.load("fine_tuned_model_entire.pth")
# Ensure the model is in evaluation mode
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

# Training Loop (~3 hrs on first pass) 

In [16]:
epochs = 3  # Number of training epochs. You can adjust this.
batchcount = 0

for epoch in range(epochs):

    current_looptime = datetime.now().strftime('%I:%M %p') # This will give you time in the format '01:37 PM'
    print(f"Starting Training Epoch #{epoch} at {current_looptime} <3")
    model.train()
    total_loss = 0
    
    for step, batch in enumerate(dataloader):

        # Load batch to device
        batchcount += 1
        b_input_ids, b_input_mask, b_labels = batch
        print(f"Batch {batchcount} loaded!")
        
        # Clear out the gradients
        model.zero_grad()

        # Forward pass
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()

        # Update parameters
        optimizer.step()

    print("Training Epoch Complete")
    avg_train_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f}")


Starting Training Epoch #0 at 03:08 PM <3
Batch 1 loaded!
Batch 2 loaded!
Batch 3 loaded!
Batch 4 loaded!
Batch 5 loaded!
Batch 6 loaded!
Batch 7 loaded!
Batch 8 loaded!
Batch 9 loaded!
Batch 10 loaded!
Batch 11 loaded!
Batch 12 loaded!
Batch 13 loaded!
Batch 14 loaded!
Batch 15 loaded!
Batch 16 loaded!
Batch 17 loaded!
Batch 18 loaded!
Batch 19 loaded!
Batch 20 loaded!
Batch 21 loaded!
Batch 22 loaded!
Batch 23 loaded!
Batch 24 loaded!
Batch 25 loaded!
Batch 26 loaded!
Batch 27 loaded!
Batch 28 loaded!
Batch 29 loaded!
Batch 30 loaded!
Batch 31 loaded!
Batch 32 loaded!
Batch 33 loaded!
Batch 34 loaded!
Batch 35 loaded!
Batch 36 loaded!
Batch 37 loaded!
Batch 38 loaded!
Batch 39 loaded!
Batch 40 loaded!
Batch 41 loaded!
Batch 42 loaded!
Batch 43 loaded!
Batch 44 loaded!
Batch 45 loaded!
Batch 46 loaded!
Batch 47 loaded!
Batch 48 loaded!
Batch 49 loaded!
Batch 50 loaded!
Batch 51 loaded!
Batch 52 loaded!
Batch 53 loaded!
Batch 54 loaded!
Batch 55 loaded!
Batch 56 loaded!
Batch 57 loaded

# Save the Model! :-)

In [17]:
torch.save(model, "fine_tuned_model_entire.pth")

# Fine-Tuning

In [20]:
label_dict = {
    'title': 0,
    'placeholder': 1,
    'mailbox': 2,
    'non-english': 3,
    'ownership': 4,
    'error': 5
}

class_to_label_mapping = {v: k for k, v in label_dict.items()}

new_titles = ["After Hours MB","A3B6C9D2","Technical Queries MB","Visual Designer II","Agricultural Consultant","Renewable Energy Consultant","Product Development Manager","Flight Operations Specialist","Demo Job Role","Blockchain Developer","Data Analytics Specialist","P3Q5R7S4","Content Marketing Associate","Client Relations Mailbox","F4G7H1J5","Technical Support Engineer","Trade Show Manager","K8L0M2N6","Forensic Accountant","Patient Care Coordinator","V8W1X0Y9","Title Not Listed","Undefined Position","Environmental Health Engineer","Senior UX/UI Designer","Social Media Analyst","Benefits Management Mailbox","Sales Support Mailbox"]

input_data = tokenizer(new_titles, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    predictions = model(**input_data)
predicted_classes = torch.argmax(predictions.logits, dim=1)


for title, class_idx in zip(new_titles, predicted_classes):
    print(f"Title: {title}")
    print(f"Predicted Class: {class_to_label_mapping[class_idx.item()]}")
    print("-" * 50)



Title: After Hours MB
Predicted Class: title
--------------------------------------------------
Title: A3B6C9D2
Predicted Class: error
--------------------------------------------------
Title: Technical Queries MB
Predicted Class: title
--------------------------------------------------
Title: Visual Designer II
Predicted Class: title
--------------------------------------------------
Title: Agricultural Consultant
Predicted Class: title
--------------------------------------------------
Title: Renewable Energy Consultant
Predicted Class: title
--------------------------------------------------
Title: Product Development Manager
Predicted Class: title
--------------------------------------------------
Title: Flight Operations Specialist
Predicted Class: title
--------------------------------------------------
Title: Demo Job Role
Predicted Class: placeholder
--------------------------------------------------
Title: Blockchain Developer
Predicted Class: title
---------------------------

In [25]:
import datetime
import torch.nn.functional as F
import random

dataloader = DataLoader(data, sampler=SequentialSampler(data), batch_size=batch_size)

for i, (input_ids, attention_mask, true_labels) in enumerate(dataloader):
    print(f"Dataloader label for batch {i}: {true_labels}")
    print(f"Original dataframe label for batch {i}: {df['class'].values[i*batch_size: (i+1)*batch_size]}")
    if i > 150:  # Print for the first 150 batches as an example, adjust as needed.
        break


all_predictions = []
all_confidences = []
all_true_labels = df['class'].values  # Get true labels from the dataframe
titles = df['organizationalPerson.title'].values  # Get titles from the dataframe


start_time = datetime.datetime.now()
last_print_time = start_time

num_batches_processed = 0
print_interval_minutes = 1

# Randomly shuffle and pick 50 batches
dataloader_batches = list(dataloader)
#random.shuffle(dataloader_batches) # randomizer
#sampled_batches = dataloader_batches[:50] #sampler
sampled_batches = dataloader_batches

print("Batches NOT shuffled and ALL selected. Starting to run predictions -")

for batch in sampled_batches:
    input_ids, attention_mask, true_labels = batch
    
    with torch.no_grad():
        predictions = model(input_ids=input_ids, attention_mask=attention_mask)
    predicted_classes = torch.argmax(predictions.logits, dim=1)
    all_predictions.extend(predicted_classes)

    # Convert logits to probabilities
    probabilities = F.softmax(predictions.logits, dim=1)
    predicted_confidences = [prob[pred].item() for prob, pred in zip(probabilities, predicted_classes)]
    all_confidences.extend(predicted_confidences)
    
    num_batches_processed += 1
    current_time = datetime.datetime.now()
    elapsed_since_last_print = (current_time - last_print_time).total_seconds() / 60  # Convert to minutes

    if elapsed_since_last_print >= print_interval_minutes:
        print(f"It's now {current_time.strftime('%H:%M')}, {num_batches_processed} batches predicted.")
        last_print_time = current_time

outliers = [(i, titles[i], all_predictions[i], all_true_labels[i], all_confidences[i]) for i in range(len(all_predictions)) if all_predictions[i] != all_true_labels[i]]

# Display outliers
print("Predictions Complete. Displaying Outliers:")
for i, title, pred, true, conf in outliers:
    print(f"Index: {i}")
    print(f"Title: {title}")
    print(f"Predicted Class: {class_to_label_mapping[pred.item()]} with confidence: {conf*100:.2f}%")
    print(f"True Class: {class_to_label_mapping[true]}")
    print("-" * 50)


Dataloader label for batch 0: tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5])
Original dataframe label for batch 0: [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
Dataloader label for batch 1: tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5])
Original dataframe label for batch 1: [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
Dataloader label for batch 2: tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5])
Original dataframe label for batch 2: [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
Dataloader label for batch 3: tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5])
Original dataframe label for batch 3: [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
Dataloader l

In [18]:
label_dict = {
    'title': 0,
    'placeholder': 1,
    'mailbox': 2,
    'non-english': 3,
    'ownership': 4,
    'error': 5
}

class_to_label_mapping = {v: k for k, v in label_dict.items()}

new_titles = ["After Hours MB","A3B6C9D2","Technical Queries MB","Visual Designer II","Agricultural Consultant","Renewable Energy Consultant","Product Development Manager","Flight Operations Specialist","Demo Job Role","Blockchain Developer","Data Analytics Specialist","P3Q5R7S4","Content Marketing Associate","Client Relations Mailbox","F4G7H1J5","Technical Support Engineer","Trade Show Manager","K8L0M2N6","Forensic Accountant","Patient Care Coordinator","V8W1X0Y9","Title Not Listed","Undefined Position","Environmental Health Engineer","Senior UX/UI Designer","Social Media Analyst","Benefits Management Mailbox","Sales Support Mailbox"]

input_data = tokenizer(new_titles, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    predictions = model(**input_data)
predicted_classes = torch.argmax(predictions.logits, dim=1)


for title, class_idx in zip(new_titles, predicted_classes):
    print(f"Title: {title}")
    print(f"Predicted Class: {class_to_label_mapping[class_idx.item()]}")
    print("-" * 50)



Title: After Hours MB
Predicted Class: title
--------------------------------------------------
Title: A3B6C9D2
Predicted Class: error
--------------------------------------------------
Title: Technical Queries MB
Predicted Class: title
--------------------------------------------------
Title: Visual Designer II
Predicted Class: title
--------------------------------------------------
Title: Agricultural Consultant
Predicted Class: title
--------------------------------------------------
Title: Renewable Energy Consultant
Predicted Class: title
--------------------------------------------------
Title: Product Development Manager
Predicted Class: title
--------------------------------------------------
Title: Flight Operations Specialist
Predicted Class: title
--------------------------------------------------
Title: Demo Job Role
Predicted Class: placeholder
--------------------------------------------------
Title: Blockchain Developer
Predicted Class: title
---------------------------