# Initialization

In [1]:
# Import necessary libraries
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder

# Read in the data
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
df = pd.read_csv('C://Offline_Storage//radiantClass//All_DOC-and-DMT_data.csv')

# One seed shared by every split below so a re-run reproduces the same partitions.
SEED = 42

# Optional: Data cleaning
# Retain commas, and convert to lowercase.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by default,
# which would silently turn this cleaning step into a no-op. The raw string avoids the
# invalid "\s" escape warning.
df['SOC18_DMT'] = df['SOC18_DMT'].str.lower().str.replace(r'[^a-z0-9,\s]', '', regex=True)

# Check for non-string rows
non_string_rows = df[df['SOC18_DMT'].apply(lambda x: not isinstance(x, str))]
print(non_string_rows)

# Check for NaN in 'SOC18_DMT' column
print(df[df['SOC18_DMT'].isna()])

# Drop NaN rows.
# This must happen BEFORE the labels are encoded; otherwise the label array keeps entries
# for dropped rows and no longer lines up with the text column.
df = df.dropna(subset=['SOC18_DMT'])

# Instantiate the encoder
encoder = LabelEncoder()

# Fit and transform the labels to produce integer encoded labels
encoded_labels = encoder.fit_transform(df['SOC18_DOC'].values)
assert len(encoded_labels) == len(df), "texts and labels must have the same length"

# Convert to PyTorch tensor
labels = torch.tensor(encoded_labels)

# Load BERT tokenizer and model.
# The classifier head is sized from the classes actually present in the data instead of a
# hard-coded count, so it cannot drift out of sync with the label encoder.
num_labels = len(encoder.classes_)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Splitting the dataset 80-10-10
train, temp = train_test_split(df, test_size=0.20, random_state=SEED)
valid, test = train_test_split(temp, test_size=0.50, random_state=SEED)

# Split texts/labels without stratifying, using the same seed as the DataFrame split above
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['SOC18_DMT'].tolist(), encoded_labels, test_size=0.2, random_state=SEED
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=SEED
)

# Tokenize the data for each split (each split is padded to its own longest sequence)
train_encodings = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt")
val_encodings = tokenizer(val_texts, padding=True, truncation=True, return_tensors="pt")
test_encodings = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")

# Convert labels for each split to torch tensors
train_labels = torch.tensor(train_labels).long()
val_labels = torch.tensor(val_labels).long()
test_labels = torch.tensor(test_labels).long()


print(f"Train size: {len(train_labels)}")
print(f"Validation size: {len(val_labels)}")
print(f"Test size: {len(test_labels)}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Empty DataFrame
Columns: [SOC18_DOC, SOC18_DMT]
Index: []
Empty DataFrame
Columns: [SOC18_DOC, SOC18_DMT]
Index: []
Train size: 5216
Validation size: 652
Test size: 652


In [2]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32  # You can adjust this value based on your computer's capabilities


def _build_dataset(encodings, targets):
    """Bundle token ids, attention masks and labels of one split into a TensorDataset."""
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], targets)


# One TensorDataset per split
train_data = _build_dataset(train_encodings, train_labels)
val_data = _build_dataset(val_encodings, val_labels)
test_data = _build_dataset(test_encodings, test_labels)

# Training batches are drawn in random order; validation and test keep dataset order
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)
test_sampler = SequentialSampler(test_data)

train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)



In [3]:
from torch.optim import AdamW

# Optimizer over all model parameters. AdamW is the Adam optimizer with weight decay fix.
learning_rate = 2e-5  # You can adjust the learning rate
optimizer = AdamW(model.parameters(), lr=learning_rate)


In [4]:
# Function to print current activity and time
from datetime import datetime

def print_activity(activity):
    """Print a "Starting <activity> at <time>..." progress line.

    Args:
        activity: Description of the step that is about to start; it is
            interpolated into the printed message.
    """
    # Imported locally so the helper keeps working even if the notebook-level name
    # `datetime` is rebound to the module by a plain `import datetime` in another cell.
    from datetime import datetime as _datetime

    current_time = _datetime.now().strftime('%I:%M %p')  # 12-hour clock, e.g. '01:37 PM'
    print(f"Starting {activity} at {current_time}...")


# Load and Set to Evaluation Mode

In [None]:
# Load the model
# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary code.
# Only load checkpoints you created yourself or otherwise trust.
model = torch.load(
    "fine_tuned_model_entire.pth",
    # Keep the saved device when CUDA is present; otherwise remap to CPU so a checkpoint
    # saved from a GPU session still loads on a CPU-only machine.
    map_location=None if torch.cuda.is_available() else "cpu",
    # The loaded object is used directly as a module below, i.e. the file holds a whole
    # pickled model rather than a state_dict, so weights-only loading cannot be used.
    weights_only=False,
)
# Ensure the model is in evaluation mode
model.eval()

# `model` is now a different object from the one the optimizer was created with.
# Rebuild the optimizer (same learning rate) so that any later training step updates
# the loaded model's parameters instead of the discarded model's.
optimizer = AdamW(model.parameters(), lr=optimizer.defaults["lr"])

# Verify that CUDA is available

In [5]:
# Report whether a CUDA device is usable and which CUDA version this torch build targets
cuda_ready = torch.cuda.is_available()
cuda_build = torch.version.cuda
print(f"Is Cuda available?: {cuda_ready}! :-)")
print(f"CUDA version: {cuda_build}")
# print(f"PyTorch version: {torch.version}")

Is Cuda available?: True! :-)
CUDA version: 11.8


In [6]:
# Quick re-check of CUDA availability
cuda_ready = torch.cuda.is_available()
print(f"Is Cuda available?: {cuda_ready}! :-)")

Is Cuda available?: True! :-)


# Training Loop (~24 min on first pass) 

In [20]:
epochs = 3  # Number of training epochs. You can adjust this.
log_every = 5  # Print a progress line once every `log_every` batches
batchcount = 0  # Running batch counter across all epochs (intentionally not reset per epoch)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):

    current_loopTime = datetime.now().strftime('%I:%M %p')  # 12-hour clock, e.g. '01:37 PM'
    # Epochs are reported 1-based here so the banner agrees with the loss lines below.
    print(f"Starting Training Epoch #{epoch + 1} at {current_loopTime} <3")
    model.train()
    total_loss = 0

    for batch in train_dataloader:

        # Load batch to device
        batchcount += 1
        b_input_ids, b_input_mask, b_labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        if batchcount % log_every == 0:
            # Only format the timestamp when it is actually printed
            current_batchTime = datetime.now().strftime('%I:%M %p')
            print(f"Batch {batchcount} loaded at {current_batchTime}! :-)")

        # Clear out the gradients left over from the previous step
        model.zero_grad()

        # Forward pass; passing `labels` makes the model return the loss alongside the logits
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()

        # Update parameters
        optimizer.step()

    print("Training Epoch Complete")
    # max(..., 1) guards the average against an empty dataloader
    avg_train_loss = total_loss / max(len(train_dataloader), 1)
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f}")

    # Start validation loop
    model.eval()  # Set the model to evaluation mode

    val_total_loss = 0

    with torch.no_grad():  # Don't compute gradients for validation
        for batch in val_dataloader:
            # Move data to the device
            b_input_ids, b_input_mask, b_labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)

            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            val_loss = outputs.loss
            val_total_loss += val_loss.item()

    val_avg_loss = val_total_loss / max(len(val_dataloader), 1)
    print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {val_avg_loss:.4f}")


Starting Training Epoch #0 at 04:22 PM <3
Batch 5 loaded at 04:22 PM! :-)
Batch 10 loaded at 04:22 PM! :-)
Batch 15 loaded at 04:22 PM! :-)
Batch 20 loaded at 04:22 PM! :-)
Batch 25 loaded at 04:22 PM! :-)
Batch 30 loaded at 04:22 PM! :-)
Batch 35 loaded at 04:22 PM! :-)
Batch 40 loaded at 04:22 PM! :-)
Batch 45 loaded at 04:22 PM! :-)
Batch 50 loaded at 04:22 PM! :-)
Batch 55 loaded at 04:22 PM! :-)
Batch 60 loaded at 04:22 PM! :-)
Batch 65 loaded at 04:22 PM! :-)
Batch 70 loaded at 04:22 PM! :-)
Batch 75 loaded at 04:22 PM! :-)
Batch 80 loaded at 04:22 PM! :-)
Batch 85 loaded at 04:22 PM! :-)
Batch 90 loaded at 04:22 PM! :-)
Batch 95 loaded at 04:22 PM! :-)
Batch 100 loaded at 04:22 PM! :-)
Batch 105 loaded at 04:22 PM! :-)
Batch 110 loaded at 04:22 PM! :-)
Batch 115 loaded at 04:22 PM! :-)
Batch 120 loaded at 04:22 PM! :-)
Batch 125 loaded at 04:22 PM! :-)
Batch 130 loaded at 04:22 PM! :-)
Batch 135 loaded at 04:22 PM! :-)
Batch 140 loaded at 04:22 PM! :-)
Batch 145 loaded at 04:22 

# Save the Model! :-)

In [21]:
# Persist the whole model object to a single file
checkpoint_path = "GPU_v1_9-7-23-430pm.pth"
torch.save(model, checkpoint_path)

# Fine-Tuned Model: Predictions and Evaluation

In [None]:
# Label name -> class index
label_dict = {
    'title': 0,
    'placeholder': 1,
    'mailbox': 2,
    'non-english': 3,
    'ownership': 4,
    'error': 5
}

# Reverse lookup: class index -> label name
class_to_label_mapping = {v: k for k, v in label_dict.items()}

# Hand-written sample titles used as a quick spot check
new_titles = ["After Hours MB","A3B6C9D2","Technical Queries MB","Visual Designer II","Agricultural Consultant","Renewable Energy Consultant","Product Development Manager","Flight Operations Specialist","Demo Job Role","Blockchain Developer","Data Analytics Specialist","P3Q5R7S4","Content Marketing Associate","Client Relations Mailbox","F4G7H1J5","Technical Support Engineer","Trade Show Manager","K8L0M2N6","Forensic Accountant","Patient Care Coordinator","V8W1X0Y9","Title Not Listed","Undefined Position","Environmental Health Engineer","Senior UX/UI Designer","Social Media Analyst","Benefits Management Mailbox","Sales Support Mailbox"]

# Inference only: make sure dropout is disabled, and put the inputs on whatever device the
# model currently lives on (the tokenizer returns CPU tensors; the model may be on CUDA).
model.eval()
model_device = next(model.parameters()).device

input_data = tokenizer(new_titles, padding=True, truncation=True, return_tensors="pt").to(model_device)
with torch.no_grad():
    predictions = model(**input_data)
predicted_classes = torch.argmax(predictions.logits, dim=1)


for title, class_idx in zip(new_titles, predicted_classes.tolist()):
    # .get(): report an index that label_dict does not cover instead of raising KeyError
    predicted_label = class_to_label_mapping.get(class_idx, f"<unmapped class {class_idx}>")
    print(f"Title: {title}")
    print(f"Predicted Class: {predicted_label}")
    print("-" * 50)



In [None]:
# Import the class, not the module: a bare `import datetime` would rebind the notebook-wide
# name `datetime`, which other cells call as `datetime.now()`.
from datetime import datetime
import torch.nn.functional as F
import random

# NOTE(review): `data`, df['class'] and df['organizationalPerson.title'] are not defined by
# any cell visible in this notebook -- confirm where they come from before running this cell.
dataloader = DataLoader(data, sampler=SequentialSampler(data), batch_size=batch_size)

# Sanity check: labels coming out of the dataloader should match the dataframe rows at the
# same positions. Only the first `max_debug_batches` batches are printed.
max_debug_batches = 150
for i, (input_ids, attention_mask, true_labels) in enumerate(dataloader):
    if i >= max_debug_batches:
        break
    print(f"Dataloader label for batch {i}: {true_labels}")
    print(f"Original dataframe label for batch {i}: {df['class'].values[i*batch_size: (i+1)*batch_size]}")


all_predictions = []  # predicted class index (plain int) per row, in dataloader order
all_confidences = []  # softmax probability of the predicted class per row
all_true_labels = df['class'].values  # Get true labels from the dataframe
titles = df['organizationalPerson.title'].values  # Get titles from the dataframe


start_time = datetime.now()
last_print_time = start_time

num_batches_processed = 0
print_interval_minutes = 1

# All batches are used, in dataloader order. The order matters: predictions are matched to
# `titles` / `all_true_labels` by position below, so shuffling or sub-sampling batches here
# would misalign them.
sampled_batches = dataloader

print("Batches NOT shuffled and ALL selected. Starting to run predictions -")

# Inference only: disable dropout and run on whatever device the model currently lives on.
model.eval()
model_device = next(model.parameters()).device

for batch in sampled_batches:
    input_ids, attention_mask, true_labels = batch
    input_ids = input_ids.to(model_device)
    attention_mask = attention_mask.to(model_device)

    with torch.no_grad():
        predictions = model(input_ids=input_ids, attention_mask=attention_mask)
    predicted_classes = torch.argmax(predictions.logits, dim=1)
    # Store plain ints so later comparisons and dict lookups do not involve tensors
    all_predictions.extend(predicted_classes.tolist())

    # Convert logits to probabilities and keep the probability of the predicted class
    probabilities = F.softmax(predictions.logits, dim=1)
    predicted_confidences = probabilities.gather(1, predicted_classes.unsqueeze(1)).squeeze(1)
    all_confidences.extend(predicted_confidences.tolist())

    num_batches_processed += 1
    current_time = datetime.now()
    elapsed_since_last_print = (current_time - last_print_time).total_seconds() / 60  # Convert to minutes

    if elapsed_since_last_print >= print_interval_minutes:
        print(f"It's now {current_time.strftime('%H:%M')}, {num_batches_processed} batches predicted.")
        last_print_time = current_time

# Rows whose predicted class differs from the dataframe label
outliers = [
    (i, titles[i], pred, all_true_labels[i], all_confidences[i])
    for i, pred in enumerate(all_predictions)
    if pred != all_true_labels[i]
]

# Display outliers
print("Predictions Complete. Displaying Outliers:")
for i, title, pred, true, conf in outliers:
    # .get(): report an index that label_dict does not cover instead of raising KeyError
    predicted_label = class_to_label_mapping.get(pred, f"<unmapped class {pred}>")
    print(f"Index: {i}")
    print(f"Title: {title}")
    print(f"Predicted Class: {predicted_label} with confidence: {conf*100:.2f}%")
    print(f"True Class: {class_to_label_mapping[true]}")
    print("-" * 50)


In [None]:
# Label name -> class index
label_dict = {
    'title': 0,
    'placeholder': 1,
    'mailbox': 2,
    'non-english': 3,
    'ownership': 4,
    'error': 5
}

# Reverse lookup: class index -> label name
class_to_label_mapping = {v: k for k, v in label_dict.items()}

# Hand-written sample titles used as a quick spot check
new_titles = ["After Hours MB","A3B6C9D2","Technical Queries MB","Visual Designer II","Agricultural Consultant","Renewable Energy Consultant","Product Development Manager","Flight Operations Specialist","Demo Job Role","Blockchain Developer","Data Analytics Specialist","P3Q5R7S4","Content Marketing Associate","Client Relations Mailbox","F4G7H1J5","Technical Support Engineer","Trade Show Manager","K8L0M2N6","Forensic Accountant","Patient Care Coordinator","V8W1X0Y9","Title Not Listed","Undefined Position","Environmental Health Engineer","Senior UX/UI Designer","Social Media Analyst","Benefits Management Mailbox","Sales Support Mailbox"]

# Inference only: make sure dropout is disabled, and put the inputs on whatever device the
# model currently lives on (the tokenizer returns CPU tensors; the model may be on CUDA).
model.eval()
model_device = next(model.parameters()).device

input_data = tokenizer(new_titles, padding=True, truncation=True, return_tensors="pt").to(model_device)
with torch.no_grad():
    predictions = model(**input_data)
predicted_classes = torch.argmax(predictions.logits, dim=1)


for title, class_idx in zip(new_titles, predicted_classes.tolist()):
    # .get(): report an index that label_dict does not cover instead of raising KeyError
    predicted_label = class_to_label_mapping.get(class_idx, f"<unmapped class {class_idx}>")
    print(f"Title: {title}")
    print(f"Predicted Class: {predicted_label}")
    print("-" * 50)

