In [None]:
# After training, save the MultiLabelBinarizer
import joblib
joblib.dump(mlb, "mlb.joblib")

['mlb.joblib']

Truncated coding

#
#

In [None]:
import shutil
shutil.make_archive('/content/session', 'zip', '/content')

In [None]:
import csv
import torch
import numpy as np
import joblib
import re
import os
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm import tqdm
from torch.optim import AdamW
import matplotlib.pyplot as plt  # REQUIRED FOR PLOTTING
import random

def jaccard_similarity(y_true, y_pred, mlb_classes):
    scores = []
    for true_row, pred_row in zip(y_true, y_pred):
        true_skills = [mlb_classes[i] for i in np.where(true_row == 1)[0]]
        pred_skills = [mlb_classes[i] for i in np.where(pred_row == 1)[0]]
        overlap = []
        for t in true_skills:
            for p in pred_skills:
                tokens_t = set(re.findall(r'\w+', t.lower()))  # Tokenize and lowercase
                tokens_p = set(re.findall(r'\w+', p.lower()))  # Tokenize and lowercase
                if tokens_t or tokens_p:
                    overlap.append(len(tokens_t & tokens_p) / len(tokens_t | tokens_p))  # Jaccard-like overlap
        if overlap:
            scores.append(np.mean(overlap))  # Average overlap per true/pred skill pair
    return np.mean(scores) if scores else 0.0  # Return the mean overlap score


# Custom dataset class
class JobSkillDataset(Dataset):
    def __init__(self, summaries, labels, tokenizer, max_len=256):
        self.summaries = summaries
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.summaries)

    def __getitem__(self, idx):
        summary = str(self.summaries[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            summary,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }

# Load dataset from CSV using csv module
def load_data_from_csv(file_path):
    summaries = []
    skill_lists = []
    with open(file_path, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            if 'filtered_skills' in row and 'job_summary' in row:
                summaries.append(row['job_summary'])
                skill_lists.append(row['filtered_skills'].split(','))
    return summaries, skill_lists

# TRAIN FUNCTION WITH TRAINING AND VALIDATION METRICS TRACKING
def train_model(dataset, model, tokenizer, mlb, save_dir, epochs=3, batch_size=8, learning_rate=5e-5, threshold=0.3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # SPLIT DATASET INTO TRAIN, VALIDATION, AND TEST SETS
    total_size = len(dataset)
    train_size = int(0.8 * total_size)
    val_size = int(0.1 * total_size)
    test_size = total_size - train_size - val_size
    train_ds, val_ds, test_ds = random_split(dataset, [train_size, val_size, test_size])

    cpu_count = os.cpu_count() #use max num of CPU cores to load data
    train_loader = DataLoader(train_ds, batch_size=batch_size, num_workers=cpu_count-1, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, num_workers=cpu_count-1)
    test_loader = DataLoader(test_ds, batch_size=batch_size)

    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # LISTS TO TRACK TRAINING LOSS, VALIDATION LOSS, AND VALIDATION ACCURACY
    train_losses = []
    val_losses = []
    val_accuracies = []
    val_jaccards = []  # Added to track Jaccard similarity

    os.makedirs(save_dir, exist_ok=True)
    log_file_path = os.path.join(save_dir, "validation_predictions.log")
    log_file = open(log_file_path, "w", encoding="utf-8")

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        print(f"\nEpoch {epoch+1}/{epochs}")
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        print(f"Average Training Loss: {avg_train_loss:.4f}")

        # EVALUATE ON VALIDATION SET
        model.eval()
        val_loss = 0
        correct_preds = 0
        total_preds = 0
        total_jaccard = 0  # INITIALIZE JACCARD SIMILARITY ACCUMULATOR
        all_preds = []
        all_true = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

                probs = torch.sigmoid(outputs.logits)
                preds = (probs > threshold).float()  # BINARIZE PREDICTIONS (THRESHOLD = 0.3)

                # Accumulate for metric computation
                all_preds.append(preds.cpu())
                all_true.append(labels.cpu())
                # CALCULATE EXACT MATCH FOR THIS BATCH
                correct_preds += torch.sum((preds == labels).all(dim=1)).item()
                total_preds += labels.size(0)

                # CALCULATE JACCARD SIMILARITY FOR THIS BATCH
                batch_jaccard = jaccard_similarity(labels.cpu().numpy(), preds.cpu().numpy(), mlb.classes_)
                total_jaccard += batch_jaccard * labels.size(0)  # WEIGHT JACCARD BY BATCH SIZE

                # Log a few examples to the file
                for i in range(min(3, len(labels))):  # Log only first 3 examples per batch to prevent overwhelming logs
                    expected_indices = labels[i].cpu().numpy().astype(int).nonzero()[0]
                    predicted_indices = preds[i].cpu().numpy().astype(int).nonzero()[0]

                    expected_skills = [mlb.classes_[idx] for idx in expected_indices]
                    predicted_skills = [mlb.classes_[idx] for idx in predicted_indices]

                    print(f"Expected Skills: {expected_skills}\n")
                    print(f"Predicted Skills: {predicted_skills}\n")
                    print(f"Jaccard Similarity: {batch_jaccard:.4f}\n")

                    print("-" * 80 + "\n")

        # Calculate and store validation metrics
        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = correct_preds / total_preds
        avg_jaccard = total_jaccard / total_preds  # AVERAGE JACCARD SIMILARITY FOR THE VALIDATION SET

        # Store metrics for plotting
        val_losses.append(avg_val_loss)
        val_accuracies.append(val_accuracy)
        val_jaccards.append(avg_jaccard)

        # ////////////////
         # Compute metrics
        all_preds_tensor = torch.cat(all_preds).numpy()
        all_true_tensor = torch.cat(all_true).numpy()

        val_precision_micro = precision_score(all_true_tensor, all_preds_tensor, average='micro', zero_division=0)
        val_recall_micro = recall_score(all_true_tensor, all_preds_tensor, average='micro', zero_division=0)
        val_f1_micro = f1_score(all_true_tensor, all_preds_tensor, average='micro', zero_division=0)

        precision_macro = precision_score(all_true_tensor, all_preds_tensor, average='macro', zero_division=0)
        recall_macro = recall_score(all_true_tensor, all_preds_tensor, average='macro', zero_division=0)
        f1_macro = f1_score(all_true_tensor, all_preds_tensor, average='macro', zero_division=0)

        # Print validation metrics
        print(f"Validation Loss: {avg_val_loss:.4f}")
        print(f"Validation Accuracy (Exact Match): {val_accuracy:.4f}")
        print(f"Validation Jaccard Similarity: {avg_jaccard:.4f}")
        print(f"Val Precision (micro): {val_precision_micro:.4f}")
        print(f"Val Recall (micro):    {val_recall_micro:.4f}")
        print(f"Val F1 Score (micro):  {val_f1_micro:.4f}")
        print(f"Val Precision (macro): {precision_macro:.4f}")
        print(f"Val Recall (macro):    {recall_macro:.4f}")
        print(f"Val F1 Score (macro):  {f1_macro:.4f}")
        #  ////////////////

    # PLOT TRAINING AND VALIDATION METRICS
    epochs_range = range(1, epochs + 1)
    plt.figure(figsize=(15, 5))

    # PLOT 1: TRAINING & VALIDATION LOSS
    plt.subplot(1, 3, 1)
    plt.plot(epochs_range, train_losses, label="Train Loss", color='blue')
    plt.plot(epochs_range, val_losses, label="Val Loss", color='orange')
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training & Validation Loss")
    plt.legend()
    plt.grid(True)

    # PLOT 2: VALIDATION ACCURACY
    plt.subplot(1, 3, 2)
    plt.plot(epochs_range, val_accuracies, label="Exact Match", color='green')
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.title("Validation Accuracy")
    plt.grid(True)

    # PLOT 3: JACCARD SIMILARITY
    plt.subplot(1, 3, 3)
    plt.plot(epochs_range, val_jaccards, label="Jaccard", color='red')
    plt.xlabel("Epoch")
    plt.ylabel("Similarity")
    plt.title("Jaccard Similarity")
    plt.grid(True)

    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, "training_metrics.png"))
    plt.show()

    # SAVE MODEL AND TOKENIZER TO DIRECTORY
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    log_file.close()
    print(f"Validation predictions logged to: {log_file_path}")

    return model, test_loader

# Evaluate the model and print metrics
def evaluate_model(model, dataloader, mlb, threshold=0.3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    all_probs = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.sigmoid(outputs.logits).cpu().numpy()
            all_probs.extend(probs)
            all_labels.extend(labels.cpu().numpy())

    y_prob = np.array(all_probs)
    y_true = np.array(all_labels)
    y_pred = (y_prob > threshold).astype(int)

    print("\nEvaluation Metrics:")
    print(f"Precision (micro): {precision_score(y_true, y_pred, average='micro', zero_division=0):.4f}")
    print(f"Recall (micro):    {recall_score(y_true, y_pred, average='micro', zero_division=0):.4f}")
    print(f"F1 Score (micro):  {f1_score(y_true, y_pred, average='micro', zero_division=0):.4f}")
    print(f"Precision (macro): {precision_score(y_true, y_pred, average='macro', zero_division=0):.4f}")
    print(f"Recall (macro):    {recall_score(y_true, y_pred, average='macro', zero_division=0):.4f}")
    print(f"F1 Score (macro):  {f1_score(y_true, y_pred, average='macro', zero_division=0):.4f}")
    print(f"Exact Match:       {np.mean(np.all(y_true == y_pred, axis=1)):.4f}")
    print(f"Token Overlap:     {jaccard_similarity(y_true, y_pred, mlb.classes_):.4f}")

# Main entry point
def main(csv_path, model_dir="./myModel", subset_fraction=1.0):
    summaries, skill_lists = load_data_from_csv(csv_path)

    # USE ONLY A FRACTION OF THE DATA
    if 0 < subset_fraction < 1.0:
        total_size = len(summaries)
        subset_size = int(total_size * subset_fraction)
        subset_indices = random.sample(range(total_size), subset_size)

        summaries = [summaries[i] for i in subset_indices]
        skill_lists = [skill_lists[i] for i in subset_indices]
        print(f"Using a subset of the data: {subset_size} out of {total_size} entries")

    # Encode labels
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(skill_lists)
    joblib.dump(mlb, "mlb.joblib")

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(mlb.classes_))

    dataset = JobSkillDataset(summaries, y, tokenizer)

    # Train model and get test data
    model, test_loader = train_model(dataset, model, tokenizer, mlb, model_dir, batch_size=16, epochs=5, threshold=0.2)

    # Evaluate on test data
    evaluate_model(model, test_loader, mlb, threshold=0.2)

# Execute script
if __name__ == "__main__":
  csv_file="tech_industry_top500skills.csv"
  model_dir= f"./myModel_{os.path.basename(csv_file).split('.')[0]}"
  main(csv_file, model_dir, subset_fraction=1)  # Replace with your actual dataset path


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/5


100%|██████████| 10571/10571 [1:04:00<00:00,  2.75it/s]

Average Training Loss: 0.0329





[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Jaccard Similarity: 0.1794

--------------------------------------------------------------------------------

Expected Skills: [' budgeting', ' communication', ' design', ' documentation', ' leadership', ' manufacturing', ' safety', ' testing', 'mechanical engineering']

Predicted Skills: [' communication', ' leadership', ' project management', 'mechanical engineering']

Jaccard Similarity: 0.1794

--------------------------------------------------------------------------------

Expected Skills: [' analytical skills', ' coaching', ' communication', ' compassion', ' conflict resolution', ' critical care', ' critical thinking', ' decisionmaking', ' delegation', ' empathy', ' leadership', ' medication administration', ' mentoring', ' organizational skills', ' problemsolving', ' professional development', ' radiology', ' social work', ' supervision', ' teamwork', ' time management', ' wound care', 'nursing']

Predicted Skill

100%|██████████| 10571/10571 [1:04:02<00:00,  2.75it/s]

Average Training Loss: 0.0277





[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Jaccard Similarity: 0.2032

--------------------------------------------------------------------------------

Expected Skills: [' budgeting', ' communication', ' design', ' documentation', ' leadership', ' manufacturing', ' safety', ' testing', 'mechanical engineering']

Predicted Skills: [' communication', ' leadership', ' project management', ' team leadership', 'mechanical engineering']

Jaccard Similarity: 0.2032

--------------------------------------------------------------------------------

Expected Skills: [' analytical skills', ' coaching', ' communication', ' compassion', ' conflict resolution', ' critical care', ' critical thinking', ' decisionmaking', ' delegation', ' empathy', ' leadership', ' medication administration', ' mentoring', ' organizational skills', ' problemsolving', ' professional development', ' radiology', ' social work', ' supervision', ' teamwork', ' time management', ' wound care', 'nursin

100%|██████████| 10571/10571 [1:04:00<00:00,  2.75it/s]

Average Training Loss: 0.0266





[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Jaccard Similarity: 0.2072

--------------------------------------------------------------------------------

Expected Skills: [' budgeting', ' communication', ' design', ' documentation', ' leadership', ' manufacturing', ' safety', ' testing', 'mechanical engineering']

Predicted Skills: [' communication', ' leadership', ' project management', ' team leadership', 'mechanical engineering']

Jaccard Similarity: 0.2072

--------------------------------------------------------------------------------

Expected Skills: [' analytical skills', ' coaching', ' communication', ' compassion', ' conflict resolution', ' critical care', ' critical thinking', ' decisionmaking', ' delegation', ' empathy', ' leadership', ' medication administration', ' mentoring', ' organizational skills', ' problemsolving', ' professional development', ' radiology', ' social work', ' supervision', ' teamwork', ' time management', ' wound care', 'nursin

 73%|███████▎  | 7750/10571 [46:59<17:02,  2.76it/s]

In [1]:
# import csv
# import torch
# import numpy as np
# import joblib
# import re
# import os
# from torch.utils.data import Dataset, DataLoader, random_split
# from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# from sklearn.preprocessing import MultiLabelBinarizer
# from sklearn.metrics import precision_score, recall_score, f1_score
# from tqdm import tqdm
# from torch.optim import AdamW
# import matplotlib.pyplot as plt
# import random

# def jaccard_similarity_individual(true_skills, pred_skills):
#     """Calculate Jaccard similarity between two sets of skills"""
#     overlap = []
#     for t in true_skills:
#         for p in pred_skills:
#             tokens_t = set(re.findall(r'\w+', t.lower()))
#             tokens_p = set(re.findall(r'\w+', p.lower()))
#             if tokens_t or tokens_p:
#                 overlap.append(len(tokens_t & tokens_p) / len(tokens_t | tokens_p))
#     return np.mean(overlap) if overlap else 0.0

# def jaccard_similarity(y_true, y_pred, mlb_classes):
#     """Calculate average Jaccard similarity across all samples"""
#     scores = []
#     for true_row, pred_row in zip(y_true, y_pred):
#         true_skills = [mlb_classes[i] for i in np.where(true_row == 1)[0]]
#         pred_skills = [mlb_classes[i] for i in np.where(pred_row == 1)[0]]

#         # Skip if either list is empty
#         if not true_skills or not pred_skills:
#             continue

#         scores.append(jaccard_similarity_individual(true_skills, pred_skills))
#     return np.mean(scores) if scores else 0.0

# # Custom dataset class
# class JobSkillDataset(Dataset):
#     def __init__(self, summaries, labels, tokenizer, max_len=256):
#         self.summaries = summaries
#         self.labels = labels
#         self.tokenizer = tokenizer
#         self.max_len = max_len

#     def __len__(self):
#         return len(self.summaries)

#     def __getitem__(self, idx):
#         summary = str(self.summaries[idx])
#         label = self.labels[idx]
#         encoding = self.tokenizer.encode_plus(
#             summary,
#             add_special_tokens=True,
#             max_length=self.max_len,
#             padding='max_length',
#             truncation=True,
#             return_attention_mask=True,
#             return_tensors='pt'
#         )
#         return {
#             'input_ids': encoding['input_ids'].flatten(),
#             'attention_mask': encoding['attention_mask'].flatten(),
#             'labels': torch.tensor(label, dtype=torch.float)
#         }

# # Load dataset from CSV using csv module
# def load_data_from_csv(file_path):
#     summaries = []
#     skill_lists = []
#     with open(file_path, newline='', encoding='utf-8') as f:
#         reader = csv.DictReader(f)
#         for row in reader:
#             if 'filtered_skills' in row and 'job_summary' in row:
#                 # Filter out empty skills
#                 skills = [s.strip() for s in row['filtered_skills'].split(',') if s.strip()]
#                 if skills:  # Only add if there are actual skills
#                     summaries.append(row['job_summary'])
#                     skill_lists.append(skills)
#     return summaries, skill_lists

# # TRAIN FUNCTION WITH TRAINING AND VALIDATION METRICS TRACKING
# def train_model(dataset, model, tokenizer, mlb, save_dir, epochs=12, batch_size=8, learning_rate=5e-5, pos_class_weight=10.0):
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     model.to(device)
#     print(f"Using device: {device}")

#     # SPLIT DATASET INTO TRAIN, VALIDATION, AND TEST SETS
#     total_size = len(dataset)
#     train_size = int(0.8 * total_size)
#     val_size = int(0.1 * total_size)
#     test_size = total_size - train_size - val_size
#     train_ds, val_ds, test_ds = random_split(dataset, [train_size, val_size, test_size])

#     cpu_count = os.cpu_count() or 1  # Fallback to 1 if None
#     workers = max(1, cpu_count-1)  # Use at least 1 worker

#     train_loader = DataLoader(train_ds, batch_size=batch_size, num_workers=workers, shuffle=True)
#     val_loader = DataLoader(val_ds, batch_size=batch_size, num_workers=workers)
#     test_loader = DataLoader(test_ds, batch_size=batch_size)

#     optimizer = AdamW(model.parameters(), lr=learning_rate)

#     # LISTS TO TRACK METRICS
#     train_losses = []
#     val_losses = []
#     val_accuracies = []
#     val_jaccards = []  # Track Jaccard similarities

#     os.makedirs(save_dir, exist_ok=True)
#     log_file_path = os.path.join(save_dir, "validation_predictions.log")
#     log_file = open(log_file_path, "w", encoding="utf-8")

#     best_val_loss = float('inf')
#     best_model = None

#     for epoch in range(epochs):
#         # TRAINING PHASE
#         model.train()
#         total_loss = 0
#         print(f"\nEpoch {epoch+1}/{epochs}")

#         for batch in tqdm(train_loader, desc="Training"):
#             optimizer.zero_grad()
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             labels = batch['labels'].to(device)

#             outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#             logits = outputs.logits

#             # Custom loss with positive class weight to handle imbalance
#             bce_loss = torch.nn.BCEWithLogitsLoss(pos_weight=torch.ones(labels.shape[1]).to(device) * pos_class_weight)
#             loss = bce_loss(logits, labels)

#             total_loss += loss.item()
#             loss.backward()
#             optimizer.step()

#         avg_train_loss = total_loss / len(train_loader)
#         train_losses.append(avg_train_loss)
#         print(f"Average Training Loss: {avg_train_loss:.4f}")

#         # VALIDATION PHASE
#         model.eval()
#         val_loss = 0
#         correct_preds = 0
#         total_preds = 0
#         all_jaccard_scores = []

#         with torch.no_grad():
#             for batch_idx, batch in enumerate(tqdm(val_loader, desc="Validation")):
#                 input_ids = batch['input_ids'].to(device)
#                 attention_mask = batch['attention_mask'].to(device)
#                 labels = batch['labels'].to(device)

#                 outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#                 logits = outputs.logits

#                 # Same custom loss as in training
#                 bce_loss = torch.nn.BCEWithLogitsLoss(pos_weight=torch.ones(labels.shape[1]).to(device) * pos_class_weight)
#                 loss = bce_loss(logits, labels)
#                 val_loss += loss.item()

#                 probs = torch.sigmoid(logits)
#                 preds = (probs > 0.1).float()  # Lower threshold to get more predictions

#                 # Calculate exact match counts
#                 correct_preds += torch.sum((preds == labels).all(dim=1)).item()
#                 total_preds += labels.size(0)

#                 # Log predictions for a few batches only to avoid overwhelming output
#                 if batch_idx < 2:  # Only log first 2 batches
#                     for i in range(min(3, len(labels))):  # And only first 3 samples of each batch
#                         true_indices = labels[i].cpu().numpy().nonzero()[0]
#                         pred_indices = preds[i].cpu().numpy().nonzero()[0]

#                         true_skills = [mlb.classes_[idx] for idx in true_indices]
#                         pred_skills = [mlb.classes_[idx] for idx in pred_indices]

#                         # Calculate individual Jaccard similarity
#                         if true_skills and pred_skills:
#                             ind_jaccard = jaccard_similarity_individual(true_skills, pred_skills)
#                             all_jaccard_scores.append(ind_jaccard)
#                         else:
#                             ind_jaccard = 0.0

#                         # Log to file and console
#                         log_str = (
#                             f"Sample {i} (Batch {batch_idx}):\n"
#                             f"  Expected: {true_skills}\n"
#                             f"  Predicted: {pred_skills}\n"
#                             f"  Individual Jaccard: {ind_jaccard:.4f}\n"
#                             f"{'-' * 80}\n"
#                         )
#                         log_file.write(log_str)
#                         print(log_str)

#                 # Calculate batch Jaccard scores for all samples
#                 batch_true = labels.cpu().numpy()
#                 batch_pred = preds.cpu().numpy()

#                 for i in range(len(batch_true)):
#                     true_indices = np.where(batch_true[i] == 1)[0]
#                     pred_indices = np.where(batch_pred[i] == 1)[0]

#                     true_skills = [mlb.classes_[idx] for idx in true_indices]
#                     pred_skills = [mlb.classes_[idx] for idx in pred_indices]

#                     if true_skills and pred_skills:
#                         ind_jaccard = jaccard_similarity_individual(true_skills, pred_skills)
#                         all_jaccard_scores.append(ind_jaccard)

#         # Calculate validation metrics
#         avg_val_loss = val_loss / len(val_loader)
#         val_losses.append(avg_val_loss)

#         val_accuracy = correct_preds / total_preds
#         val_accuracies.append(val_accuracy)

#         avg_jaccard = np.mean(all_jaccard_scores) if all_jaccard_scores else 0.0
#         val_jaccards.append(avg_jaccard)

#         # Print validation metrics
#         print(f"Validation Loss: {avg_val_loss:.4f}")
#         print(f"Validation Accuracy (Exact Match): {val_accuracy:.4f}")
#         print(f"Validation Jaccard Similarity: {avg_jaccard:.4f}")

#         # Save best model
#         if avg_val_loss < best_val_loss:
#             best_val_loss = avg_val_loss
#             best_model = {
#                 'epoch': epoch + 1,
#                 'model_state_dict': model.state_dict(),
#                 'optimizer_state_dict': optimizer.state_dict(),
#                 'val_loss': avg_val_loss,
#             }
#             print(f"New best model saved at epoch {epoch+1}")

#     # Save the best model
#     if best_model:
#         torch.save(best_model, os.path.join(save_dir, "best_model.pt"))
#         print(f"Best model from epoch {best_model['epoch']} saved")

#     # PLOT TRAINING AND VALIDATION METRICS
#     epochs_range = range(1, epochs + 1)
#     plt.figure(figsize=(15, 5))

#     # PLOT 1: TRAINING AND VALIDATION LOSS
#     plt.subplot(1, 3, 1)
#     plt.plot(epochs_range, train_losses, label="Train Loss", color='blue')
#     plt.plot(epochs_range, val_losses, label="Val Loss", color='orange')
#     plt.xlabel("Epoch")
#     plt.ylabel("Loss")
#     plt.title("Training and Validation Loss")
#     plt.legend()
#     plt.grid(True)

#     # PLOT 2: VALIDATION ACCURACY
#     plt.subplot(1, 3, 2)
#     plt.plot(epochs_range, val_accuracies, label="Accuracy", color='green')
#     plt.xlabel("Epoch")
#     plt.ylabel("Accuracy")
#     plt.title("Validation Accuracy (Exact Match)")
#     plt.grid(True)

#     # PLOT 3: VALIDATION JACCARD SIMILARITY
#     plt.subplot(1, 3, 3)
#     plt.plot(epochs_range, val_jaccards, label="Jaccard", color='red')
#     plt.xlabel("Epoch")
#     plt.ylabel("Jaccard Similarity")
#     plt.title("Validation Jaccard Similarity")
#     plt.grid(True)

#     plt.tight_layout()
#     plt.savefig(os.path.join(save_dir, "training_metrics.png"))
#     plt.show()

#     # SAVE MODEL AND TOKENIZER
#     model.save_pretrained(save_dir)
#     tokenizer.save_pretrained(save_dir)

#     log_file.close()
#     print(f"Validation predictions logged to: {log_file_path}")

#     return model, test_loader

# # Evaluate the model and print metrics
# def evaluate_model(model, dataloader, mlb, threshold=0.3):
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     model.eval()
#     all_probs = []
#     all_labels = []

#     print("\nEvaluating model on test set...")
#     with torch.no_grad():
#         for batch in tqdm(dataloader, desc="Testing"):
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             labels = batch['labels'].to(device)

#             outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#             probs = torch.sigmoid(outputs.logits).cpu().numpy()
#             all_probs.extend(probs)
#             all_labels.extend(labels.cpu().numpy())

#     y_prob = np.array(all_probs)
#     y_true = np.array(all_labels)

#     # Try different thresholds to find optimal one
#     thresholds = [0.01, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5]
#     best_f1 = 0
#     best_threshold = threshold

#     print("\nFinding optimal threshold:")
#     for t in thresholds:
#         y_pred = (y_prob > t).astype(int)
#         f1 = f1_score(y_true, y_pred, average='micro', zero_division=0)
#         print(f"Threshold {t:.1f} - F1 Score: {f1:.4f}")
#         if f1 > best_f1:
#             best_f1 = f1
#             best_threshold = t

#     print(f"\nUsing threshold: {best_threshold}")
#     y_pred = (y_prob > best_threshold).astype(int)

#     print("\nEvaluation Metrics:")
#     print(f"Precision (micro): {precision_score(y_true, y_pred, average='micro', zero_division=0):.4f}")
#     print(f"Recall (micro):    {recall_score(y_true, y_pred, average='micro', zero_division=0):.4f}")
#     print(f"F1 Score (micro):  {f1_score(y_true, y_pred, average='micro', zero_division=0):.4f}")
#     print(f"Precision (macro): {precision_score(y_true, y_pred, average='macro', zero_division=0):.4f}")
#     print(f"Recall (macro):    {recall_score(y_true, y_pred, average='macro', zero_division=0):.4f}")
#     print(f"F1 Score (macro):  {f1_score(y_true, y_pred, average='macro', zero_division=0):.4f}")
#     print(f"Exact Match:       {np.mean(np.all(y_true == y_pred, axis=1)):.4f}")
#     print(f"Token Overlap:     {jaccard_similarity(y_true, y_pred, mlb.classes_):.4f}")

#     # Print some example predictions
#     print("\nExample Predictions:")
#     for i in range(min(5, len(y_true))):
#         true_indices = np.where(y_true[i] == 1)[0]
#         pred_indices = np.where(y_pred[i] == 1)[0]

#         true_skills = [mlb.classes_[idx] for idx in true_indices]
#         pred_skills = [mlb.classes_[idx] for idx in pred_indices]

#         print(f"Example {i+1}:")
#         print(f"  True: {true_skills}")
#         print(f"  Pred: {pred_skills}")
#         print(f"  Top probabilities:")

#         # Get top 5 predicted skills by probability
#         top_indices = np.argsort(y_prob[i])[::-1][:5]
#         for idx in top_indices:
#             print(f"    {mlb.classes_[idx]}: {y_prob[i][idx]:.4f}")
#         print("-" * 40)

# # Check model predictions on examples
# def check_predictions(model, tokenizer, mlb, examples, threshold=0.1):
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     model.to(device)
#     model.eval()

#     for i, example in enumerate(examples):
#         encoding = tokenizer.encode_plus(
#             example,
#             add_special_tokens=True,
#             max_length=256,
#             padding='max_length',
#             truncation=True,
#             return_attention_mask=True,
#             return_tensors='pt'
#         )

#         input_ids = encoding['input_ids'].to(device)
#         attention_mask = encoding['attention_mask'].to(device)

#         with torch.no_grad():
#             outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#             logits = outputs.logits
#             probs = torch.sigmoid(logits).cpu().numpy()[0]

#         # Get top predictions
#         top_indices = np.argsort(probs)[::-1][:10]  # Top 10 predictions

#         print(f"\nExample {i+1}: {example[:100]}...")
#         print("Top predicted skills:")
#         for idx in top_indices:
#             if probs[idx] > threshold:
#                 print(f"  {mlb.classes_[idx]}: {probs[idx]:.4f}")

#         # Show all predictions above threshold
#         pred_indices = np.where(probs > threshold)[0]
#         pred_skills = [mlb.classes_[idx] for idx in pred_indices]
#         print(f"All predictions above threshold {threshold}: {pred_skills}")

# # Main entry point
# def main(csv_path, model_dir="./myModel", subset_fraction=1.0):
#     print(f"Loading data from {csv_path}")
#     summaries, skill_lists = load_data_from_csv(csv_path)
#     print(f"Loaded {len(summaries)} samples")

#     # Print a few examples to understand the data
#     print("\nExample job summaries:")
#     for i in range(min(3, len(summaries))):
#         print(f"\nExample {i+1}: {summaries[i][:200]}...")
#         print(f"Skills: {skill_lists[i]}")

#     # USE ONLY A FRACTION OF THE DATA
#     if 0 < subset_fraction < 1.0:
#         total_size = len(summaries)
#         subset_size = int(total_size * subset_fraction)
#         random.seed(42)  # For reproducibility
#         subset_indices = random.sample(range(total_size), subset_size)

#         summaries = [summaries[i] for i in subset_indices]
#         skill_lists = [skill_lists[i] for i in subset_indices]
#         print(f"Using a subset of the data: {subset_size} out of {total_size} entries")

#     # Encode labels
#     mlb = MultiLabelBinarizer()
#     y = mlb.fit_transform(skill_lists)
#     joblib.dump(mlb, os.path.join(model_dir, "mlb.joblib"))

#     print(f"Total unique skills: {len(mlb.classes_)}")
#     print(f"Sample skills: {mlb.classes_[:10]}")

#     # Count label distribution
#     label_counts = np.sum(y, axis=0)
#     top_skills = [(mlb.classes_[i], count) for i, count in enumerate(label_counts)]
#     top_skills.sort(key=lambda x: x[1], reverse=True)

#     print("\nTop 10 most common skills:")
#     for skill, count in top_skills[:10]:
#         print(f"  {skill}: {count}")

#     print("\nBottom 10 least common skills:")
#     for skill, count in top_skills[-10:]:
#         print(f"  {skill}: {count}")

#     # Initialize model and tokenizer
#     tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
#     model = DistilBertForSequenceClassification.from_pretrained(
#         'distilbert-base-uncased',
#         num_labels=len(mlb.classes_),
#         problem_type="multi_label_classification"  # Explicitly set multi-label
#     )

#     # Check class imbalance and set appropriate loss weights
#     num_samples = len(y)
#     pos_weights = torch.tensor(
#         [num_samples / max(1, np.sum(y[:, i])) for i in range(y.shape[1])],
#         dtype=torch.float
#     )
#     print(f"Positive class weights range: {torch.min(pos_weights).item():.2f} - {torch.max(pos_weights).item():.2f}")

#     # Create dataset
#     dataset = JobSkillDataset(summaries, y, tokenizer)
#     print(f"Dataset created with {len(dataset)} samples")

#     # Calculate average number of skills per document
#     avg_skills_per_doc = np.mean(np.sum(y, axis=1))
#     print(f"Average skills per document: {avg_skills_per_doc:.2f}")

#     # Train model and get test data
#     model, test_loader = train_model(
#         dataset,
#         model,
#         tokenizer,
#         mlb,
#         model_dir,
#         epochs=15,  # Increase epochs for better learning
#         batch_size=16,  # Smaller batch size for better convergence
#         learning_rate=3e-5,  # Adjusted learning rate
#         pos_class_weight=25.0  # High positive class weight to encourage predictions
#     )

#     # Evaluate on test data
#     evaluate_model(model, test_loader, mlb)

#     # Check model on a few examples to confirm it's making predictions
#     test_examples = [
#         "We are looking for a software engineer with experience in Python, JavaScript and cloud platforms.",
#         "Data analyst position requiring SQL, Excel, and strong communication skills.",
#         "Project manager needed with agile methodologies experience and team leadership skills."
#     ]

#     check_predictions(model, tokenizer, mlb, test_examples, threshold=0.05)

# # Execute script
# if __name__ == "__main__":
#     csv_file = "tech_industry_top500skills.csv"
#     model_dir = f"./myModel_{os.path.basename(csv_file).split('.')[0]}"
#     main(csv_file, model_dir, subset_fraction=0.2)

Loading data from tech_industry_top500skills.csv
Loaded 108509 samples

Example job summaries:

Example 1: Location: Melbourne CBD Job Type: Permanent Posted: 30 days ago Contact: Karina Park Discipline General IT Reference: 259632 Main Activities Designing, coding, testing and installing applications prog...
Skills: ['troubleshooting', 'documentation', 'software development', 'testing', 'quality assurance', 'training', 'problemsolving', 'administration', 'evaluation']

Example 2: Our Water team takes a holistic and sustainable approach to managing the use of water resources. We support governments, investors, developers, manufacturers, utilities, and corporations by mitigating...
Skills: ['project management', 'risk management', 'mentoring', 'business development', 'networking', 'innovation', 'analytical skills', 'collaboration', 'communication', 'problem solving', 'critical thinking', 'strategic thinking', 'decision making', 'leadership', 'negotiation', 'team work', 'safety']

Exampl

FileNotFoundError: [Errno 2] No such file or directory: './myModel_tech_industry_top500skills/mlb.joblib'

In [None]:
# import csv
# import random
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import MultiLabelBinarizer
# from sklearn.metrics import precision_score, recall_score, jaccard_score, accuracy_score
# from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# from transformers import Trainer, TrainingArguments
# import torch
# from torch.utils.data import Dataset, DataLoader
# from tqdm import tqdm

# # Configuration
# SUBSET_FRACTION = .2  # Set between 0.1-1.0 to use fraction of data
# MODEL_NAME = "distilbert-base-uncased"
# BATCH_SIZE = 32  # Optimal for 211k samples on most GPUs
# NUM_EPOCHS = 3
# MAX_LENGTH = 256  # Sequence length for BERT
# RANDOM_SEED = 42

# # Set random seeds for reproducibility
# random.seed(RANDOM_SEED)
# np.random.seed(RANDOM_SEED)
# torch.manual_seed(RANDOM_SEED)
# if torch.cuda.is_available():
#     torch.cuda.manual_seed_all(RANDOM_SEED)

# # 1. Data Loading and Preprocessing
# def load_and_preprocess_data(filename):
#     """Load data from CSV and apply subset sampling if needed"""
#     texts = []
#     skill_lists = []

#     with open(filename, 'r', encoding='utf-8') as f:
#         reader = csv.DictReader(f)
#         for row in reader:
#             texts.append(row['job_summary'])
#             skill_lists.append([s.strip() for s in row['filtered_skills'].split(',')])

#     # Apply subset selection if needed
#     if 0 < SUBSET_FRACTION < 1.0:
#         total_size = len(texts)
#         subset_size = int(total_size * SUBSET_FRACTION)
#         indices = random.sample(range(total_size), subset_size)
#         texts = [texts[i] for i in indices]
#         skill_lists = [skill_lists[i] for i in indices]
#         print(f"Using subset: {subset_size}/{total_size} samples")

#     return texts, skill_lists

# # Load data
# texts, skill_lists = load_and_preprocess_data("tech_industry_top500skills.csv")

# # 2. Multilabel Encoding
# mlb = MultiLabelBinarizer()
# skill_labels = mlb.fit_transform(skill_lists)
# num_classes = len(mlb.classes_)
# print(f"Total skill categories: {num_classes}")

# # 3. Train/Val/Test Split (70/15/15)
# train_texts, temp_texts, train_labels, temp_labels = train_test_split(
#     texts, skill_labels, test_size=0.3, random_state=RANDOM_SEED
# )
# val_texts, test_texts, val_labels, test_labels = train_test_split(
#     temp_texts, temp_labels, test_size=0.5, random_state=RANDOM_SEED
# )

# # 4. PyTorch Dataset Class
# class JobSkillDataset(Dataset):
#     def __init__(self, texts, labels, tokenizer, max_length):
#         self.texts = texts
#         self.labels = labels
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.texts)

#     def __getitem__(self, idx):
#         text = str(self.texts[idx])
#         encoding = self.tokenizer(
#             text,
#             max_length=self.max_length,
#             padding="max_length",
#             truncation=True,
#             return_tensors="pt"
#         )

#         return {
#             'input_ids': encoding['input_ids'].flatten(),
#             'attention_mask': encoding['attention_mask'].flatten(),
#             'labels': torch.FloatTensor(self.labels[idx])
#         }

# # 5. Initialize Tokenizer and Model
# tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
# model = DistilBertForSequenceClassification.from_pretrained(
#     MODEL_NAME,
#     num_labels=num_classes,
#     problem_type="multi_label_classification"
# )

# # 6. Create DataLoaders
# train_dataset = JobSkillDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
# val_dataset = JobSkillDataset(val_texts, val_labels, tokenizer, MAX_LENGTH)
# test_dataset = JobSkillDataset(test_texts, test_labels, tokenizer, MAX_LENGTH)

# train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
# test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# # 7. Training Setup
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# # 8. Training Loop
# for epoch in range(NUM_EPOCHS):
#     model.train()
#     total_loss = 0

#     for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
#         optimizer.zero_grad()

#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)

#         outputs = model(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#             labels=labels
#         )

#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()

#     print(f"Epoch {epoch + 1} Loss: {total_loss / len(train_loader):.4f}")

# # 9. Evaluation Function
# def evaluate(model, dataloader, threshold=0.5):
#     model.eval()
#     predictions = []
#     true_labels = []

#     with torch.no_grad():
#         for batch in dataloader:
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             labels = batch['labels'].to(device)

#             outputs = model(
#                 input_ids=input_ids,
#                 attention_mask=attention_mask
#             )

#             logits = outputs.logits
#             preds = (torch.sigmoid(logits) > threshold)
#             predictions.extend(preds.cpu().numpy())
#             true_labels.extend(labels.cpu().numpy())

#     return {
#         'precision': precision_score(true_labels, predictions, average='samples'),
#         'recall': recall_score(true_labels, predictions, average='samples'),
#         'jaccard': jaccard_score(true_labels, predictions, average='samples'),
#         'exact_match': accuracy_score(true_labels, predictions)
#     }

# # 10. Run Evaluation
# val_metrics = evaluate(model, val_loader)
# test_metrics = evaluate(model, test_loader)

# print("\nValidation Metrics:")
# for metric, value in val_metrics.items():
#     print(f"{metric:>12}: {value:.4f}")

# print("\nTest Metrics:")
# for metric, value in test_metrics.items():
#     print(f"{metric:>12}: {value:.4f}")

# # 11. Save Model and Label Encoder
# torch.save(model.state_dict(), "distilbert_job_skills.pth")
# import pickle
# with open('label_encoder.pkl', 'wb') as f:
#     pickle.dump(mlb, f)

Using subset: 24119/120595 samples
Total skill categories: 500


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 528/528 [09:35<00:00,  1.09s/it]


Epoch 1 Loss: 0.1585


Epoch 2: 100%|██████████| 528/528 [09:39<00:00,  1.10s/it]


Epoch 2 Loss: 0.0663


Epoch 3: 100%|██████████| 528/528 [09:37<00:00,  1.09s/it]


Epoch 3 Loss: 0.0628


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Validation Metrics:
   precision: 0.1473
      recall: 0.0231
     jaccard: 0.0230
 exact_match: 0.0019

Test Metrics:
   precision: 0.1414
      recall: 0.0212
     jaccard: 0.0211
 exact_match: 0.0008
