In [8]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Load the datasets
train_df = pd.read_csv('train_modified.csv')
val_df = pd.read_csv('val_modified.csv')
test_df = pd.read_csv('test_modified.csv')

# Replace NaN values in the text column with an empty string
train_df['text'].fillna('', inplace=True)
val_df['text'].fillna('', inplace=True)
test_df['text'].fillna('', inplace=True)

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit the encoder on the job categories and transform them to numerical values
train_df['job_category'] = label_encoder.fit_transform(train_df['job_category'])
val_df['job_category'] = label_encoder.transform(val_df['job_category'])
test_df['job_category'] = label_encoder.transform(test_df['job_category'])

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_data(df, tokenizer, max_len=512):
    input_ids = []
    attention_masks = []

    for text in df['text']:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0), torch.tensor(df['job_category'].values, dtype=torch.long)

train_inputs, train_masks, train_labels = preprocess_data(train_df, tokenizer, max_len=256)
val_inputs, val_masks, val_labels = preprocess_data(val_df, tokenizer, max_len=256)
test_inputs, test_masks, test_labels = preprocess_data(test_df, tokenizer, max_len=256)

# Create DataLoader
batch_size = 4

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Load BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Use GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Create the learning rate scheduler
epochs = 3
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Number of steps to accumulate gradients
accumulation_steps = 4

# Training loop
for epoch in range(epochs):
    model.train()
    
    total_loss = 0
    optimizer.zero_grad()
    
    for step, batch in enumerate(train_dataloader):
        batch_inputs, batch_masks, batch_labels = tuple(t.to(device) for t in batch)
        
        outputs = model(batch_inputs, attention_mask=batch_masks, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        # Normalize loss to account for batch accumulation
        loss = loss / accumulation_steps
        loss.backward()
        
        if (step + 1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
    
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1}/{epochs}, Training loss: {avg_train_loss}')


# Evaluation function
def evaluate(model, dataloader):
    model.eval()
    predictions, true_labels = [], []
    
    for batch in dataloader:
        batch_inputs, batch_masks, batch_labels = tuple(t.to(device) for t in batch)
        
        with torch.no_grad():
            outputs = model(batch_inputs, attention_mask=batch_masks)
        
        logits = outputs.logits
        predictions.append(logits.argmax(dim=1).cpu().numpy())
        true_labels.append(batch_labels.cpu().numpy())
    
    predictions = np.concatenate(predictions)
    true_labels = np.concatenate(true_labels)
    
    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions)
    
    return accuracy, report

# Evaluate on validation set
val_accuracy, val_report = evaluate(model, val_dataloader)
print(f'Validation Accuracy: {val_accuracy}')
print(f'Validation Classification Report:\n{val_report}')

# Evaluate on test set
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

test_accuracy, test_report = evaluate(model, test_dataloader)
print(f'Test Accuracy: {test_accuracy}')
print(f'Test Classification Report:\n{test_report}')

# Save the model and tokenizer
output_dir = './ResumeClassifier_model/'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Save the label encoder classes
np.save(output_dir + 'label_classes.npy', label_encoder.classes_)



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Epoch 1/3, Training loss: 2.9495693804203778
Epoch 2/3, Training loss: 1.8361750937056267
Epoch 3/3, Training loss: 1.0921096778464043


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.7882037533512064
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        12
           1       0.95      0.90      0.92        20
           2       1.00      0.17      0.29        12
           3       0.26      0.55      0.35        11
           4       0.90      0.32      0.47        28
           5       0.00      0.00      0.00        11
           6       0.86      0.90      0.88        21
           7       0.55      0.50      0.52        12
           8       0.00      0.00      0.00         2
           9       0.94      1.00      0.97        17
          10       0.93      0.88      0.90        16
          11       0.95      1.00      0.98        20
          12       0.64      1.00      0.78         9
          13       0.96      0.93      0.94        27
          14       0.79      0.73      0.76        15
          15       0.95      1.00      0.97        19
       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Test Accuracy: 0.7828418230563002
Test Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86        15
           1       0.92      0.61      0.73        18
           2       1.00      0.33      0.50        12
           3       0.47      0.47      0.47        17
           4       1.00      0.18      0.31        11
           5       0.00      0.00      0.00         2
           6       0.69      0.69      0.69        13
           7       0.67      0.57      0.62        14
           8       0.00      0.00      0.00         5
           9       0.86      0.95      0.90        20
          10       0.92      1.00      0.96        12
          11       0.95      0.78      0.86        23
          12       0.74      0.96      0.84        24
          13       0.91      1.00      0.95        10
          14       0.80      0.71      0.75        17
          15       0.81      1.00      0.89        17
          16       