## Transformer Model

Load and clean the data

In [1]:
from data_transformation import load_data_single_df
from transformers import AutoTokenizer
import sys
sys.path.append('../preprocessing')
from datasets import Dataset
import pandas as pd
import torch

print(torch.__version__)
print(torch.cuda.is_available()) 

TEST_SIZE = 0.2

df = load_data_single_df()
df['combined_text'] = df['title'] + " " + df['description']

# Rename column 'normalized_salary' to 'labels'
df.rename(columns={'normalized_salary': 'labels'}, inplace=True)

# Tokenize job descriptions
def tokenize(text):
    return tokenizer(
        text, 
        padding="max_length", 
        truncation=True, 
        max_length=512, 
        return_tensors="pt",
        return_attention_mask=True,
    )

# company_name_tokens = X['company_name'].apply(lambda x: tokenize(x))
# title_tokens = X['title'].apply(lambda x: tokenize(x))

dataset = Dataset.from_pandas(df[["combined_text", "labels"]])
dataset = dataset.train_test_split(test_size=TEST_SIZE, shuffle=True, seed=42)



# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenized_dataset = dataset.map(
    lambda x: tokenizer(x["combined_text"], padding="max_length", truncation=True, max_length=512),
    batched=True
)
# tokenized_dataset['labels'] = tokenized_dataset['labels']
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

  from .autonotebook import tqdm as notebook_tqdm


2.2.0+cpu-cxx11-abi
False


Map: 100%|██████████| 28281/28281 [00:24<00:00, 1156.85 examples/s]
Map: 100%|██████████| 7071/7071 [00:05<00:00, 1186.43 examples/s]


In [2]:
from transformers import TrainingArguments
from sklearn.metrics import mean_squared_error, r2_score
from transformers import Trainer
from transformers import AutoModelForSequenceClassification

# Model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

# Trainer
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # Accumulate gradients
    num_train_epochs=1,
    weight_decay=0.01,
)

# print(tokenized_dataset["train"][0])

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=lambda x: {
        "mse": mean_squared_error(x.label_ids, x.predictions),
        "r2": r2_score(x.label_ids, x.predictions)
    },
)

# 5. Train and evaluate
trainer.train()
trainer.evaluate()

2025-05-11 16:09:00.712321: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746972540.790392   58639 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746972540.813928   58639 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746972540.985205   58639 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746972540.985226   58639 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746972540.985228   58639 computation_placer.cc:177] computation placer alr

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Create PyTorch Dataset
class JobDataset(Dataset):
    def __init__(self, encodings, salaries):
        self.encodings = encodings
        self.salaries = salaries

    def __len__(self):
        return len(self.salaries)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Split data
X_train, X_val, y_train, y_val = train_test_split(X["input_ids"], y, test_size=TEST_SIZE, random_state=42)
# train_loader = DataLoader(JobDataset(X_train, y_train), batch_size=16)

# Create DataLoaders
train_dataset = JobDataset(X_train.tolist(), y_train.tolist())
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

val_dataset = JobDataset(X_val.tolist(), y_val.tolist())
val_loader = DataLoader(val_dataset, batch_size=16)

# # Train
# model.train()
# for epoch in range(3):
#     for batch in train_loader:
#         optimizer.zero_grad()
#         outputs = model(**batch)
#         loss = loss_fn(outputs.logits, batch["labels"])
#         loss.backward()
#         optimizer.step()

In [15]:
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW

# Load pre-trained BERT with a regression head
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=1  # Single output neuron for regression
)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss and optimizer
loss_fn = torch.nn.MSELoss()  # Mean Squared Error for regression
optimizer = AdamW(model.parameters(), lr=5e-5)  # Transformer-friendly optimizer


# Training
model.train()
for epoch in range(3):  # 3 epochs
    total_loss = 0
    for batch in train_loader:
        # Move batch to GPU
        input_ids = batch["input_ids"].squeeze(1).to(device)  # Shape: [batch_size, seq_len]
        labels = batch["labels"].to(device)
        attention_mask = batch["attention_mask"].to(device),

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


IndexError: index 19855 is out of bounds for dimension 0 with size 1