# In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer

# Load job posting data
data_path = "../data/jobs_data.csv"
# quoting=3 is csv.QUOTE_NONE: quote characters are read as ordinary text.
# on_bad_lines="warn" replaces error_bad_lines=False, which was deprecated in
# pandas 1.3 and removed in 2.0. Malformed rows are still dropped with a
# warning (the old default, since warn_bad_lines defaulted to True).
df = pd.read_csv(data_path, quoting=3, on_bad_lines="warn")

# Initialize tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Define function to tokenize and encode job postings
def tokenize_postings(texts, max_length=512):
    """Tokenize and encode job postings one by one with the module-level tokenizer.

    Args:
        texts: Iterable of job posting strings.
        max_length: Length every encoded sequence is padded or truncated to.
            Defaults to 512, the value this function previously hard-coded.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of two lists. For each
        posting, in input order, they hold the tokenizer's ``input_ids`` and
        ``attention_mask`` outputs, requested as PyTorch tensors via
        ``return_tensors="pt"``.
    """
    input_ids = []
    attention_masks = []

    for text in texts:
        # `padding="max_length"` replaces the deprecated `pad_to_max_length`
        # flag. Truncation is requested explicitly because the library's
        # implicit "max_length alone truncates" fallback is only applied when
        # no padding strategy is given.
        encoded = tokenizer(
            text,                          # Job posting text to encode
            add_special_tokens=True,       # RoBERTa wraps the text in '<s>' ... '</s>'
            max_length=max_length,         # Pad or truncate to this length
            padding="max_length",
            truncation=True,
            return_attention_mask=True,    # Construct attention masks
            return_tensors="pt",           # Return PyTorch tensors
        )

        input_ids.append(encoded["input_ids"])
        attention_masks.append(encoded["attention_mask"])

    return input_ids, attention_masks

# Destination file for the encoded postings
output_path = "../data/tokenized_data.pt"

# Encode every entry of the 'text' column, then store both result lists
# together as a single (input_ids, attention_masks) tuple
input_ids, attention_masks = tokenize_postings(texts=df['text'].tolist())
torch.save(obj=(input_ids, attention_masks), f=output_path)
