# Generation of job descriptions with tranformer models

### Checking for GPU availability to run the computations


In [1]:
from datasets import Dataset, DatasetDict
import os
import pandas as pd
import sys
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm
2025-05-05 20:22:05.983578: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-05 20:22:06.181716: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746469326.251890    2996 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746469326.271698    2996 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746469326.434199    2996 computation_placer.cc:177] computation placer already r

### Setting root path to project path

In [2]:
project_root = os.path.abspath(
    os.path.join(os.getcwd(), '../../..')
)
if project_root not in sys.path:
    sys.path.append(project_root)

os.environ["TOKENIZERS_PARALLELISM"] = "true"

### Imports

In [3]:
import torch
print(f"GPU available: {torch.cuda.is_available()}")
print(f"Device count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")

GPU available: True
Device count: 1
Current device: 0
Device name: NVIDIA GeForce RTX 3060 Ti


### Model selection

In [4]:
model_name = "gpt2"  # Or choose another model like "gpt2", "t5-small", etc.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set pad token if it doesn't exist (common for GPT-2 models)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

print(f"Loaded model: {model_name}")
print(f"Tokenizer pad token: {tokenizer.pad_token}")

Loaded model: gpt2
Tokenizer pad token: <|endoftext|>


### Data preparation

In [5]:
# --- Configuration ---
parquet_file_path = os.path.join(project_root, "data", "processed", "cleaned_postings_modeling.parquet")
text_column = "description" # Your column name
block_size = 512 # Max sequence length for the model
test_size = 0.1 
random_seed = 42

# --- Load Dataframe from Parquet ---
try:
    df = pd.read_parquet(parquet_file_path)
    print(f"Loaded DataFrame from {parquet_file_path}. Shape: {df.shape}")
    print(df.head()) # Optional
    print(df.info()) # Optional
    # filter out any rows with a description shorter than 100 characters
    df = df[df[text_column].str.len() > 100]
    print(f"Filtered DataFrame. New shape: {df.shape}")
    print (df.info())
    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found in the DataFrame.")
except FileNotFoundError:
    print(f"Error: Parquet file not found at {parquet_file_path}")
    df = None
except Exception as e:
    print(f"Error loading Parquet file: {e}")
    df = None

# --- Convert Pandas DataFrame to Hugging Face Dataset ---
if df is not None:
    # Convert the DataFrame to a single Dataset object first
    full_dataset = Dataset.from_pandas(df)
    print(f"\nConverted DataFrame to Dataset. Size: {len(full_dataset)}")

    # --- Split the Dataset ---
    # Use train_test_split on the Dataset object
    split_datasets = full_dataset.train_test_split(test_size=test_size, seed=random_seed)

    # Rename the default 'test' split to 'validation' if preferred for Trainer, or keep as 'test'
    # Trainer uses 'eval_dataset', so 'validation' or 'test' are common keys. Let's use 'test'.
    # split_datasets['validation'] = split_datasets.pop('test') # Optional rename

    print("\nSplit dataset into training and testing sets:")
    print(split_datasets)

    # Assign to raw_datasets (which is now a DatasetDict with 'train' and 'test')
    raw_datasets = split_datasets

else:
    print("\nSkipping dataset conversion and splitting due to loading error.")
    raw_datasets = None

# --- Tokenization Function (remains the same) ---
def tokenize_function(examples):
    # Tokenize the text
    tokenized_output = tokenizer(examples[text_column], truncation=True, padding="max_length", max_length=block_size)
    # For Causal LM, labels are usually the same as inputs
    tokenized_output["labels"] = tokenized_output["input_ids"].copy()
    return tokenized_output

# --- Apply Tokenization ---
if raw_datasets:
    # Apply tokenization to both splits ('train' and 'test')
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=raw_datasets["train"].column_names # Remove original columns from both splits
    )
    # Set format for PyTorch
    tokenized_datasets.set_format("torch")

    print("\nTokenized dataset structure:")
    print(tokenized_datasets)

    print("\nTokenized training dataset sample:")
    if len(tokenized_datasets["train"]) > 0:
         print(tokenized_datasets["train"][0])
    else:
        print("Tokenized training dataset is empty.")

    # Assign the splits
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["test"] # Use the 'test' split for evaluation

else:
    print("\nSkipping tokenization due to dataset loading/conversion error.")
    train_dataset = None
    eval_dataset = None

Loaded DataFrame from /home/gabriel/dev/SCIA/NLP_Linkedin_offers/data/processed/cleaned_postings_modeling.parquet. Shape: (122124, 4)
                company_name  \
0      Corcoran Sawyer Smith   
1     The National Exemplar    
2     Abrams Fensterman, LLP   
3  Downtown Raleigh Alliance   
4                 Raw Cereal   

                                               title  \
0                              Marketing Coordinator   
1                        Assitant Restaurant Manager   
2  Senior Elder Law / Trusts and Estates Associat...   
3           Economic Development and Planning Intern   
4                                           Producer   

                                         description           location  
0  Job description A leading real estate firm in ...      Princeton, NJ  
1  The National Exemplar is accepting application...     Cincinnati, OH  
2  Senior Associate Attorney Elder Law Trusts and...  New Hyde Park, NY  
3  Job summary The Economic Development 

Map: 100%|██████████| 109849/109849 [00:33<00:00, 3310.02 examples/s]
Map: 100%|██████████| 12206/12206 [00:03<00:00, 3536.18 examples/s]


Tokenized dataset structure:
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 109849
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 12206
    })
})

Tokenized training dataset sample:
{'input_ids': tensor([16844, 10027,  7913,  1596,  2931,  4761, 15768, 12489,   327,   600,
          292,   318,  6095,   281,  2445,  2142,  3862,   284,  8277,   287,
          257,  1430,  3562,   284,  2148,  9815,  3725,  1973,   477,  3006,
          286,   674,  1597,    11,   355,   880,   355,   262,  4708,  2594,
         2831,    13,   383,  3061,   318,   284, 15651,   262,  2445,   284,
          674,  1180, 13346,    13,   383,  2445,  1998, 21001,  2832,   319,
         3047,   351,   262,  3663,   284,  9427,   351,  3294, 13346,   287,
         1502,   284,  1205,  7387,  5531,    11,  4009,    11,  6946,   290,
          640,  4542,  4678,    13, 20389,  9537,  66




### Setup for the training of the model

In [9]:
output_directory = os.path.join(project_root, "models", "src", "generation")
use_fp16 = torch.cuda.is_available() # Enable FP16 only if GPU is available

training_args = TrainingArguments(
    torch_compile=True,
    torch_compile_backend="inductor",
    torch_compile_mode="default", 
    torch_empty_cache_steps=4,
    output_dir=output_directory,
    num_train_epochs=1,  # Start with 1 epoch for testing
    per_device_train_batch_size=2,  # Adjust based on GPU memory
    per_device_eval_batch_size=8, # Batch size for evaluation (can often be larger)
    gradient_accumulation_steps=4, # Increase effective batch size
    learning_rate=5e-5,
    fp16=use_fp16, # Enable mixed precision training if GPU available
    logging_dir=f"{output_directory}/logs",
    logging_strategy="steps", # Log metrics periodically
    logging_steps=100,        # Log every 100 steps
    eval_strategy="steps", # Evaluate periodically
    eval_steps=500,              # Evaluate every 500 steps
    save_strategy="steps",       # Save checkpoints periodically
    save_steps=500,              # Save every 500 steps
    load_best_model_at_end=True, # Load the best model found during evaluation at the end
    metric_for_best_model="loss", # Use evaluation loss to determine the best model (lower is better)
    greater_is_better=False,     # Lower loss is better
    save_total_limit=2,          # Keep only the last 2 checkpoints + the best one
    report_to="none",          # Disable external reporting (like wandb) for now
    weight_decay=0.01,         # Regularization
    dataloader_pin_memory=True, # Pin memory for faster data transfer to GPU
    dataloader_num_workers=12, # Number of workers for data loading, leverage multiple CPU cores for faster data loading to GPU
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if train_dataset else None,
    eval_dataset=eval_dataset if eval_dataset else None, # Pass the evaluation dataset
    tokenizer=tokenizer,
    # data_collator=default_data_collator # Usually not needed for CausalLM unless custom padding
)

print("TrainingArguments and Trainer initialized.")
if eval_dataset:
    print(f"Evaluation dataset size: {len(eval_dataset)}")
print(f"FP16 enabled: {use_fp16}")
print(f"Evaluation strategy: {training_args.eval_strategy}")

TrainingArguments and Trainer initialized.
Evaluation dataset size: 12206
FP16 enabled: True
Evaluation strategy: IntervalStrategy.STEPS


  trainer = Trainer(


### Fine-tuning the model


In [10]:
if train_dataset:
    print("Starting training...")
    try:
        train_result = trainer.train()
        print("Training finished.")
        # You can print some metrics from train_result if needed
        metrics = train_result.metrics
        print(f"Train Output Metrics: {metrics}")
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
    except Exception as e:
        print(f"An error occurred during training: {e}")
else:
    print("Skipping training because the dataset was not loaded properly.")


Starting training...


Step,Training Loss,Validation Loss
500,2.8116,2.785043
1000,2.7672,2.700827
1500,2.7736,2.64004
2000,2.6958,2.593958
2500,2.7241,2.553697
3000,2.6283,2.523878
3500,2.6367,2.500995
4000,2.5631,2.471948
4500,2.636,2.451429
5000,2.5416,2.433481


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Training finished.
Train Output Metrics: {'train_runtime': 8626.2415, 'train_samples_per_second': 12.734, 'train_steps_per_second': 1.592, 'total_flos': 2.8702407131136e+16, 'train_loss': 2.5315315686087607, 'epoch': 0.9999817933545744}
***** train metrics *****
  epoch                    =        1.0
  total_flos               = 26731199GF
  train_loss               =     2.5315
  train_runtime            = 2:23:46.24
  train_samples_per_second =     12.734
  train_steps_per_second   =      1.592


### Save the fine-tuned model

In [11]:
final_model_path = f"{output_directory}/final"

if train_dataset: # Only save if training actually happened
    print(f"Saving final model and tokenizer to {final_model_path}...")
    try:
        trainer.save_model(final_model_path)
        tokenizer.save_pretrained(final_model_path)
        print("Model and tokenizer saved successfully.")
    except Exception as e:
        print(f"Error saving model/tokenizer: {e}")
else:
    print("Skipping final model saving as training did not run.")

Saving final model and tokenizer to /home/gabriel/dev/SCIA/NLP_Linkedin_offers/models/src/generation/final...
Model and tokenizer saved successfully.
