In [1]:
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
import pandas as pd
from datasets import Dataset,load_from_disk
from peft import LoraConfig, get_peft_model
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from accelerate import Accelerator
# !pip install tensorboard

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_excel("./all_jobs - Copy.xlsx") 
df = df[['title', 'company', 'min_amount', 'max_amount','mean_salary', 'currency', 'is_remote', 'cleaned_description']]
df = df.head(5000)
df.fillna("", inplace=True)
# Clean and convert salary columns before feeding into Dataset
df["min_amount"] = pd.to_numeric(df["min_amount"], errors="coerce")
df["max_amount"] = pd.to_numeric(df["max_amount"], errors="coerce")
df["mean_salary"] = pd.to_numeric(df["mean_salary"], errors="coerce")
df["is_remote"] = pd.to_numeric(df["is_remote"], errors="coerce")


# Optionally fill NaNs with a default value (e.g., 0 or None)
df["min_amount"].fillna(0, inplace=True)
df["max_amount"].fillna(0, inplace=True)
df["mean_salary"].fillna(0, inplace=True)
df["is_remote"].fillna(0, inplace=True)


  df.fillna("", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["min_amount"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["max_amount"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

In [3]:
df.head(10)[["company", "min_amount",'cleaned_description']]

Unnamed: 0,company,min_amount,cleaned_description
0,RISCPoint,105000.0,riscpoint is seeking a conceptual thinker with...
1,MarketAxess Holdings,150000.0,company description marketaxess is on a journe...
2,Global Resource Solutions,84617.0,"global resource solutions, inc. (grs) is seeki..."
3,"Cole Engineering Services, Inc",105000.0,**company overview:** by light professional it...
4,MUFG,83000.0,**do you want your voice heard and your action...
5,Zion Zest LLC,117239.0,**overview** we are seeking a knowledgeable an...
6,Ngrecruitments,0.0,**role: identity and access management sailpoi...
7,MobileIT LLC,68430.0,**job summary** we are seeking a skilled and m...
8,VikingCloud,126318.0,**senior security consultan****t** **location:...
9,"Innovation Associates, Inc.",115023.0,**the role:** as a sr. cloud architect you wil...


In [4]:
def generate_qa_pairs(row):
    
    base = (
        f"Title: {row['title']}\n"
        f"Company: {row['company']}\n"
        f"Salary: {row['min_amount']}-{row['max_amount']} {row['currency']}\n"
        f"Remote: {'Yes' if row['is_remote'] else 'No'}\n"
        f"description: {row['cleaned_description']}\n"
    )
    return {
        "question": f"Tell me about {row['title']} position at {row['company']}",
        "answer": base
    }


In [5]:
single_record_df = df.head(500)
dataset  =Dataset.from_pandas(single_record_df).map(generate_qa_pairs)
dataset.save_to_disk("jobs_qa_dataset")
# dataset

Map: 100%|██████████| 500/500 [00:00<00:00, 8581.10 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 500/500 [00:00<00:00, 49019.49 examples/s]


In [6]:
def tokenize_function(examples):
    texts = [f"Question: {q}\nAnswer: {a}" for q, a in zip(examples["question"], examples["answer"])]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=256)

In [7]:
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    torch_dtype=torch.float16,
    device_map="auto",
    llm_int8_enable_fp32_cpu_offload=True ,
    load_in_4bit=True  # Critical for 100K samples
)

peft_config = LoraConfig(
    r=16,  # Higher rank for complex job data
    lora_alpha=32,
    target_modules=["Wqkv", "fc1", "fc2"],
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.37s/it]


In [8]:
dataset = load_from_disk("./jobs_qa_dataset/").map(tokenize_function, batched=True)

Map: 100%|██████████| 500/500 [00:00<00:00, 2116.68 examples/s]


In [9]:
print(dataset[:1])

{'title': ['Cybersecurity Operations Senior Consultant'], 'company': ['RISCPoint'], 'min_amount': [105000.0], 'max_amount': [135000.0], 'mean_salary': [120000.0], 'currency': ['USD'], 'is_remote': [1.0], 'cleaned_description': ['riscpoint is seeking a conceptual thinker with the ability to understand and lead client cybersecurity programs and objectives at an elevated technical level. ### **description** **about the company** riscpoint is a partner-owned, rapidly growing and leading cybersecurity and compliance consultancy firm. we are a tight-knit team of experienced professionals that focus on integrating seamlessly with our clients to harmonize security and compliance obligations with business success. we are looking for talented professionals, passionate about the industry, to join our team and make a meaningful impact in the industry. we value continuous improvement, personal growth, learning and mentoring. we believe that when we are at our best, we provide the most value to our 

In [None]:
accelerator = Accelerator()
training_args = TrainingArguments(
    output_dir="./phi2-jobs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    learning_rate=1e-4,
    fp16=True,
    save_total_limit=3,
    # save_steps=500, 
    logging_steps=100,
    report_to="tensorboard",
    optim="adamw_bnb_8bit",
    # load_best_model_at_end=True,  # Automatically load the best model
    # metric_for_best_model="eval_loss",  # Metric to track for the best model
    # greater_is_better=False  # For loss, we want to minimize it# 8-bit optimizer
)

trainer = accelerator.prepare(
    Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
    )
) 



No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [11]:
trainer.train()



Step,Training Loss


TrainOutput(global_step=62, training_loss=2.4327599310105845, metrics={'train_runtime': 1865.3993, 'train_samples_per_second': 0.536, 'train_steps_per_second': 0.033, 'total_flos': 4006632436531200.0, 'train_loss': 2.4327599310105845, 'epoch': 1.96})