In [None]:
import json
import pandas as pd
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)

from transformers import AutoModel, AutoTokenizer
import evaluate
import glob
import numpy as np

In [None]:
def read_tagged_job_descriptions(data_path):
    """Load annotated job descriptions from a JSON Lines file.

    Args:
        data_path: Path to a file holding one JSON value per line.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default locale encoding.
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. an extra newline at the end of the
            # file); json.loads would raise on them.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records

In [None]:
def make_label_dicts():
    """Return the ``(id2label, label2id)`` pair for the three span classes.

    Returns:
        tuple: ``id2label`` maps the integer ids 0, 1, 2 to the label names
        "ROLE", "COMPA", "COMPENBEN"; ``label2id`` is the inverse mapping.
    """
    # The position of a name in this tuple is its class id.
    label_names = ("ROLE", "COMPA", "COMPENBEN")
    id2label = dict(enumerate(label_names))
    label2id = {name: idx for idx, name in id2label.items()}
    return id2label, label2id

In [None]:
def make_data_for_hf(data, seed=42):
    """Turn span-annotated job descriptions into train/validation/test splits.

    Every span of every job description becomes one example whose ``text`` is
    the substring ``jd["text"][start:end]`` and whose ``label`` is the span's
    label string.

    Args:
        data: Iterable of dicts, each with a ``"text"`` string and a
            ``"spans"`` list of dicts carrying ``"start"``, ``"end"`` and
            ``"label"``.
        seed: Seed handed to both shuffled splits so the partition is the
            same on every run. Pass ``None`` for an unseeded split.

    Returns:
        DatasetDict: ``train``, ``validation`` and ``test`` splits holding
        roughly 60%, 20% and 20% of the examples.
    """
    # Build the column-oriented dict directly instead of going through an
    # intermediate list of per-example dicts.
    columns = {"text": [], "label": []}
    for jd in data:
        for span in jd["spans"]:
            columns["text"].append(jd["text"][span["start"]:span["end"]])
            columns["label"].append(span["label"])
    dataset = Dataset.from_dict(columns)

    # Hold out 20% for test, then take 25% of the remaining 80% (i.e. 20% of
    # the total) for validation.
    train_test_split = dataset.train_test_split(test_size=0.2, seed=seed)
    train_val_split = train_test_split['train'].train_test_split(
        test_size=0.25, seed=seed
    )
    return DatasetDict({
        'train': train_val_split['train'],
        'validation': train_val_split['test'],
        'test': train_test_split['test'],
    })
        

In [None]:
# Read the span-annotated job descriptions and build the dataset splits.
data_path = '../data/Apify_Indeed_job_postings/spacy/job_descriptions_for_classifier.jsonl'
data = read_tagged_job_descriptions(data_path)
# hf_data is the DatasetDict returned by make_data_for_hf, keyed by split name.
hf_data  = make_data_for_hf(data)

# Pull each split out under its own name for the tokenization cells below.
train_dataset = hf_data["train"]
valid_dataset = hf_data["validation"]
test_dataset = hf_data["test"]

# Label-name <-> class-id mappings, later passed to the model constructor.
id2label, label2id = make_label_dicts()

In [None]:
# Display the training split as the cell output (sanity check).
train_dataset

In [None]:
# Settings for the fine-tuning run, gathered in one place.
MODEL = 'bert-base-uncased'     # pretrained checkpoint for tokenizer and model
OUT_DIR = 'jd_classifier_bert'  # output_dir given to TrainingArguments
EPOCHS = 2                      # num_train_epochs
LR = 5e-5                       # learning_rate
BATCH_SIZE = 8                  # map() batch size and per-device train/eval batch size
NUM_PROCS = 1                   # num_proc for Dataset.map

In [None]:
# Load the tokenizer that belongs to the MODEL checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with truncation enabled.

    NOTE(review): a later cell in this notebook defines another function
    named ``preprocess_function``; when the cells run top to bottom that
    later definition replaces this one.

    Args:
        examples: Mapping whose ``"text"`` entry is passed to the tokenizer.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for that text.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [None]:
def preprocess_function(batch):
    """Tokenize a batch of examples and convert label names to class ids.

    Args:
        batch: Mapping with a ``"text"`` list of strings and a ``"label"``
            list of label names.

    Returns:
        The tokenizer output for ``batch["text"]`` (truncated to at most 128
        tokens) with an added ``"label"`` list of integer class ids.

    Raises:
        KeyError: If a label name is missing from the mapping returned by
            ``make_label_dicts``.
    """
    # Take the mapping from make_label_dicts so there is only one definition
    # of the label ids in the notebook.
    _, label2id = make_label_dicts()
    # No padding here: the DataCollatorWithPadding given to the Trainer pads
    # each batch when it is assembled, so padding every map() batch as well
    # would only store extra pad tokens.
    tokenized_batch = tokenizer(batch['text'], truncation=True, max_length=128)
    tokenized_batch["label"] = [label2id[label] for label in batch["label"]]
    return tokenized_batch

In [None]:
# Accuracy metric object, loaded once and reused on every evaluation call.
accuracy = evaluate.load('accuracy')
def compute_metrics(eval_pred):
    """Compute accuracy from an ``(predictions, labels)`` evaluation pair.

    Args:
        eval_pred: Two-item iterable; the first item holds per-class scores
            and the second the reference labels.

    Returns:
        The result of ``accuracy.compute`` for the argmax class of each row.
    """
    logits, references = eval_pred
    # Pick the highest-scoring class along axis 1 for every example.
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Tokenize the three splits with identical settings, in train/validation/test
# order; the generator is consumed immediately by the tuple unpacking.
tokenized_train, tokenized_valid, tokenized_test = (
    split.map(
        preprocess_function,
        batched=True,
        batch_size=BATCH_SIZE,
        num_proc=NUM_PROCS,
    )
    for split in (train_dataset, valid_dataset, test_dataset)
)

In [None]:
# Display the tokenized test split as the cell output (sanity check).
tokenized_test

In [None]:
# Collator built on the notebook's tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Load the pretrained MODEL checkpoint with a sequence-classification head.
# The class count is derived from id2label so it cannot drift from the label
# mapping if classes are added or removed.
model = BertForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Training configuration; the tunable values come from the constants cell.
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    # Evaluation and checkpointing use the same "epoch" schedule.
    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy` - confirm against the
    # installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): no metric_for_best_model is given, so "best" follows the
    # library default - confirm that is the intended selection criterion.
    load_best_model_at_end=True,
    save_total_limit=3,
    report_to='tensorboard',
    fp16=False
)

In [None]:
# Wire model, arguments, tokenized train/validation splits, collator and
# metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
 
# Run fine-tuning; the value returned by train() is kept in `history`.
history = trainer.train()

In [None]:
# Evaluate on the held-out test split; the result is shown as the cell output.
trainer.evaluate(tokenized_test)

In [None]:
# Save the trained model under a directory whose name ends with the base
# checkpoint name.
model_path = f"../models/jd_model1-{MODEL}"
trainer.save_model(model_path)

In [None]:
# Reload model and tokenizer from the saved directory. This rebinds the
# notebook-level `model` and `tokenizer` names to the reloaded objects.
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Text-classification pipeline built from the reloaded model and tokenizer.
classify = pipeline(task='text-classification', model=model, tokenizer=tokenizer)

In [None]:
# Smoke test: classify a short sample phrase and display the prediction.
result = classify("Experience with Python")
result

In [None]:
# Smoke test: classify a second sample phrase and display the prediction.
# NOTE(review): "commited" is misspelled in the input string; confirm whether
# the typo is intentional before relying on this prediction.
result = classify("We are commited to diversity")
result

In [None]:
# Smoke test: classify a third sample phrase and display the prediction.
result = classify("Benefits include dental")
result