In [None]:
import polars as pl
from tqdm.notebook import tqdm

In [None]:
def load_data(filepath):
    """Read a semicolon-delimited CSV file with polars.

    Args:
        filepath: Location of the CSV file; handed unchanged to ``pl.read_csv``.

    Returns:
        Whatever ``pl.read_csv(filepath, separator=";")`` produces.
    """
    frame = pl.read_csv(filepath, separator=";")
    return frame

In [None]:
# Load the cleaned skills table (semicolon-separated CSV).
cv_skills = load_data("Skills_cleaned.csv")

# Unique professional skills as a plain Python list.
# - drop_nulls(): a null entry would be rendered as the text "None" inside the
#   classification prompt and would raise on the `{skill:25}` format spec used
#   when the results are printed.
# - maintain_order=True: without it polars gives no ordering guarantee for
#   unique(), so the list (and every skill/label pairing shown later) could
#   come out in a different order on each run.
skills = (
    cv_skills.filter(pl.col("Skill_Type") == "Professional_Skill")["Skill"]
    .drop_nulls()
    .unique(maintain_order=True)
    .to_list()
)
skills

In [None]:
import os

from huggingface_hub import login

# SECURITY: never commit an access token to a notebook. The token that used to
# be hardcoded in this cell must be treated as leaked and revoked in the
# Hugging Face account settings.
# The token is now taken from the HF_TOKEN environment variable; when it is not
# set, `token=None` is passed and login() asks for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
import re


def extract_label(block: str) -> str:
    """Pull the Hard/Soft verdict out of a piece of model output.

    Only the text right after the first ``Answer:`` marker (matched
    case-insensitively) is inspected: leading whitespace is skipped and at
    most the next 100 characters are considered. The first standalone word
    ``Hard`` or ``Soft`` in that window, in any letter case, is returned in
    capitalised form.

    Args:
        block: Text expected to contain an ``Answer:`` marker.

    Returns:
        ``"Hard"`` or ``"Soft"``, or ``"Unknown"`` when the marker is missing
        or neither word appears within the inspected window.
    """
    answer_match = re.search(r"Answer:\s*([\s\S]{0,100})", block, flags=re.IGNORECASE)
    if answer_match is None:
        return "Unknown"

    # Search for the label only inside the captured window, not the whole text.
    window = answer_match.group(1)
    verdict = re.search(r"\b(Hard|Soft)\b", window, flags=re.IGNORECASE)
    if verdict is None:
        return "Unknown"
    return verdict.group(1).capitalize()

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-7B-Instruct"

# A decoder-only model continues each row from its last position, so batched
# generation needs the pad tokens on the LEFT; with right padding the new token
# of every shorter prompt would be produced after a run of pad tokens. The side
# is set explicitly instead of relying on the checkpoint's default.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")

# `trust_remote_code=True` was dropped: it permits executing Python code shipped
# in the model repository, and this architecture is implemented in transformers
# itself, so that permission is not needed to load the checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", torch_dtype=torch.bfloat16
)


def batch_classify_skills(skills: list[str], batch_size: int = 256) -> list[str]:
    """Classify every skill as Hard or Soft using the module-level model.

    Each skill is inserted into a few-shot prompt ending in ``Answer:``; the
    prompts are tokenized and generated in batches with the global
    ``tokenizer`` and ``model``, one new token per prompt. The decoded text
    (prompt plus generated token) is handed to ``extract_label``.

    Args:
        skills: Skill names to classify.
        batch_size: Number of prompts sent to the model per ``generate`` call.

    Returns:
        One label per input skill, in input order: ``"Hard"``, ``"Soft"`` or
        ``"Unknown"`` (whatever ``extract_label`` returns).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative value used
            to yield an empty result without any error.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    labels: list[str] = []
    for start in tqdm(range(0, len(skills), batch_size)):
        batch = skills[start : start + batch_size]
        prompts = [
            f"""You are an expert HR assistant.
        Classify the following skill as Hard or Soft. Only output one word: Hard or Soft, and nothing else.

        Examples:
        Skill: "Data Management" -> Hard
        Skill: "Time Management" -> Soft
        Skill: "Python Programming" -> Hard
        Skill: "Team Leadership" -> Soft

        Now classify:
        Skill: "{skill}"
        Answer:"""
            for skill in batch
        ]
        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(model.device)

        # do_sample=False forces greedy decoding: the checkpoint's own
        # generation config may enable sampling, which would make the label
        # of a skill change from one run to the next.
        outputs = model.generate(
            **inputs,
            max_new_tokens=1,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        # The per-item debug print of the whole decoded prompt was removed; it
        # flooded the cell output with one full prompt per skill.
        labels.extend(extract_label(text) for text in decoded)
    return labels


# Label every entry of `skills` (256 prompts per batch), then print one
# "<skill> -> <label>" row per skill; `{skill:25}` pads the skill name to a
# minimum width of 25 characters so the arrows line up.
hard_soft_labels = batch_classify_skills(skills, batch_size=256)
for skill, label in zip(skills, hard_soft_labels):
    print(f"{skill:25} -> {label}")

In [None]:
# Distinct labels that came back. extract_label only ever returns "Hard",
# "Soft" or "Unknown", so this set is a subset of those three values.
set(hard_soft_labels)

In [None]:
# Tally of the labels, shown as a (Hard, Soft, Unknown) tuple.
tuple(hard_soft_labels.count(tag) for tag in ("Hard", "Soft", "Unknown"))

In [None]:
# List only the skills that were labelled Soft.
for skill, label in zip(skills, hard_soft_labels):
    if label != "Soft":
        continue
    print(f"{skill:25} -> {label}")