# Fine Tuning Generative AI

## Instalando dependencias

In [None]:
%pip install -r ../../requirements.txt

## Importando bibliotecas e configurações iniciais

In [None]:
import os, json, math
from pathlib import Path
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
import random
from transformers import AutoTokenizer, DataCollatorForSeq2Seq
from transformers import (
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel, TaskType
import torch
                

# Paths
DATA_DIR = Path("../data/raw")
OUT_DIR = Path("../data")
OUT_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR = Path("../models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# CSV file paths
PATH_RESUMES = DATA_DIR / "resumes.csv"
PATH_JOBS = DATA_DIR / "jobs.csv"

# Saídas
PROCESSED_DIR = OUT_DIR / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

RANDOM_SEED = 42
random.seed(RANDOM_SEED)

print(
    "Paths set. Resume exists?",
    PATH_RESUMES.exists(),
    " Jobs exists?",
    PATH_JOBS.exists(),
)

Paths set. Resume exists? True  Jobs exists? True


## Criando funções

In [None]:
def read_csv_safe(path):
    import pandas as pd

    try:
        return pd.read_csv(path)
    except Exception as e1:
        try:
            return pd.read_csv(path, encoding="latin1")
        except Exception as e2:
            raise RuntimeError(f"Erro ao ler {path}: {e1} | {e2}")


def find_column(df, candidates):
    """Procura coluna por nome exato ou substring."""
    # match exato
    for c in candidates:
        if c in df.columns:
            return c
    # match parcial
    lc = [col.lower() for col in df.columns]
    for c in candidates:
        for i, col in enumerate(lc):
            if c.lower() in col:
                return df.columns[i]
    return None


def safe_get_str(row, col_name):
    """
    Retorna string limpa do row[col_name] ou '' se coluna não existir / NaN.
    """
    if not col_name or col_name not in row.index:
        return ""
    val = row[col_name]
    if isinstance(val, float) and pd.isna(val):
        return ""
    try:
        s = str(val)
    except Exception:
        s = ""
    return s.strip()


def build_job_text(row, col_title, col_desc, col_skills):
    """Versão simples do construtor de texto para jobs."""
    title = safe_get_str(row, col_title)
    desc = safe_get_str(row, col_desc)
    skills = safe_get_str(row, col_skills)

    combined = f"JOB_TITLE:\n{title}\n\nJOB_DESCRIPTION:\n{desc}\n\nSKILLS:\n{skills}"
    return combined.strip()


def build_resume_text(row, col_resume, col_summary, col_skills):
    """Versão simples do construtor de texto para resumes."""
    resume = safe_get_str(row, col_resume)
    summary = safe_get_str(row, col_summary)
    skills = safe_get_str(row, col_skills)

    combined = f"RESUME:\n{resume}\n\nSUMMARY:\n{summary}\n\nSKILLS:\n{skills}"
    return combined.strip()


def compose_resume_text(row, col_resume, col_summary, col_skills):
    """Versão flexível do construtor (ignora campos vazios)."""
    resume = safe_get_str(row, col_resume)
    summary = safe_get_str(row, col_summary)
    skills = safe_get_str(row, col_skills)

    parts = []
    if resume:
        parts.append(f"RESUME:\n{resume}")
    if summary:
        parts.append(f"SUMMARY:\n{summary}")
    if skills:
        parts.append(f"SKILLS:\n{skills}")

    return "\n\n".join(parts).strip()


def compose_job_text(row, col_title, col_desc, col_skills):
    """Versão flexível do construtor (ignora campos vazios)."""
    title = safe_get_str(row, col_title)
    desc = safe_get_str(row, col_desc)
    skills = safe_get_str(row, col_skills)

    parts = []
    if title:
        parts.append(f"JOB_TITLE:\n{title}")
    if desc:
        parts.append(f"JOB_DESCRIPTION:\n{desc}")
    if skills:
        parts.append(f"SKILLS:\n{skills}")

    return "\n\n".join(parts).strip()


def make_pairs_from_row(row):
    src = row["source"]
    txt = row["text"]
    if src == "resume":
        instr = "Estruture e resuma este currículo em formato profissional (Título, Resumo Profissional, Experiência em bullets, Skills)."
        inp = txt
        out = txt  # inicial: usamos o conteúdo bruto como saída de referência (ponto de partida)
    else:
        instr = "Gere um currículo sugerido e um resumo profissional adaptado a esta vaga. Produza: 1) Um resumo profissional de 2-3 linhas, 2) 4 bullets de experiência simulada (foco nas skills), 3) lista de skills para incluir."
        inp = txt
        # gerar um output sintético a partir do job text (pequeno template)
        # extrai título e skills (simples)
        lines = txt.splitlines()
        title = ""
        skills = ""
        if len(lines) > 0:
            title = lines[0]
        # tenta achar a linha 'SKILLS:' e pegar depois dela
        if "SKILLS:" in txt:
            try:
                skills = txt.split("SKILLS:")[1].strip().split("\n")[0]
            except:
                skills = ""
        out = f"Título: {title}\n\nResumo: Profissional com experiência relevante para {title}. Habilidades-chave: {skills}\n\nExperiência:\n- Contribuiu em projetos usando {skills}\n- ... (preencha após revisão)\n\nSkills: {skills}"
    return {"instruction": instr, "input": inp, "output": out}

## Carregando dataframes de curriculos


In [3]:
df_resumes = pd.read_csv("../data/raw/resumes.csv")
df_resumes

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR
...,...,...,...,...
2479,99416532,RANK: SGT/E-5 NON- COMMISSIONED OFFIC...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION
2480,24589765,"GOVERNMENT RELATIONS, COMMUNICATIONS ...","<div class=""fontsize fontface vmargins hmargin...",AVIATION
2481,31605080,GEEK SQUAD AGENT Professional...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION
2482,21190805,PROGRAM DIRECTOR / OFFICE MANAGER ...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION


## Carregando dataframe de jobs descriptions


In [4]:
df_jobs = pd.read_csv("../data/raw/jobs.csv")
df_jobs

Unnamed: 0,id,title,company,location,link,source,date_posted,work_type,employment_type,description
0,1,Data Analyst,Meta,"New York, NY",https://www.linkedin.com/jobs/view/data-analys...,LinkedIn,2025-04-14,,,The Social Measurement team is a growing team ...
1,2,Data Analyst,Meta,"San Francisco, CA",https://www.linkedin.com/jobs/view/data-analys...,LinkedIn,2025-04-14,,,The Social Measurement team is a growing team ...
2,3,Data Analyst,Meta,"Los Angeles, CA",https://www.linkedin.com/jobs/view/data-analys...,LinkedIn,2025-04-14,,,The Social Measurement team is a growing team ...
3,4,Data Analyst,Meta,"Washington, DC",https://www.linkedin.com/jobs/view/data-analys...,LinkedIn,2025-04-14,,,The Social Measurement team is a growing team ...
4,5,Data Analyst II,Pinterest,"Chicago, IL",https://www.linkedin.com/jobs/view/data-analys...,LinkedIn,2025-04-16,,,About Pinterest\n\nMillions of people around t...
...,...,...,...,...,...,...,...,...,...,...
1043,2521,Senior GIS/Data Management Analyst,"Carollo Engineers, Inc.","Arlington, VA, US",https://www.indeed.com/viewjob?jk=d21d387bd349...,indeed,2025-06-10,,,Overview:\n\nCarollo Engineers is an internati...
1044,2531,Data Science Senior Associate - Card Data & An...,JPMorganChase,"Wilmington, DE, US",https://www.indeed.com/viewjob?jk=80450a9ec23a...,indeed,2025-06-11,,,**JOB DESCRIPTION** \n\nJoin our Card Data \&...
1045,2532,Vice President Data Scientist Marketing Analyt...,JPMorganChase,"Columbus, OH, US",https://www.indeed.com/viewjob?jk=96df9a9fdbad...,indeed,2025-06-11,,,**JOB DESCRIPTION** \n\nWe’re driven by curio...
1046,2533,Lead Data Scientist - Finance Technology,JPMorganChase,"Jersey City, NJ, US",https://www.indeed.com/viewjob?jk=d25ce3283b43...,indeed,2025-06-11,,,**JOB DESCRIPTION** \n\nJoin our Finance Tech...


## Limpeza dos datasets e criando coluna text

In [5]:
resume_candidates = {
    "resume": ["Resume","resume","Resume_str","Resume_text","Text","text","cv","Curriculum","resume_string"],
    "summary": ["Summary","summary","Objective","objective","profile","headline","about"],
    "skills": ["Skills","skills","Key Skills","key_skills","skills_list","Skills/Tools"]
}

job_candidates = {
    "title": ["Title","Job Title","Position","position","title"],
    "description": ["Description","Job Description","Full Description","description","job_description","details","full_description"],
    "skills": ["Skills","skills","Required Skills","requirements","keywords","skills_list"]
}

# ----- Detecta colunas nos dataframes (variáveis ficarão definidas) -----
col_resume = find_column(df_resumes, resume_candidates["resume"])
col_summary = find_column(df_resumes, resume_candidates["summary"])
col_resume_skills = find_column(df_resumes, resume_candidates["skills"])

col_job_title = find_column(df_jobs, job_candidates["title"])
col_job_desc  = find_column(df_jobs, job_candidates["description"])
col_job_skills= find_column(df_jobs, job_candidates["skills"])

print("Detectadas (resumes):", col_resume, "|", col_summary, "|", col_resume_skills)
print("Detectadas (jobs)   :", col_job_title, "|", col_job_desc, "|", col_job_skills)

df_resumes = df_resumes.fillna("")
df_jobs    = df_jobs.fillna("")

# criar coluna text (usa as colunas detectadas; se None, safe_get_str retorna "")
df_resumes["text"] = df_resumes.apply(
    lambda r: compose_resume_text(r, col_resume, col_summary, col_resume_skills),
    axis=1
)
df_jobs["text"] = df_jobs.apply(
    lambda r: compose_job_text(r, col_job_title, col_job_desc, col_job_skills),
    axis=1
)

# adicionar source
df_resumes["source"] = "resume"
df_jobs["source"] = "job"

# manter apenas 'text' e 'source' (se existirem)
for df_name, df in [("resumes", df_resumes), ("jobs", df_jobs)]:
    if "text" not in df.columns:
        raise RuntimeError(f"'text' não foi criado no dataframe {df_name}")

df_resumes_small = df_resumes[["text", "source"]].copy()
df_jobs_small    = df_jobs[["text", "source"]].copy()

# garantir strings e remover linhas vazias
df_resumes_small["text"] = df_resumes_small["text"].astype(str)
df_jobs_small["text"]    = df_jobs_small["text"].astype(str)

before_r = len(df_resumes_small)
before_j = len(df_jobs_small)

df_resumes_small = df_resumes_small[df_resumes_small["text"].str.strip() != ""].reset_index(drop=True)
df_jobs_small    = df_jobs_small[df_jobs_small["text"].str.strip() != ""].reset_index(drop=True)

after_r = len(df_resumes_small)
after_j = len(df_jobs_small)

print(f"Resumes: {before_r} -> {after_r} (linhas válidas) | Jobs: {before_j} -> {after_j} (linhas válidas)")


Detectadas (resumes): Resume_str | None | None
Detectadas (jobs)   : title | description | None
Resumes: 2484 -> 2483 (linhas válidas) | Jobs: 1048 -> 1048 (linhas válidas)


## Concatenar corretamente e criar dataset HF

In [6]:

# concat with ignore_index to avoid index issues
combined_df = pd.concat([df_resumes_small, df_jobs_small], ignore_index=True).reset_index(drop=True)

# drop empty texts
combined_df = combined_df[combined_df["text"].str.strip() != ""].reset_index(drop=True)

print("Combined shape:", combined_df.shape)
display(combined_df.sample(5))

# Convert to HF dataset
hf_dataset = Dataset.from_pandas(combined_df)
dataset_dict = hf_dataset.train_test_split(test_size=0.1, seed=RANDOM_SEED)  # 90/10 split
print(dataset_dict)

Combined shape: (3531, 2)


Unnamed: 0,text,source
2742,"JOB_TITLE:\nData Engineer, Product Analytics\n...",job
1042,RESUME:\nSALES ASSOCIATE Summary Moti...,resume
3239,JOB_TITLE:\n研究所-数据岗\n\nJOB_DESCRIPTION:\n职位来源于...,job
826,RESUME:\nRECREATION & SPORTS COORDINATOR ...,resume
586,RESUME:\nINTERNATIONAL BUSINESS DEVELOPMENT MA...,resume


DatasetDict({
    train: Dataset({
        features: ['text', 'source'],
        num_rows: 3177
    })
    test: Dataset({
        features: ['text', 'source'],
        num_rows: 354
    })
})


## Criar pares instruction/input/output para fine-tuning instrucional

In [7]:
# Map dataset to instruction pairs
pairs = []
for i, row in combined_df.iterrows():
    pairs.append(make_pairs_from_row(row))
    if (
        i >= 5000
    ):  # limite inicial (evita gastar memória se dataset muito grande) - ajuste conforme necessidade
        break

print("Pairs created:", len(pairs))
# Convert to HF dataset
pairs_df = pd.DataFrame(pairs)
pairs_ds = Dataset.from_pandas(pairs_df.reset_index(drop=True))
# split
pairs_dict = pairs_ds.train_test_split(test_size=0.1, seed=RANDOM_SEED)
print(pairs_dict)

Pairs created: 3531
DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 3177
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 354
    })
})


## Tokenizer + Preprocess (para Flan-T5 seq2seq)

In [8]:
MODEL_NAME = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

max_input_length = 512
max_target_length = 256


def preprocess_function(batch):
    instructions = batch["instruction"]
    inputs = batch["input"]
    outputs = batch["output"]

    prompts = [
        f"Instruction: {i}\nInput: {inp}\n\nResponse:"
        for i, inp in zip(instructions, inputs)
    ]

    model_inputs = tokenizer(
        prompts, max_length=max_input_length, truncation=True, padding="max_length"
    )

    labels = tokenizer(
        outputs, max_length=max_target_length, truncation=True, padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = pairs_dict.map(
    preprocess_function, batched=True, remove_columns=pairs_dict["train"].column_names
)

tokenized

Map: 100%|██████████| 3177/3177 [00:04<00:00, 748.14 examples/s]
Map: 100%|██████████| 354/354 [00:00<00:00, 667.28 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3177
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 354
    })
})

## Carregar modelo, preparar LoRa(PEFT) e treinar

In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"--> Treinando no dispositivo: {device.upper()}")

if device == "cpu":
    print("AVISO: Treinar na CPU é lento. Para usar sua RX 6600 no Windows,")
    print("considere instalar o 'torch-directml' no futuro.")

# Desativamos 8-bit pois bitsandbytes
use_8bit = False

# Configuração de carregamento
load_kwargs = {"low_cpu_mem_usage": True}
# Se for CPU, forçamos float32 para estabilidade. Se fosse GPU NVIDIA, usaria float16.
load_kwargs["torch_dtype"] = torch.float32

print("--> Carregando modelo...")
try:
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, **load_kwargs)
except Exception as e:
    print(f"Erro ao carregar com kwargs, tentando padrão. Erro: {e}")
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,  # Uso do Enum é mais seguro
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Mostra quantos % do modelo serão treinados

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

train_args_dict = {
    "output_dir": str(OUT_DIR / "flan_t5_lora_outputs"),
    "per_device_train_batch_size": 2,
    "per_device_eval_batch_size": 2,
    "gradient_accumulation_steps": 8,
    "dataloader_num_workers": 6,
    "logging_steps": 10,
    "num_train_epochs": 3,
    "learning_rate": 2e-4,
    "fp16": False,
    "use_cpu": (
        True if device == "cpu" else False
    ), 
    "save_total_limit": 2,
    "group_by_length": True,
    "remove_unused_columns": True,
}

# Cria o objeto de argumentos
training_args = TrainingArguments(**train_args_dict)

# Ativa geração apenas se quiser ver métricas de texto (BLEU/ROUGE),
setattr(training_args, "predict_with_generate", True)

if not hasattr(training_args, "generation_config"):
    training_args.generation_config = None

# --- 6. INICIAR O TREINADOR ---
print("--> Iniciando Trainer...")
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized.get("test"),
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Treinar
trainer.train()

print("--> Salvando modelo final...")
final_outdir = MODEL_DIR / "resume_job_finetuned_model"
final_outdir.mkdir(parents=True, exist_ok=True)

try:
    # Salva o adaptador LoRA
    model.save_pretrained(str(final_outdir))
    tokenizer.save_pretrained(str(final_outdir))
    print(f"Modelo salvo com sucesso em: {final_outdir}")
except Exception as e:
    print(f"Erro ao salvar com save_pretrained: {e}")
    trainer.save_model(str(final_outdir))
    tokenizer.save_pretrained(str(final_outdir))

`torch_dtype` is deprecated! Use `dtype` instead!


--> Treinando no dispositivo: CPU
AVISO: Treinar na CPU é lento. Para usar sua RX 6600 no Windows,
considere instalar o 'torch-directml' no futuro.
--> Carregando modelo...
trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3561
--> Iniciando Trainer...


  trainer = Seq2SeqTrainer(


Step,Training Loss
10,9.8592
20,7.6183
30,8.6958
40,6.345
50,3.3819
60,1.9539
70,1.5541
80,1.4555
90,1.3051
100,1.1487


--> Salvando modelo final...
Modelo salvo com sucesso em: ..\models\resume_job_finetuned_model


## Testando o modelo

In [22]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from peft import PeftModel, PeftConfig
from pathlib import Path

# Caminho do modelo LoRA treinado
model_path = MODEL_DIR / "resume_job_finetuned_model"
model_path = Path(model_path)

print(f"Tentando carregar modelo de: {model_path}")

peft_config = PeftConfig.from_pretrained(str(model_path))

base_model = AutoModelForSeq2SeqLM.from_pretrained(
    peft_config.base_model_name_or_path,
    device_map="auto",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)

model = PeftModel.from_pretrained(
    base_model,
    str(model_path),
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(str(model_path))

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
)

instruction = """
Instruction: Gere um perfil profissional extenso, aprofundado e extremamente detalhado, com no mínimo 6 parágrafos completos. O texto deve explorar profundamente a trajetória, experiências, conquistas, domínio técnico, metodologias, impacto nos projetos, dificuldades enfrentadas e resultados alcançados. Estruture a resposta em múltiplas áreas, expandindo cada detalhe ao máximo.

Inclua obrigatoriamente:
- Visão geral da carreira e histórico profissional
- Proficiência técnica detalhada, incluindo frameworks, ferramentas e práticas modernas
- Experiências práticas em projetos, com exemplos e impactos reais
- Metodologias e processos dominados (Scrum, CI/CD, Clean Code etc.)
- Soft skills com narrativa aprofundada
- Contribuições únicas, estilo de trabalho e diferenciais
- Potencial futuro do candidato e possíveis posições adequadas

Input: Candidato com experiência em React, Node.js, automação de testes e SQL.
Response:
"""
input_data = "Candidato com experiência em React, Node.js, automação de testes e SQL."

# O PROMPT DEVE SEGUIR O PADRÃO DO TREINAMENTO
prompt = f"""Instruction: {instruction}
Input: {input_data}
Response:"""

print("\n--- Gerando Resposta ---\n")

outputs = pipe(
    prompt,
    max_new_tokens=800,
    min_new_tokens=200,
    do_sample=True,
    temperature=0.9,
    top_k=50,
    top_p=0.95,
    early_stopping=False,
    repetition_penalty=1.1,
    no_repeat_ngram_size=4,
    forced_bos_token_id=None,
    forced_eos_token_id=None,
)

# Em modelos seq2seq, o texto completo inclui o prompt,
# então pegamos apenas a parte DEPOIS de "Response:"
generated = outputs[0]["generated_text"]

if "Response:" in generated:
    generated = generated.split("Response:")[-1].strip()

print("\n🟩 RESPOSTA GERADA:\n")
print(generated)

Tentando carregar modelo de: ..\models\resume_job_finetuned_model


Device set to use cpu



--- Gerando Resposta ---


🟩 RESPOSTA GERADA:

Ttulo: Profissional expanso, aprofundado e extremamente detalhado, com no mnimo 6 parágrafos completos. O texto deve explorar profundamente a traiória, experiências, conquistas, domnio técnico, metodologias, impacto nos projetos, dificuldades enfrentades e resultados alcançados. Estrutura a resposta em mltiplas áreas, expandindo cada detalhe ao máximo. Inclua obrigatoriamente: - Viso geral da carreira e histórico profissional - Proficiência técnica detalhada, incluindo frameworks, ferramentas e práticas modernas
