In [2]:
! pip install pandas langchain langchain-google-genai sentence-transformers tiktoken

^C


Collecting langchain
  Using cached langchain-0.3.25-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-google-genai
  Using cached langchain_google_genai-2.1.5-py3-none-any.whl.metadata (5.2 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting langchain-core<1.0.0,>=0.3.58 (from langchain)
  Using cached langchain_core-0.3.62-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain)
  Using cached langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.18 (from langchain-google-genai)
  Using cached google_ai_generativelanguage-0.6.18-py3-none-any.whl.metadata (9.8 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.52.3-py3-none-any.whl.metadata (40 kB)
Using cached langchain-0.3.25-py3-none-any.whl (1.0 MB)
Using cached langchain_core-0.3.62-py3-none-any

In [3]:
import os
import getpass

# Ask for the key through a hidden prompt so it is not written into the notebook source.
api_key = getpass.getpass(prompt="Insert your GOOGLE_API_KEY: ")


In [8]:
import getpass
from langchain_google_genai import ChatGoogleGenerativeAI
import tiktoken

# LLM configuration function
def config_llm_gemini(api_key: str, temperature: float = 0.7, model_name: str = "gemini-2.0-flash") -> ChatGoogleGenerativeAI:
    """Build a Gemini chat model client.

    Args:
        api_key: Google Generative AI API key.
        temperature: Sampling temperature forwarded to the model.
        model_name: Name of the Gemini model to use.

    Returns:
        A configured ``ChatGoogleGenerativeAI`` instance.
    """
    return ChatGoogleGenerativeAI(
        model=model_name,
        temperature=temperature,
        # The key must go in `google_api_key`; `credentials` is reserved for a
        # google-auth Credentials object, not a dict holding the key.
        google_api_key=api_key,
    )

# Shared model instance; final_score reads this module-level name.
llm = config_llm_gemini(api_key=api_key, temperature=0.7)

# Running total of estimated tokens, incremented by run_chain_and_count.
total_tokens = 0

# Uses the p50k_base encoding as an approximation of Gemini's token count
def count_tokens(text: str) -> int:
    """Return the number of p50k_base tokens that `text` encodes to."""
    encoder = tiktoken.get_encoding('p50k_base')
    token_ids = encoder.encode(text)
    return len(token_ids)

def validate_score(score: float) -> float:
    """Coerce a raw score to a float clamped to the range [-5, 10].

    Args:
        score: Any value accepted by ``float()``; in this notebook it is the
            text returned by the model.

    Returns:
        The parsed value limited to [-5.0, 10.0], or 0.0 when the input
        cannot be parsed or is NaN.
    """
    try:
        val = float(score)
    except (TypeError, ValueError):
        # Only parsing failures fall back to 0.0; KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return 0.0
    if val != val:
        # NaN is the only float not equal to itself. It would pass both
        # clamps below and poison the weighted average.
        return 0.0
    return min(10.0, max(-5.0, val))

from langchain import PromptTemplate, LLMChain

def run_chain_and_count(llm, template: PromptTemplate, inputs: dict) -> str:
    """Run `template` through `llm` and add the estimated token usage to `total_tokens`.

    Args:
        llm: Model passed to ``LLMChain``.
        template: Prompt template to render and execute.
        inputs: Values for the template's variables.

    Returns:
        The chain's response with surrounding whitespace stripped.
    """
    global total_tokens
    # Render the prompt only to estimate its size; the chain formats it again itself.
    rendered_prompt = template.format(**inputs)
    total_tokens += count_tokens(rendered_prompt)
    chain = LLMChain(llm=llm, prompt=template)
    answer = chain.run(inputs)
    total_tokens += count_tokens(answer)
    return answer.strip()

# Skills scoring
# Prompt asking for a single number rating the skills against the job description.
skills_template = PromptTemplate(
    template=(
        "Given the candidate's skills: {skills}\n"
        "And the job description: {job_desc}\n"
        "Rate how well the skills match the job on a scale of -5 to 10. Respond with just the number."
    ),
    input_variables=["skills", "job_desc"],
)

def score_skills(llm, skills: str, job_desc: str) -> float:
    """Ask the model to rate `skills` against `job_desc` and return the validated score."""
    prompt_inputs = {"skills": skills, "job_desc": job_desc}
    return validate_score(run_chain_and_count(llm, skills_template, prompt_inputs))


# Education scoring
# Prompt asking for a single number rating the education fit. The two bullet
# lines are extra rating instructions sent to the model.
edu_template = PromptTemplate(
    template=(
        "Given the candidate's education details: {education}\n"
        "- Consider university ranking and course relevance.\n"
        "- Award extra points for high GPA, but do not penalize low GPA.\n"
        "And the job description: {job_desc}\n"
        "Rate the education fit on a scale of -5 to 10. Respond with just the number."
    ),
    input_variables=["education", "job_desc"],
)
)

def score_education(llm, education: str, job_desc: str) -> float:
    """Ask the model to rate `education` against `job_desc` and return the validated score."""
    prompt_inputs = {"education": education, "job_desc": job_desc}
    return validate_score(run_chain_and_count(llm, edu_template, prompt_inputs))

# Experience scoring
# Prompt asking for a single number rating the professional experience against the job.
exp_template = PromptTemplate(
    template=(
        "Given the candidate's experience: {experience}\n"
        "And the job description: {job_desc}\n"
        "Rate how well the professional experience matches the job requirements on a scale of -5 to 10. Respond with just the number."
    ),
    input_variables=["experience", "job_desc"],
)

def score_experience(llm, experience: str, job_desc: str) -> float:
    """Ask the model to rate `experience` against `job_desc` and return the validated score."""
    prompt_inputs = {"experience": experience, "job_desc": job_desc}
    return validate_score(run_chain_and_count(llm, exp_template, prompt_inputs))
    
# Summary scoring
# Prompt asking for a single number rating how well the summary aligns with the role.
summary_template = PromptTemplate(
    template=(
        "Given the candidate's summary text: {summary}\n"
        "And the job description: {job_desc}\n"
        "Rate how well the summary aligns with the role on a scale of -5 to 10. Respond with just the number."
    ),
    input_variables=["summary", "job_desc"],
)
    
def score_summary(llm, summary: str, job_desc: str) -> float:
    """Ask the model to rate `summary` against `job_desc` and return the validated score."""
    prompt_inputs = {"summary": summary, "job_desc": job_desc}
    return validate_score(run_chain_and_count(llm, summary_template, prompt_inputs))

# Weighted sum of the scores
# Relative weight of each component score in the final weighted average.
weights = {"skills": 1.0, "education": 1.0, "experience": 1.0, "summary": 1.0}
# Normalising constant. final_score reads the lower-case name `total_w`, so it
# must be defined under that spelling.
total_w = sum(weights.values())
# Alias kept so that code referring to the original capitalised name still works.
Total_w = total_w

def final_score(row, desc: str) -> float:
    """Compute the weighted average of the four component scores for one profile.

    Args:
        row: Mapping-like profile with 'skills', 'education', 'experience'
            and 'summary' entries (e.g. a DataFrame row).
        desc: Job description text the profile is scored against.

    Returns:
        The sum of the weighted component scores divided by the sum of all
        weights.
    """
    # The dict literal keeps the original call order: skills, education,
    # experience, summary.
    component_scores = {
        "skills": score_skills(llm, row['skills'], desc),
        "education": score_education(llm, row['education'], desc),
        "experience": score_experience(llm, row['experience'], desc),
        "summary": score_summary(llm, row['summary'], desc),
    }
    weighted_sum = sum(weights[name] * value for name, value in component_scores.items())
    # Normalise using `weights` directly. The original divided by `total_w`,
    # which was never defined (the constant was spelled `Total_w`).
    return weighted_sum / sum(weights.values())


In [None]:
import pandas as pd

# 6. Test function with hypothetical examples
def run_example():
    """Write example CSVs, score every profile against every job and print the result.

    Side effects: writes 'profiles_preprocessed.csv', 'job_1_description.csv'
    through 'job_5_description.csv' and 'profiles_scored.csv' in the current
    working directory, and calls the model through ``final_score``.
    """
    # Create the example CSVs
    example_profiles = [
        {"summary": "Data analyst with financial modeling and visualization expertise.",
         "education": "University of Oxford (Ranked 5), MSc in Finance, GPA: 3.9",
         "experience": "4 years at FinAnalytics LLC focusing on dashboard development.",
         "skills": "Python; SQL; Tableau; Power BI"},
        {"summary": "Backend engineer with microservices architecture background.",
         "education": "Stanford University (Ranked 2), BSc in Computer Science, GPA: 4.0",
         "experience": "3 years at CloudNative Inc building scalable APIs.",
         "skills": "Java; Spring Boot; Docker; Kubernetes"}
    ]
    pd.DataFrame(example_profiles).to_csv('profiles_preprocessed.csv', index=False)
    example_job = [{"description_text": "Seeking Senior Data Analyst with strong Python, SQL and BI tool experience to join finance team."}]
    # Job 1 and the dummy jobs 2-5 all use the same example description
    job_names = [f'job_{i}' for i in range(1, 6)]
    for job in job_names:
        pd.DataFrame(example_job).to_csv(f'{job}_description.csv', index=False)
    # Load the files just written. The original used `profiles_df` and
    # `job_dfs` without defining them in this notebook.
    # NOTE(review): the job keys (and so the 'final_score_<job>' column names)
    # are assumed to be 'job_1'..'job_5' — confirm against the intended pipeline.
    profiles_df = pd.read_csv('profiles_preprocessed.csv')
    job_dfs = {job: pd.read_csv(f'{job}_description.csv') for job in job_names}
    # Run the pipeline
    for job, df in job_dfs.items():
        desc = df.loc[0, 'description_text']
        profiles_df[f'final_score_{job}'] = profiles_df.apply(lambda r: final_score(r, desc), axis=1)
    profiles_df.to_csv('profiles_scored.csv', index=False)
    print("Resultados de exemplo:")
    print(profiles_df)
    print(f"Total tokens usados: {total_tokens}")

# 7. Run the example to check that it works
run_example()

NameError: name 'pd' is not defined