In [1]:
pip install transformers datasets scikit-learn torch pandas


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [1]:
!pip install -U datasets



In [2]:
!pip install --upgrade datasets huggingface_hub fsspec transformers

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [3]:
from datasets import load_dataset
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline
)
import torch
import re

# Load dataset
dataset = load_dataset("facehuggerapoorv/resume-jd-match")

# Preprocess: Combine Resume and JD
def preprocess(example):
    return {
        "text": f"Resume: {example['Resume']} JobDesc: {example['Job Description']}",
        "label": int(example['Label'])
    }

dataset = dataset.map(preprocess)
split = dataset["train"].train_test_split(test_size=0.2)
train_dataset = split["train"]
eval_dataset = split["test"]

# Tokenizer and model
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

def tokenize(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True)
eval_dataset = eval_dataset.map(tokenize, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Training setup
args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune model
trainer.train()

# Load pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Resume fit prediction
def extract_keywords(text):
    words = re.findall(r'\b\w+\b', text.lower())
    stopwords = set(["the", "is", "and", "to", "of", "in", "a", "for", "on", "with", "at", "as", "by"])
    return [word for word in words if word not in stopwords and len(word) > 2]

def predict_fit(resume_text, jd_text):
    resume_keywords = extract_keywords(resume_text)
    jd_keywords = extract_keywords(jd_text)
    combined_text = f"Resume: {' '.join(resume_keywords)} JobDesc: {' '.join(jd_keywords)}"
    result = classifier(combined_text)[0]
    return "Fit" if result["label"] == "LABEL_1" else "Not Fit"

# Example
resume = "Experienced data scientist skilled in NLP, resume parsing, BERT, and model evaluation."
jobdesc = "Looking for an NLP engineer with experience in resume parsing and BERT models."

print("Prediction:", predict_fit(resume, jobdesc))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/411 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/29.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/8.07M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6241 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1759 [00:00<?, ? examples/s]

Map:   0%|          | 0/6241 [00:00<?, ? examples/s]

KeyError: 'Resume'

In [7]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import torch
import evaluate

# Load dataset
print("Loading dataset...")
dataset = load_dataset("facehuggerapoorv/resume-jd-match")
print("Dataset loaded.")

# --- Inspecting Labels ---
print("\nInspecting dataset labels before processing:")
print(f"Train dataset labels sample: {[dataset['train'][i]['label'] for i in range(min(5, len(dataset['train'])))]}")
print(f"Test dataset labels sample: {[dataset['test'][i]['label'] for i in range(min(5, len(dataset['test'])))]}")

# Define a clear label mapping for your 3 classes.
# Assuming 'Good Fit' also maps to 'Fit' for binary classification (num_labels=2).
# If you want 3 classes, set num_labels=3 and map 'Good Fit' to 2.
label_mapping = {
    'No Fit': 0,
    'Fit': 1,
    'Good Fit': 1 # Assuming 'Good Fit' also maps to 'Fit' for binary classification
}

# Load pretrained tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Tokenize and process labels (CORRECTED)
def preprocess(examples): # Renamed 'example' to 'examples' to signify batch
    # Tokenize the batch of texts
    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding=True)

    # Process labels for the batch
    # examples["label"] is now a list of labels, e.g., ['No Fit', 'Fit', 'No Fit']
    processed_labels = []
    for label_str in examples["label"]:
        if label_str in label_mapping:
            processed_labels.append(label_mapping[label_str])
        else:
            # Handle cases where a label might not be in your map
            print(f"Warning: Unexpected label '{label_str}'. Assigning to 'No Fit' (0).")
            processed_labels.append(0) # Default to 'No Fit' if label is not recognized

    tokenized_inputs["labels"] = processed_labels
    return tokenized_inputs

print("\nMapping and tokenizing datasets...")
# Apply mapping to both train and test sets
dataset = dataset.map(preprocess, batched=True)
print("Mapping complete.")

# The `DataCollatorWithPadding` is still useful to ensure consistent batching,
# though `padding=True` in `preprocess` already handles sequence length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Remove the original 'text' and 'label' columns as they are now processed
# and to prevent conflicts with 'input_ids', 'attention_mask', and 'labels'
dataset = dataset.remove_columns(["text", "label"])

# Load accuracy metric
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Ensure labels are torch.LongTensor for torch.argmax compatibility
    labels = torch.tensor(labels, dtype=torch.long)
    predictions = torch.argmax(torch.tensor(logits), dim=-1)

    return accuracy.compute(predictions=predictions, references=labels)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir='./logs',
    logging_steps=100,
    report_to="wandb",
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
print("\nStarting training...")
trainer.train()
print("Training complete.")

# === PREDICTION CODE ===
def check_resume_fit(resume_text, job_description):
    # Ensure consistency with how training data was combined
    combined_text = f"For the given job description <<{job_description}>> the resume: <<{resume_text}>>"

    # Ensure truncation and padding are used during inference
    inputs = tokenizer(combined_text, return_tensors="pt", truncation=True, padding=True)

    # Move inputs to the same device as the model if it's on GPU
    if torch.cuda.is_available():
        inputs = {k: v.to('cuda') for k, v in inputs.items()}
        model.to('cuda')

    with torch.no_grad():
        outputs = model(**inputs)
        # Logits are raw scores, apply softmax to get probabilities
        probabilities = torch.softmax(outputs.logits, dim=-1)
        prediction_id = torch.argmax(probabilities, dim=-1).item()

    # Reverse map the prediction ID to the label name
    reverse_label_map = {v: k for k, v in label_mapping.items()}
    predicted_label = reverse_label_map.get(prediction_id, "Unknown")

    # If you want to return a binary result:
    if predicted_label == 'No Fit':
        return "Unfit Resume ❌"
    else: # This covers 'Fit' and 'Good Fit' based on our mapping
        return "Fit Resume ✅"


# === EXAMPLE USAGE ===
resume = "Experienced software engineer with skills in Python, NLP, and cloud computing."
job_description = "Looking for a backend developer with experience in Python and NLP."

print("\n--- Example Prediction ---")
result = check_resume_fit(resume, job_description)
print(result)

resume_unfit = "I am a chef with 10 years of experience in French cuisine."
job_description_unfit = "Seeking a data scientist with expertise in machine learning and SQL."
result_unfit = check_resume_fit(resume_unfit, job_description_unfit)
print(result_unfit)

resume_good_fit = "Senior AI researcher with PhD in deep learning and 5 years experience with PyTorch and TensorFlow."
job_description_good_fit = "Hiring a lead AI/ML engineer for R&D, strong deep learning and PyTorch skills required."
result_good_fit = check_resume_fit(resume_good_fit, job_description_good_fit)
print(result_good_fit)

Loading dataset...
Dataset loaded.

Inspecting dataset labels before processing:
Train dataset labels sample: ['No Fit', 'No Fit', 'No Fit', 'No Fit', 'No Fit']
Test dataset labels sample: ['No Fit', 'No Fit', 'No Fit', 'No Fit', 'No Fit']


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Mapping and tokenizing datasets...


Map:   0%|          | 0/6241 [00:00<?, ? examples/s]



Map:   0%|          | 0/1759 [00:00<?, ? examples/s]

Mapping complete.


  trainer = Trainer(



Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4735,0.776917,0.68448
2,0.3741,0.898693,0.723138
3,0.3627,0.978525,0.731097


Training complete.

--- Example Prediction ---
Unfit Resume ❌
Unfit Resume ❌
Unfit Resume ❌
