<a href="https://colab.research.google.com/github/SlipRiders/Resume-Screening-Bot/blob/main/finetunning/fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
!pip install transformers datasets




In [57]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd

job_title_path = '/content/drive/MyDrive/info7375/job_title_des.csv'
resumes_path = '/content/drive/MyDrive/info7375/synthetic-resume-1.csv'
testset_path = '/content/drive/MyDrive/info7375/testset-1.csv'

job_title_data = pd.read_csv(job_title_path)
resumes_data = pd.read_csv(resumes_path)
testset_data = pd.read_csv(testset_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [58]:
merged_data = []

for _, job_row in job_title_data.iterrows():
    for _, resume_row in resumes_data.iterrows():
        merged_data.append({
            'Job Description': job_row['Job Description'],
            'Resume': resume_row['Resume']
        })

train_data = pd.DataFrame(merged_data)

In [64]:
sampled_train_data = train_data.sample(n=2000, random_state=42)
sampled_train_data['labels'] = 0.0  # 使用浮点型标签
from datasets import Dataset
train_dataset = Dataset.from_pandas(sampled_train_data)


In [65]:
testset_data['labels'] = 0.0  # 为测试数据集添加标签列，假设所有标签为0.0
testset_data.rename(columns={'Ground Truth': 'Resume'}, inplace=True)
test_dataset = Dataset.from_pandas(testset_data[['Job Description', 'Resume', 'labels']])

load model and tokennizer

In [66]:
from transformers import AutoTokenizer, BertForSequenceClassification

model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Write a preprocessing function and apply it to the dataset

In [67]:
def preprocess_function(examples):
    inputs = tokenizer(examples['Job Description'], examples['Resume'], truncation=True, padding='max_length')
    # 确保标签是浮点型
    inputs["labels"] = [float(label) for label in examples["labels"]]
    return inputs

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [68]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
)

In [69]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using CPU")

tokenized_train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'], device=device)

model.to(device)

Using CPU


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-1

In [70]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer
)

trainer.train()

Step,Training Loss
500,0.0


TrainOutput(global_step=500, training_loss=2.435661666095257e-05, metrics={'train_runtime': 3029.9056, 'train_samples_per_second': 0.66, 'train_steps_per_second': 0.165, 'total_flos': 66329253888000.0, 'train_loss': 2.435661666095257e-05, 'epoch': 1.0})

In [71]:
# 保存模型
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

# 将模型复制到Google Drive
!cp -r ./fine_tuned_model /content/drive/MyDrive/info7375