In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
!pip install datasets

!pip install evaluate




In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import (
    RobertaTokenizer, RobertaForSequenceClassification,
    GPT2Tokenizer, GPT2Config, GPT2ForSequenceClassification,
    Trainer, TrainingArguments
)
import torch
from torch.utils.data import Dataset


In [27]:
# Load dataset
file_path = '/content/drive/MyDrive/Resume (1).csv'  # Update this path as needed
resume_data = pd.read_csv(file_path)

# Combine relevant columns into a single text field
resume_data['Resume'] = (
    resume_data['Name'].astype(str) + " " +
    resume_data['Email'].astype(str) + " " +
    resume_data['Phone'].astype(str) + " " +
    resume_data['Skills'].astype(str) + " " +
    resume_data['Experience'].astype(str) + " " +
    resume_data['Education'].astype(str) + " " +
    resume_data['Designation'].astype(str)
)

# Limit the dataset size for efficiency
sample_size = min(500, len(resume_data))
resume_data = resume_data.sample(n=sample_size, random_state=42)

# Add labels (example: use 'Designation' as target)
resume_data['Label'] = resume_data['Designation'].astype('category').cat.codes

# Split dataset into training and testing sets
X = resume_data['Resume']
y = resume_data['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [28]:
class ResumeDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [29]:
# Initialize tokenizer and datasets for RoBERTa
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
max_length = 128

roberta_train_dataset = ResumeDataset(X_train.tolist(), y_train.tolist(), roberta_tokenizer, max_length)
roberta_test_dataset = ResumeDataset(X_test.tolist(), y_test.tolist(), roberta_tokenizer, max_length)

# Load RoBERTa model
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(y.unique()))

# Training arguments for RoBERTa
roberta_training_args = TrainingArguments(
    output_dir='./roberta_results',
    eval_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./roberta_logs',
    logging_steps=10
)

# Trainer for RoBERTa
roberta_trainer = Trainer(
    model=roberta_model,
    args=roberta_training_args,
    train_dataset=roberta_train_dataset,
    eval_dataset=roberta_test_dataset
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
print("Training RoBERTa...")
roberta_trainer.train()

# Evaluate RoBERTa
roberta_predictions, roberta_labels, _ = roberta_trainer.predict(roberta_test_dataset)
roberta_logits = roberta_predictions
roberta_predicted_classes = np.argmax(roberta_logits, axis=-1)
roberta_accuracy = np.mean(roberta_predicted_classes == roberta_labels)

print("RoBERTa Evaluation Results:")
print(f"Accuracy: {roberta_accuracy}")


Training RoBERTa...


Epoch,Training Loss,Validation Loss
1,1.5368,1.312519


RoBERTa Evaluation Results:
Accuracy: 0.8


In [31]:
# Initialize tokenizer and datasets for GPT-2
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token  # Use EOS token as padding

gpt_train_dataset = ResumeDataset(X_train.tolist(), y_train.tolist(), gpt_tokenizer, max_length)
gpt_test_dataset = ResumeDataset(X_test.tolist(), y_test.tolist(), gpt_tokenizer, max_length)

# Configure GPT-2 with padding token
# Configure GPT-2 with padding token and set number of labels
gpt_config = GPT2Config.from_pretrained('gpt2')
gpt_config.pad_token_id = gpt_tokenizer.pad_token_id  # Set padding token
gpt_config.num_labels = len(y.unique())  # Set the number of labels for classification

# Initialize GPT-2 for sequence classification
gpt_model = GPT2ForSequenceClassification.from_pretrained('gpt2', config=gpt_config)

# Training arguments for GPT-2
gpt_training_args = TrainingArguments(
    output_dir='./gpt_results',
    eval_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=1,  # Use batch size of 1 to avoid padding issues
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./gpt_logs',
    logging_steps=10
)

# Trainer for GPT-2
gpt_trainer = Trainer(
    model=gpt_model,
    args=gpt_training_args,
    train_dataset=gpt_train_dataset,
    eval_dataset=gpt_test_dataset,
    tokenizer=gpt_tokenizer
)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  gpt_trainer = Trainer(


In [32]:
print("Training GPT...")
gpt_trainer.train()

# Evaluate GPT-2
gpt_predictions, gpt_labels, _ = gpt_trainer.predict(gpt_test_dataset)
gpt_logits = gpt_predictions
gpt_predicted_classes = np.argmax(gpt_logits, axis=-1)
gpt_accuracy = np.mean(gpt_predicted_classes == gpt_labels)

print("GPT Evaluation Results:")
print(f"Accuracy: {gpt_accuracy}")


Training GPT...


Epoch,Training Loss,Validation Loss
1,0.0091,0.002924


GPT Evaluation Results:
Accuracy: 1.0


In [33]:
print("Summary of Results:")
print(f"RoBERTa Accuracy: {roberta_accuracy}")
print(f"GPT Accuracy: {gpt_accuracy}")


Summary of Results:
RoBERTa Accuracy: 0.8
GPT Accuracy: 1.0


In [34]:
# Fine-tuning RoBERTa
roberta_finetune_args = TrainingArguments(
    output_dir='./roberta_finetune_results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,  # Reduced learning rate for fine-tuning
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,  # More epochs for fine-tuning
    weight_decay=0.01,
    logging_dir='./roberta_finetune_logs',
    logging_steps=50,
    save_total_limit=2,  # Save only the last 2 checkpoints
    save_strategy="epoch"
)

roberta_finetune_trainer = Trainer(
    model=roberta_model,
    args=roberta_finetune_args,
    train_dataset=roberta_train_dataset,
    eval_dataset=roberta_test_dataset,
    tokenizer=roberta_tokenizer
)

print("Fine-tuning RoBERTa...")
roberta_finetune_trainer.train()

# Evaluate fine-tuned RoBERTa
print("Evaluating fine-tuned RoBERTa...")
roberta_predictions, roberta_labels, _ = roberta_finetune_trainer.predict(roberta_test_dataset)
roberta_logits = roberta_predictions
roberta_predicted_classes = np.argmax(roberta_logits, axis=-1)
roberta_accuracy = np.mean(roberta_predicted_classes == roberta_labels)

print("Fine-tuned RoBERTa Results:")
print(f"Accuracy: {roberta_accuracy}")


Fine-tuning RoBERTa...


  roberta_finetune_trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.264498
2,No log,0.081997
3,0.467200,0.052213


Evaluating fine-tuned RoBERTa...


Fine-tuned RoBERTa Results:
Accuracy: 1.0


In [35]:
# Fine-tuning GPT-2
gpt_finetune_args = TrainingArguments(
    output_dir='./gpt_finetune_results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,  # Reduced learning rate for fine-tuning
    per_device_train_batch_size=2,  # Slightly increased batch size
    per_device_eval_batch_size=2,
    num_train_epochs=1,  # More epochs for fine-tuning
    weight_decay=0.01,
    logging_dir='./gpt_finetune_logs',
    logging_steps=50,
    save_total_limit=2,  # Save only the last 2 checkpoints
    save_strategy="epoch"
)

gpt_finetune_trainer = Trainer(
    model=gpt_model,
    args=gpt_finetune_args,
    train_dataset=gpt_train_dataset,
    eval_dataset=gpt_test_dataset,
    tokenizer=gpt_tokenizer
)

print("Fine-tuning GPT...")
gpt_finetune_trainer.train()

# Evaluate fine-tuned GPT
print("Evaluating fine-tuned GPT...")
gpt_predictions, gpt_labels, _ = gpt_finetune_trainer.predict(gpt_test_dataset)
gpt_logits = gpt_predictions
gpt_predicted_classes = np.argmax(gpt_logits, axis=-1)
gpt_accuracy = np.mean(gpt_predicted_classes == gpt_labels)

print("Fine-tuned GPT Results:")
print(f"Accuracy: {gpt_accuracy}")


Fine-tuning GPT...


  gpt_finetune_trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,4.5e-05
2,0.002500,1.7e-05
3,0.001000,1.5e-05


Evaluating fine-tuned GPT...


Fine-tuned GPT Results:
Accuracy: 1.0
