In [None]:
# Import required libraries

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import DataLoader, Dataset
import random

# Download NLTK stopwords
nltk.download('stopwords')

# Step 1: Load datasets (resumes and job descriptions)
resumes = pd.read_csv("./dataSets/resume.csv")  # Ensure your resume data has a column 'Resume_str'
jobs = pd.read_csv("./dataSets/jobDescription.csv")  # Ensure your job description data has a column 'JobDescription'

# Step 2: Preprocessing text function (cleaning text)
def clean_text(text):
    text = re.sub(r'\W+', ' ', text.lower())  # Remove non-alphanumeric characters and lower the text
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply the clean_text function to both datasets
resumes['Resume_str'] = resumes['Resume_str'].apply(clean_text)
jobs['JobDescription'] = jobs['JobDescription'].apply(clean_text)

# Step 3: Define BERT model
class BertEmbeddings(Dataset):
    def __init__(self, resumes, jobs, tokenizer, max_len):
        self.resumes = resumes
        self.jobs = jobs
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.resumes)

    def __getitem__(self, item):
        resume = self.resumes[item]
        job = self.jobs[item]
        
        inputs = self.tokenizer.encode_plus(
            resume,
            job,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten()
        }

# Step 4: Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Step 5: Create embeddings for resumes and job descriptions
MAX_LEN = 128  # Choose an appropriate max length for your input text
dataset = BertEmbeddings(resumes['Resume_str'].values, jobs['JobDescription'].values, tokenizer, MAX_LEN)

# Step 6: Get BERT embeddings
def get_bert_embeddings(dataset):
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()
    embeddings = []
    with torch.no_grad():
        for data in DataLoader(dataset, batch_size=8):
            input_ids = data['input_ids']
            attention_mask = data['attention_mask']
            outputs = model(input_ids, attention_mask=attention_mask)
            embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy())
    return np.vstack(embeddings)

# Generate embeddings for the dataset
resume_job_embeddings = get_bert_embeddings(dataset)

# Step 7: Generate training data
X = resume_job_embeddings
y = [random.randint(50, 100) for _ in range(len(resumes))]  # Generate dummy scores for now


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/devyadav_00/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Step 8: Train a model to predict the score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, mean_squared_error

# Use 75% for training and 25% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Train a Random Forest Regressor

regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train, y_train)


In [3]:
# Step 9: Evaluate the model
from sklearn.metrics import mean_squared_error

predictions = regressor.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")

# Step 10: Save the model
import joblib
joblib.dump(regressor, 'resume_score_model.pkl')

Mean Squared Error: 226.0449698443333


['resume_score_model.pkl']

In [4]:
import numpy as np

# Define a tolerance margin (e.g., predictions within ±10 points are considered "accurate")
tolerance = 10

correct = 0
total = len(y_test)

for actual, pred in zip(y_test, predictions):
    if abs(actual - pred) <= tolerance:
        correct += 1

accuracy = (correct / total) * 100
print(f"Resume Score Prediction Accuracy (±{tolerance}): {accuracy:.2f}%")


Resume Score Prediction Accuracy (±10): 37.20%


In [119]:
import numpy as np

def soft_accuracy(y_true, y_pred, tolerance=10):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    correct = np.sum(np.abs(y_true - y_pred) <= tolerance)
    total = len(y_true)
    accuracy = (correct / total) * 100
    return accuracy

# Example usage:
tolerance = 10
accuracy = soft_accuracy(y_test, predictions, tolerance)
print(f"Resume Score Prediction Accuracy (±{tolerance}): {accuracy:.2f}%")


Resume Score Prediction Accuracy (±10): 37.60%


In [120]:
def soft_precision_recall_f1(y_true, y_pred, tolerance=10):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # True Positives: prediction within tolerance of actual
    tp = np.sum(np.abs(y_true - y_pred) <= tolerance)

    # False Positives: prediction is within tolerance, but actual is far (simulate as incorrect)
    fp = np.sum((np.abs(y_true - y_pred) > tolerance) & (y_pred >= y_true))

    # False Negatives: prediction is far below the actual value
    fn = np.sum((np.abs(y_true - y_pred) > tolerance) & (y_pred < y_true))

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return precision, recall, f1

# Example usage:
precision, recall, f1 = soft_precision_recall_f1(y_test, predictions, tolerance=10)
print(f"Soft Precision (±10): {precision:.2f}")
print(f"Soft Recall (±10): {recall:.2f}")
print(f"Soft F1 Score (±10): {f1:.2f}")


Soft Precision (±10): 0.52
Soft Recall (±10): 0.57
Soft F1 Score (±10): 0.55
