In [None]:
%pip install transformers
%pip install numpy
%pip install pandas
%pip install torch

In [None]:
from IPython.display import display_html
def restart_kernel():
    display_html("<script>Jupyter.notebook.kernel.restart()</script>", raw=True)

# Call the function to restart the kernel
restart_kernel()


In [4]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import numpy as np

# Load pre-trained DistilBERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def encode_text(text):
    """
    Tokenize and encode text using DistilBERT tokenizer
    """
    tokenized_text = tokenizer.encode(text, add_special_tokens=True)
    input_ids = torch.tensor([tokenized_text])
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs[0]
    return last_hidden_states.numpy()

def cosine_similarity(vec1, vec2):
    """
    Compute cosine similarity between two vectors
    """
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    similarity = dot_product / (norm_vec1 * norm_vec2)
    return similarity

def keyword_matching_resume_job(resume, job_description, threshold=0.8):
    """
    Perform keyword matching between resume and job description
    """
    # Encode resume and job description
    resume_encoding = encode_text(resume)
    job_description_encoding = encode_text(job_description)
    
    # Compute cosine similarity between the two encodings
    similarity_score = cosine_similarity(resume_encoding, job_description_encoding)
    
    if similarity_score >= threshold:
        return True
    else:
        return False

# Example usage
resume = "Experienced software engineer with expertise in Python, Java, and machine learning."
job_description = "We are looking for a software engineer proficient in Python, Java, and machine learning."
match = keyword_matching_resume_job(resume, job_description)
if match:
    print("Resume matches job description.")
else:
    print("Resume does not match job description.")


ValueError: shapes (16,768) and (19,768) not aligned: 768 (dim 1) != 19 (dim 0)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load data from CSV files
resume_data = pd.read_csv('resume_data.csv')
job_description_data = pd.read_csv('job_description_data.csv')

# Split data into training and evaluation sets
resume_train, resume_eval = train_test_split(resume_data, test_size=0.2, random_state=42)
job_description_train, job_description_eval = train_test_split(job_description_data, test_size=0.2, random_state=42)

# Fine-tune DistilBERT model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Prepare training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
)

# Define evaluation function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=resume_train,
    eval_dataset=resume_eval,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model on job description data
eval_result = trainer.evaluate(eval_dataset=job_description_eval)

# Print performance metrics
print("Accuracy:", eval_result['eval_accuracy'])
print("Precision:", eval_result['eval_precision'])
print("Recall:", eval_result['eval_recall'])
print("F1-score:", eval_result['eval_f1'])


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load data from CSV files
resume_data = pd.read_csv('resume_data.csv')
job_description_data = pd.read_csv('job_description_data.csv')

# Split data into training and evaluation sets
resume_train, resume_eval = train_test_split(resume_data, test_size=0.2, random_state=42)
job_description_train, job_description_eval = train_test_split(job_description_data, test_size=0.2, random_state=42)

# Fine-tune DistilBERT model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Prepare training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
)

# Define evaluation function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=resume_train,
    eval_dataset=resume_eval,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Prepare evaluation dataset for job descriptions
eval_input_ids = tokenizer(job_description_eval['job_description'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Evaluate the model
eval_output = trainer.predict(eval_input_ids)

# Get predicted labels
pred_labels = torch.argmax(eval_output.predictions, axis=1).tolist()

# Print matching scores for matching resumes and job descriptions
for i, (resume_id, job_id, company, position_title) in enumerate(zip(job_description_eval['job_id'], job_description_eval['company'], job_description_eval['position_title'], job_description_eval['resume_str'])):
    if pred_labels[i] == 1:
        print(f"Resume ID: {resume_id}, Job ID: {job_id}, Company: {company}, Position Title: {position_title}, Matching Score: {eval_output.predictions[i][1]}")


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load data from CSV files
resume_data = pd.read_csv('resume_data.csv')
job_description_data = pd.read_csv('job_description_data.csv')

# Split data into training and evaluation sets
resume_train, resume_eval = train_test_split(resume_data, test_size=0.2, random_state=42)
job_description_train, job_description_eval = train_test_split(job_description_data, test_size=0.2, random_state=42)

# Fine-tune DistilBERT model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Prepare training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
)

# Define evaluation function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=resume_train,
    eval_dataset=resume_eval,
    compute_metrics=compute_metrics,
)

# Train the model and print performance metrics during training
trainer.train()
train_result = trainer.evaluate()
print("Training performance metrics:", train_result)

# Prepare evaluation dataset for job descriptions
eval_input_ids = tokenizer(job_description_eval['job_description'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Evaluate the model on job description evaluation dataset
eval_output = trainer.predict(eval_input_ids)
eval_result = compute_metrics(eval_output)
print("Evaluation performance metrics:", eval_result)

# Print matching scores for matching resumes and job descriptions
for i, (resume_id, job_id, company, position_title) in enumerate(zip(job_description_eval['job_id'], job_description_eval['company'], job_description_eval['position_title'], job_description_eval['resume_str'])):
    if eval_output.predictions[i][1] >= 0.5:
        print(f"Resume ID: {resume_id}, Job ID: {job_id}, Company: {company}, Position Title: {position_title}, Matching Score: {eval_output.predictions[i][1]}")
