In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

# Pretrained checkpoint name shared by the tokenizer and the model below
_BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(_BERT_CHECKPOINT)

# Load CSV data
def load_csv(file_path):
    """Read the CSV at ``file_path`` with pandas and return the parsed table."""
    frame = pd.read_csv(file_path)
    return frame

# Preprocess text
def preprocess_text(text):
    """Wrap ``text`` between the literal ``"[CLS] "`` and ``" [SEP]"`` markers."""
    opening = "[CLS] "
    closing = " [SEP]"
    return opening + text + closing

# Tokenize and encode text
def encode_text(text):
    """Convert ``text`` into a batch holding one sequence of token ids.

    The text is split into tokens by the module-level ``tokenizer`` and each
    token is mapped to its vocabulary id. Sequences longer than the
    tokenizer's ``model_max_length`` are truncated, since the model is not
    expected to accept inputs beyond that length and long job descriptions
    would otherwise fail inside the forward pass.

    Args:
        text: String to encode; callers in this file pass the output of
            ``preprocess_text``.

    Returns:
        A ``torch.long`` tensor of shape ``(1, num_tokens)``.
    """
    tokens = tokenizer.tokenize(text)
    # Fall back to 512 if the tokenizer does not expose a limit.
    max_length = getattr(tokenizer, "model_max_length", 512)
    if len(tokens) > max_length:
        # Keep the final token so the closing marker appended by
        # preprocess_text survives the truncation.
        tokens = tokens[:max_length - 1] + tokens[-1:]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    # Explicit integer dtype so an empty token list does not become float.
    return torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)  # Add batch dimension

# Match job descriptions based on prompt
def match_job_descriptions(csv_data, prompt):
    """Rank job postings by cosine similarity to a free-text prompt.

    Each text is embedded with the first-position ([CLS]) hidden state of the
    model's underlying encoder. The sequence-classification head is
    deliberately bypassed: it is not trained for this task and only yields
    one logit per class, which carries no usable similarity signal.

    Args:
        csv_data: DataFrame with ``job_id`` and ``job_description`` columns.
        prompt: Query text that every description is compared against.

    Returns:
        A list of ``(job_id, similarity)`` tuples sorted by similarity,
        highest first.
    """
    # base_model resolves to the bare encoder under a task head (and to the
    # model itself when no head is attached).
    encoder = model.base_model

    def embed(text):
        # Embed one text as a (1, hidden_size) array.
        token_ids = encode_text(preprocess_text(text))
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            # Output [0] holds the per-token hidden states; position 0 is [CLS].
            return encoder(token_ids)[0][:, 0, :].numpy()

    prompt_embedding = embed(prompt)
    similarities = []
    for _, row in csv_data.iterrows():
        description_embedding = embed(row['job_description'])
        similarity = cosine_similarity(prompt_embedding, description_embedding)[0][0]
        similarities.append((row['job_id'], similarity))
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities

# Split data into train, eval, and test sets
def split_data(data, test_size=0.2, eval_size=0.2, random_state=None):
    """Split ``data`` into train, eval and test subsets.

    The test subset is carved off first and the eval subset is then taken
    from what remains, so ``eval_size`` is a fraction of the non-test rows
    rather than of the whole dataset (the defaults give roughly
    64% / 16% / 20%).

    Args:
        data: Dataset accepted by ``train_test_split`` (e.g. a DataFrame).
        test_size: Forwarded as ``test_size`` to the first split.
        eval_size: Forwarded as ``test_size`` to the second split, applied to
            the rows left after removing the test subset.
        random_state: Seed forwarded to both splits so the partition can be
            reproduced. ``None`` (the default) keeps the splits unseeded.

    Returns:
        A ``(train_data, eval_data, test_data)`` tuple.
    """
    remaining, test_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )
    train_data, eval_data = train_test_split(
        remaining, test_size=eval_size, random_state=random_state
    )
    return train_data, eval_data, test_data

# Calculate accuracy
def calculate_accuracy(predictions, labels):
    """Score ``predictions`` against ``labels`` using ``accuracy_score``."""
    score = accuracy_score(y_true=labels, y_pred=predictions)
    return score

# Example usage
if __name__ == "__main__":
    # Read the postings and fix the query they are ranked against
    csv_data = load_csv("job_descriptions.csv")
    prompt = "Data scientist with expertise in machine learning and Python"

    # Partition the postings into train / eval / test subsets
    train_data, eval_data, test_data = split_data(csv_data)

    # Rank every subset against the prompt before printing anything
    matched_jobs_train = match_job_descriptions(train_data, prompt)
    matched_jobs_eval = match_job_descriptions(eval_data, prompt)
    matched_jobs_test = match_job_descriptions(test_data, prompt)

    # Show the five best matches of each subset
    reports = (
        ("Top matched jobs on training set:", matched_jobs_train),
        ("Top matched jobs on evaluation set:", matched_jobs_eval),
        ("Top matched jobs on testing set:", matched_jobs_test),
    )
    for heading, ranked in reports:
        print(heading)
        for job_id, similarity in ranked[:5]:
            print(f"Job ID: {job_id}, Similarity: {similarity}")

    # Demonstration only: a real accuracy figure needs labeled data.
    # A posting counts as a predicted match when its similarity exceeds 0.5.
    test_predictions = [int(score > 0.5) for _, score in matched_jobs_test]
    test_labels = [1 for _ in test_predictions]  # Example labels, replace with actual labels
    accuracy = calculate_accuracy(test_predictions, test_labels)
    print(f"Accuracy on the test set: {accuracy}")


In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Load DistilBERT tokenizer and model
# Pretrained checkpoint name shared by the tokenizer and the model below
_DISTILBERT_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(_DISTILBERT_CHECKPOINT)
model = DistilBertModel.from_pretrained(_DISTILBERT_CHECKPOINT)

# Load CSV data
def load_csv(file_path):
    """Parse the CSV found at ``file_path`` and hand back the pandas result."""
    table = pd.read_csv(file_path)
    return table

# Preprocess text
def preprocess_text(text):
    """Return ``text`` prefixed with ``"[CLS] "`` and suffixed with ``" [SEP]"``."""
    wrapped = "[CLS] " + text
    wrapped = wrapped + " [SEP]"
    return wrapped

# Tokenize and encode text
def encode_text(text):
    """Turn ``text`` into a single-row tensor of vocabulary ids.

    Tokenization and id lookup both go through the module-level
    ``tokenizer``. Inputs that produce more tokens than the tokenizer's
    ``model_max_length`` are cut down to that length, because the model is
    not expected to accept longer sequences and would otherwise fail on
    long job descriptions.

    Args:
        text: String to encode; callers in this file pass the output of
            ``preprocess_text``.

    Returns:
        A ``torch.long`` tensor of shape ``(1, num_tokens)``.
    """
    tokens = tokenizer.tokenize(text)
    # Fall back to 512 if the tokenizer does not expose a limit.
    max_length = getattr(tokenizer, "model_max_length", 512)
    if len(tokens) > max_length:
        # Preserve the last token so the closing marker added by
        # preprocess_text is not dropped by the truncation.
        tokens = tokens[:max_length - 1] + tokens[-1:]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    # Explicit integer dtype so an empty token list does not become float.
    return torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)  # Add batch dimension

# Match job descriptions based on prompt
def match_job_descriptions(csv_data, prompt):
    """Score every job description against ``prompt`` and rank the results.

    Each text is embedded as the first-position slice of the model's first
    output, and descriptions are compared to the prompt with cosine
    similarity.

    Args:
        csv_data: DataFrame with ``job_id`` and ``job_description`` columns.
        prompt: Query text that every description is compared against.

    Returns:
        A list of ``(job_id, similarity)`` tuples, highest similarity first.
    """
    def first_token_vector(raw_text):
        # Run one text through the model without tracking gradients.
        token_ids = encode_text(preprocess_text(raw_text))
        with torch.no_grad():
            return model(token_ids)[0][:, 0, :].numpy()

    query_vector = first_token_vector(prompt)
    scored = []
    for _, posting in csv_data.iterrows():
        posting_vector = first_token_vector(posting['job_description'])
        score = cosine_similarity(query_vector, posting_vector)[0][0]
        scored.append((posting['job_id'], score))
    return sorted(scored, key=lambda pair: pair[1], reverse=True)

# Split data into train, eval, and test sets
def split_data(data, test_size=0.2, eval_size=0.2, random_state=None):
    """Partition ``data`` into train, eval and test subsets.

    Two successive ``train_test_split`` calls are used: the first removes the
    test subset, the second takes the eval subset out of the remaining rows.
    ``eval_size`` is therefore relative to the non-test rows, not to the full
    dataset (the defaults give roughly 64% / 16% / 20%).

    Args:
        data: Dataset accepted by ``train_test_split`` (e.g. a DataFrame).
        test_size: Forwarded as ``test_size`` to the first split.
        eval_size: Forwarded as ``test_size`` to the second split.
        random_state: Seed forwarded to both splits for a reproducible
            partition. ``None`` (the default) keeps the splits unseeded.

    Returns:
        A ``(train_data, eval_data, test_data)`` tuple.
    """
    remaining, test_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )
    train_data, eval_data = train_test_split(
        remaining, test_size=eval_size, random_state=random_state
    )
    return train_data, eval_data, test_data

# Example usage
if __name__ == "__main__":
    # Read the postings and fix the query they are ranked against
    csv_data = load_csv("job_descriptions.csv")
    prompt = "Data scientist with expertise in machine learning and Python"

    # Partition the postings into train / eval / test subsets
    train_data, eval_data, test_data = split_data(csv_data)

    # Rank every subset against the prompt before printing anything
    matched_jobs_train = match_job_descriptions(train_data, prompt)
    matched_jobs_eval = match_job_descriptions(eval_data, prompt)
    matched_jobs_test = match_job_descriptions(test_data, prompt)

    # Show the five best matches of each subset
    reports = (
        ("Top matched jobs on training set:", matched_jobs_train),
        ("Top matched jobs on evaluation set:", matched_jobs_eval),
        ("Top matched jobs on testing set:", matched_jobs_test),
    )
    for heading, ranked in reports:
        print(heading)
        for job_id, similarity in ranked[:5]:
            print(f"Job ID: {job_id}, Similarity: {similarity}")
