# In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

# Load BERT tokenizer and model
# Both objects are loaded from the same pretrained checkpoint name.
_BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(_BERT_CHECKPOINT)

# Load CSV data
def load_csv(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame.

    The argument is forwarded unchanged to ``pd.read_csv`` with default
    parsing options.
    """
    frame = pd.read_csv(file_path)
    return frame

# Preprocess text
def preprocess_text(text):
    """Return ``text`` wrapped as ``"[CLS] <text> [SEP]"``.

    ``text`` must be a ``str``; any other type raises ``TypeError``.
    """
    return "".join(("[CLS] ", text, " [SEP]"))

# Tokenize and encode text
def encode_text(text, max_length=512):
    """Tokenize ``text`` and return its token ids as a ``(1, seq_len)`` tensor.

    Args:
        text: String to tokenize with the module-level ``tokenizer``.
        max_length: Upper bound on the number of token ids returned. Longer
            sequences are cut to ``max_length - 1`` ids followed by the
            original final id. Pass ``None`` to disable truncation.

    Returns:
        A ``torch.long`` tensor with a leading batch dimension of size 1.

    Raises:
        ValueError: If ``max_length`` is given and is smaller than 1.
    """
    if max_length is not None and max_length < 1:
        raise ValueError("max_length must be at least 1")

    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

    # The default of 512 follows the position limit documented for the
    # bert-base checkpoints; longer inputs make the model's forward pass fail.
    # The last id is kept so a trailing "[SEP]" (as added by preprocess_text)
    # survives truncation.
    if max_length is not None and len(token_ids) > max_length:
        token_ids = token_ids[:max_length - 1] + token_ids[-1:]

    # An explicit dtype keeps the result an integer tensor even for empty input.
    return torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)  # Add batch dimension

# Match job descriptions based on prompt
def match_job_descriptions(csv_data, prompt):
    """Rank job descriptions by cosine similarity to ``prompt``.

    Each text is passed through ``preprocess_text`` and ``encode_text``, then
    embedded as the final-layer hidden state of the first token taken from the
    module-level ``model``.

    The sequence-classification model's first output is its classification
    logits, which come from a head that has not been fine-tuned here, so they
    are not used as the embedding; the encoder's hidden states are requested
    instead.

    Args:
        csv_data: DataFrame with ``job_id`` and ``job_description`` columns.
        prompt: Query text to compare every description against.

    Returns:
        A list of ``(job_id, similarity)`` tuples sorted by descending
        similarity.
    """
    def _embed(text):
        input_ids = encode_text(preprocess_text(text))
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = model(input_ids, output_hidden_states=True, return_dict=True)
        # hidden_states[-1] is the last encoder layer; position 0 is the first token.
        return outputs.hidden_states[-1][:, 0, :].numpy()

    prompt_embedding = _embed(prompt)
    similarities = []
    for _, row in csv_data.iterrows():
        description_embedding = _embed(row['job_description'])
        similarity = cosine_similarity(prompt_embedding, description_embedding)[0][0]
        similarities.append((row['job_id'], similarity))
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities

# Split data into train, eval, and test sets
def split_data(data, test_size=0.2, eval_size=0.2):
    """Partition ``data`` into train, eval and test subsets.

    Two successive ``train_test_split`` calls are made: the first sets aside
    ``test_size`` of ``data`` as the test set, the second takes ``eval_size``
    of what is left as the eval set. ``eval_size`` is therefore a fraction of
    the non-test rows, not of the full input.

    Returns:
        ``(train_data, eval_data, test_data)``.
    """
    remainder, test_data = train_test_split(data, test_size=test_size)
    train_data, eval_data = train_test_split(remainder, test_size=eval_size)
    return train_data, eval_data, test_data

# Calculate accuracy
def calculate_accuracy(predictions, labels):
    """Return the accuracy of ``predictions`` against ``labels``.

    Note the argument order: predictions come first here, while
    ``accuracy_score`` takes the true labels first.
    """
    score = accuracy_score(y_true=labels, y_pred=predictions)
    return score

# Example usage
if __name__ == "__main__":
    # Input table and the query to rank it against
    csv_data = load_csv("job_descriptions.csv")
    prompt = "Data scientist with expertise in machine learning and Python"

    # Train / eval / test partitions
    train_data, eval_data, test_data = split_data(csv_data)

    # Rank every partition against the prompt before printing anything
    matched_jobs_train = match_job_descriptions(train_data, prompt)
    matched_jobs_eval = match_job_descriptions(eval_data, prompt)
    matched_jobs_test = match_job_descriptions(test_data, prompt)

    # Show the five best matches per partition
    for split_label, matches in (
        ("training", matched_jobs_train),
        ("evaluation", matched_jobs_eval),
        ("testing", matched_jobs_test),
    ):
        print(f"Top matched jobs on {split_label} set:")
        for job_id, similarity in matches[:5]:
            print(f"Job ID: {job_id}, Similarity: {similarity}")

    # Demonstration only: a similarity above 0.5 counts as a positive
    # prediction, and the labels are a placeholder list of ones. Real labels
    # are required for a meaningful accuracy figure.
    test_predictions = [int(score > 0.5) for _, score in matched_jobs_test]
    test_labels = [1] * len(test_predictions)  # Example labels, replace with actual labels
    accuracy = calculate_accuracy(test_predictions, test_labels)
    print(f"Accuracy on the test set: {accuracy}")


# In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Load DistilBERT tokenizer and model
# Both objects are loaded from the same pretrained checkpoint name.
_DISTILBERT_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(_DISTILBERT_CHECKPOINT)
model = DistilBertModel.from_pretrained(_DISTILBERT_CHECKPOINT)

# Load CSV data
def load_csv(file_path):
    """Load a CSV file into a DataFrame.

    ``file_path`` is handed straight to ``pd.read_csv``; no parsing options
    are customised.
    """
    table = pd.read_csv(file_path)
    return table

# Preprocess text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Fetch the NLTK resources requested by this cell, in the original order.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
for _resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

# English stop words as a set for fast membership tests in preprocess_text.
stop_words = set()
stop_words.update(stopwords.words('english'))

# First letter of the POS tags to keep: Nouns, Verbs, Adjectives, Adverbs
allowed_word_types = list("NVJR")

def preprocess_text(text):
    """Filter ``text`` down to content words and wrap it in ``[CLS]``/``[SEP]``.

    The text is word-tokenized and POS-tagged. A token is kept when its
    lower-cased form is not in ``stop_words`` and the first character of its
    tag is listed in ``allowed_word_types``. Kept tokens retain their original
    casing and are joined with single spaces.
    """
    kept = [
        token
        for token, tag in pos_tag(word_tokenize(text))
        if token.lower() not in stop_words and tag[0] in allowed_word_types
    ]
    return f"[CLS] {' '.join(kept)} [SEP]"

# Tokenize and encode text
def encode_text(text, max_length=512):
    """Convert ``text`` to a ``(1, seq_len)`` tensor of token ids.

    Args:
        text: String to tokenize with the module-level ``tokenizer``.
        max_length: Maximum number of ids to return. When the tokenized text
            is longer, the first ``max_length - 1`` ids are kept and the
            original last id is re-appended. ``None`` disables the cap.

    Returns:
        A ``torch.long`` tensor whose first dimension is a batch axis of 1.

    Raises:
        ValueError: If ``max_length`` is given and is smaller than 1.
    """
    if max_length is not None and max_length < 1:
        raise ValueError("max_length must be at least 1")

    tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The default of 512 follows the position limit documented for the
    # distilbert-base checkpoints; an over-long sequence fails in the model.
    # Re-appending the last id preserves a trailing "[SEP]" from preprocess_text.
    if max_length is not None and len(token_ids) > max_length:
        token_ids = token_ids[:max_length - 1] + token_ids[-1:]

    # An explicit dtype keeps empty input from producing a float tensor.
    return torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)  # Add batch dimension

# Match job descriptions based on prompt
def match_job_descriptions(csv_data, prompt):
    """Score every job description against ``prompt`` and rank the results.

    A text's embedding is the first-position slice of the model's first
    output, computed without gradient tracking. Rows are compared to the
    prompt embedding with cosine similarity.

    Args:
        csv_data: DataFrame with ``job_id`` and ``job_description`` columns.
        prompt: Query text.

    Returns:
        ``(job_id, similarity)`` tuples, highest similarity first.
    """
    def cls_vector(raw_text):
        token_ids = encode_text(preprocess_text(raw_text))
        with torch.no_grad():
            hidden_states = model(token_ids)[0]
        return hidden_states[:, 0, :].numpy()

    # The prompt is embedded once and reused for every row.
    prompt_embedding = cls_vector(prompt)

    scored = []
    for _, row in csv_data.iterrows():
        candidate = cls_vector(row['job_description'])
        score = cosine_similarity(prompt_embedding, candidate)[0][0]
        scored.append((row['job_id'], score))

    return sorted(scored, key=lambda pair: pair[1], reverse=True)

# Split data into train, eval, and test sets
def split_data(data, test_size=0.2, eval_size=0.2):
    """Return ``(train, eval, test)`` partitions of ``data``.

    The test partition is carved out first using ``test_size``. The eval
    partition is then carved out of the remaining rows using ``eval_size``,
    so ``eval_size`` is relative to the non-test rows.
    """
    non_test, test_data = train_test_split(data, test_size=test_size)
    train_data, eval_data = train_test_split(non_test, test_size=eval_size)
    return train_data, eval_data, test_data

# Example usage
if __name__ == "__main__":
    # Input table and the query to rank it against
    csv_data = load_csv("job_descriptions.csv")
    prompt = "Data scientist with expertise in machine learning and Python"

    # Train / eval / test partitions
    train_data, eval_data, test_data = split_data(csv_data)

    # Rank every partition against the prompt before printing anything
    matched_jobs_train = match_job_descriptions(train_data, prompt)
    matched_jobs_eval = match_job_descriptions(eval_data, prompt)
    matched_jobs_test = match_job_descriptions(test_data, prompt)

    # Show the five best matches per partition
    for split_label, matches in (
        ("training", matched_jobs_train),
        ("evaluation", matched_jobs_eval),
        ("testing", matched_jobs_test),
    ):
        print(f"Top matched jobs on {split_label} set:")
        for job_id, similarity in matches[:5]:
            print(f"Job ID: {job_id}, Similarity: {similarity}")


# In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertModel

# Load DistilBERT tokenizer and model
# Both objects are loaded from the same pretrained checkpoint name.
_DISTILBERT_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(_DISTILBERT_CHECKPOINT)
model = DistilBertModel.from_pretrained(_DISTILBERT_CHECKPOINT)

# spaCy English pipeline used by extract_features
_SPACY_PIPELINE = "en_core_web_sm"
nlp = spacy.load(_SPACY_PIPELINE)

# Load CSV data
def load_csv(file_path):
    """Parse the CSV at ``file_path`` with pandas and return the DataFrame.

    Default ``pd.read_csv`` options are used.
    """
    dataframe = pd.read_csv(file_path)
    return dataframe

# Preprocess text
def preprocess_text(text):
    """Surround ``text`` with the ``[CLS]`` and ``[SEP]`` marker strings.

    ``text`` must be a ``str``; concatenation raises ``TypeError`` otherwise.
    """
    prefix, suffix = "[CLS] ", " [SEP]"
    return prefix + text + suffix

# Extract features from job description
def extract_features(text):
    """Pull entity- and noun-chunk-based features out of a job description.

    ``text`` is run through the module-level spaCy pipeline ``nlp``.

    Returns:
        A 5-tuple of string lists
        ``(skills, locations, experiences, designations, responsibilities)``.
        The first four hold the text of entities labelled ``SKILL``, ``GPE``,
        ``EXPERIENCE`` and ``TITLE``. The last holds noun chunks whose
        lower-cased text contains ``'responsibility'``.
    """
    # NOTE(review): the stock en_core_web_sm pipeline presumably does not emit
    # SKILL / EXPERIENCE / TITLE labels, in which case those lists stay empty
    # - confirm against the pipeline actually loaded.
    doc = nlp(text)

    # One pass over the entities, bucketed by the labels of interest.
    by_label = {'SKILL': [], 'GPE': [], 'EXPERIENCE': [], 'TITLE': []}
    for ent in doc.ents:
        bucket = by_label.get(ent.label_)
        if bucket is not None:
            bucket.append(ent.text)

    responsibilities = []
    for chunk in doc.noun_chunks:
        if 'responsibility' in chunk.text.lower():
            responsibilities.append(chunk.text)

    return (
        by_label['SKILL'],
        by_label['GPE'],
        by_label['EXPERIENCE'],
        by_label['TITLE'],
        responsibilities,
    )

# Tokenize and encode text
def encode_text(text, max_length=512):
    """Map ``text`` to token ids and return them as a ``(1, seq_len)`` tensor.

    Args:
        text: String to tokenize with the module-level ``tokenizer``.
        max_length: Cap on the number of ids returned. Over-long sequences
            keep their first ``max_length - 1`` ids plus their original final
            id. Use ``None`` for no cap.

    Returns:
        A ``torch.long`` tensor with a leading batch dimension of size 1.

    Raises:
        ValueError: If ``max_length`` is given and is smaller than 1.
    """
    if max_length is not None and max_length < 1:
        raise ValueError("max_length must be at least 1")

    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

    # The default of 512 follows the position limit documented for the
    # distilbert-base checkpoints; longer inputs fail inside the model.
    # The final id is retained so a trailing "[SEP]" from preprocess_text is
    # not cut off.
    if max_length is not None and len(ids) > max_length:
        ids = ids[:max_length - 1] + ids[-1:]

    # An explicit dtype keeps the tensor integer-typed for empty input too.
    return torch.tensor(ids, dtype=torch.long).unsqueeze(0)  # Add batch dimension

# Extract features and encode text for training
def preprocess_data(data, max_length=512):
    """Encode every job description and build example binary labels.

    Descriptions tokenize to different lengths, so the id sequences are
    right-padded to a common width before being stacked; concatenating the
    ragged per-row tensors directly would fail.

    Args:
        data: DataFrame with a ``job_description`` column.
        max_length: Maximum ids kept per description. Longer sequences keep
            their first ``max_length - 1`` ids and their final id.

    Returns:
        ``(features, labels)``. ``features`` is a ``torch.long`` tensor of
        shape ``(num_rows, padded_len)``. ``labels`` holds 1 where the
        description contains the substring ``'Data Scientist'`` and 0
        otherwise.
    """
    token_rows = []
    labels = []
    for description in data['job_description']:
        ids = encode_text(preprocess_text(description)).squeeze(0).tolist()
        if len(ids) > max_length:
            ids = ids[:max_length - 1] + ids[-1:]
        token_rows.append(ids)
        # Example label - 1 if 'Data Scientist' appears in the description, else 0
        labels.append(1 if 'Data Scientist' in description else 0)

    # Use the tokenizer's own pad id, falling back to 0 if it defines none.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = 0

    # Consumers of these features should mask positions equal to pad_id so
    # padding does not influence the encoder output.
    width = max((len(ids) for ids in token_rows), default=0)
    padded = [ids + [pad_id] * (width - len(ids)) for ids in token_rows]
    return torch.tensor(padded, dtype=torch.long), torch.tensor(labels)

# Neural network model
class Model(nn.Module):
    """Binary classifier: a transformer encoder followed by one linear unit.

    Args:
        bert: Encoder to wrap. It is called as
            ``bert(input_ids, attention_mask=...)`` and must return a sequence
            whose first item is a ``(batch, seq_len, 768)`` tensor. When
            omitted, the module-level ``model`` is used, as before.
        pad_token_id: Token id treated as padding when the attention mask is
            built. 0 is assumed to be the tokenizer's pad id - confirm against
            the tokenizer in use.
    """

    def __init__(self, bert=None, pad_token_id=0):
        super().__init__()
        # Accepting the encoder explicitly removes the dependence on whatever
        # the global name ``model`` is bound to when the class is instantiated.
        self.bert = model if bert is None else bert
        self.pad_token_id = pad_token_id
        self.fc = nn.Linear(768, 1)  # Output size is 1 for binary classification

    def forward(self, input_ids):
        """Return probabilities of shape ``(batch, 1)`` for ``input_ids``."""
        # Mask padded positions so they are not attended to. For inputs with
        # no pad ids this is all ones, which matches calling the encoder
        # without a mask.
        attention_mask = (input_ids != self.pad_token_id).long()
        hidden = self.bert(input_ids, attention_mask=attention_mask)[0]
        cls_output = hidden[:, 0, :]  # Use CLS token output
        return torch.sigmoid(self.fc(cls_output))

# Train the model
def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    When ``optimizer`` is given, a gradient step is taken for every batch;
    otherwise the pass only measures loss and accuracy.
    """
    total_loss = 0.0
    correct = 0
    seen = 0
    batches = 0
    for inputs, labels in loader:
        targets = labels.float().reshape(-1)
        if optimizer is not None:
            optimizer.zero_grad()
        # reshape(-1) flattens (batch, 1) to (batch,) and, unlike squeeze(),
        # keeps a 1-D tensor for a batch holding a single example.
        probabilities = model(inputs).reshape(-1)
        loss = criterion(probabilities, targets)
        if optimizer is not None:
            loss.backward()
            optimizer.step()
        total_loss += loss.item()
        # Compare rounded predictions with the labels element-wise.
        predicted = torch.round(probabilities.detach())
        correct += (predicted == targets).sum().item()
        seen += len(labels)
        batches += 1
    # max(..., 1) avoids a division by zero when the loader is empty.
    return total_loss / max(batches, 1), correct / max(seen, 1)


# Train the model
def train_model(model, train_loader, eval_loader, num_epochs=5, lr=0.001):
    """Train ``model`` with binary cross-entropy and report per-epoch metrics.

    Args:
        model: Module mapping a batch of inputs to probabilities in [0, 1],
            one per example.
        train_loader: Iterable of ``(inputs, labels)`` batches used for
            optimisation.
        eval_loader: Iterable of ``(inputs, labels)`` batches used for
            evaluation only.
        num_epochs: Number of passes over ``train_loader``.
        lr: Learning rate for the Adam optimizer.

    Returns:
        ``(train_losses, eval_losses, train_accuracies, eval_accuracies)``,
        each a list with one float per epoch. Losses are means over batches;
        accuracies are the fraction of correctly classified examples.
    """
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    train_losses = []
    eval_losses = []
    train_accuracies = []
    eval_accuracies = []

    for epoch in range(num_epochs):
        model.train()
        train_loss, train_accuracy = _run_epoch(model, train_loader, criterion, optimizer)
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)

        model.eval()
        with torch.no_grad():
            eval_loss, eval_accuracy = _run_epoch(model, eval_loader, criterion)
        eval_losses.append(eval_loss)
        eval_accuracies.append(eval_accuracy)

        print(f"Epoch {epoch + 1}/{num_epochs}, "
              f"Train Loss: {train_losses[-1]:.4f}, "
              f"Eval Loss: {eval_losses[-1]:.4f}, "
              f"Train Accuracy: {train_accuracies[-1]:.4f}, "
              f"Eval Accuracy: {eval_accuracies[-1]:.4f}")

    return train_losses, eval_losses, train_accuracies, eval_accuracies

# Plot accuracy and loss
def plot_metrics(train_losses, eval_losses, train_accuracies, eval_accuracies):
    """Plot loss and accuracy curves side by side and show the figure.

    Each argument is a per-epoch sequence. The left panel shows train and
    eval loss, the right panel train and eval accuracy.
    """
    panels = (
        ('Loss', 'Training and Evaluation Loss', train_losses, eval_losses),
        ('Accuracy', 'Training and Evaluation Accuracy', train_accuracies, eval_accuracies),
    )

    plt.figure(figsize=(12, 5))
    for position, (metric, title, train_series, eval_series) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(train_series, label=f'Train {metric}')
        plt.plot(eval_series, label=f'Eval {metric}')
        plt.xlabel('Epoch')
        plt.ylabel(metric)
        plt.title(title)
        plt.legend()

    plt.show()

# Example usage
if __name__ == "__main__":
    # Load CSV data
    csv_data = load_csv("job_descriptions.csv")

    # Preprocess data
    X, y = preprocess_data(csv_data)

    # Split data into train, eval, and test sets. The test set is held out
    # first so that it is defined in this script and never seen in training.
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2)
    X_train, X_eval, y_train, y_eval = train_test_split(X_trainval, y_trainval, test_size=0.2)
    train_loader = torch.utils.data.DataLoader(list(zip(X_train, y_train)), batch_size=16, shuffle=True)
    eval_loader = torch.utils.data.DataLoader(list(zip(X_eval, y_eval)), batch_size=16)
    test_loader = torch.utils.data.DataLoader(list(zip(X_test, y_test)), batch_size=16)

    # Initialize model. A separate name is used so the module-level ``model``
    # (the pretrained encoder) is not overwritten by the classifier.
    classifier = Model()

    # Train the model
    train_losses, eval_losses, train_accuracies, eval_accuracies = train_model(classifier, train_loader, eval_loader)

    # Plot metrics
    plot_metrics(train_losses, eval_losses, train_accuracies, eval_accuracies)

    # Predict on test set
    classifier.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = classifier(inputs)
            # Round the probabilities to 0/1 and flatten (batch, 1) to (batch,)
            predicted = torch.round(outputs).reshape(-1).long()
            predictions.extend(predicted.tolist())
            true_labels.extend(labels.tolist())

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Accuracy on the test set: {accuracy:.4f}")
