In [41]:
# 1. Import Libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import precision_score, recall_score
import numpy as np
import pickle
import re
from typing import List, Dict, Union, Optional
import pandas as pd
import os

In [37]:
# 2. Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/long/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/long/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/long/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [38]:
# 3.1 Text Cleaning and Skill Extraction
class TextPreprocessor:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
    
    def clean_text(self, text: str) -> str:
        # Lowercase
        text = text.lower()
        # Remove special characters and numbers
        text = re.sub(r'[^a-z\s]', '', text)
        # Tokenize
        tokens = word_tokenize(text)
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stop_words]
        return ' '.join(tokens)
    
    def extract_skills(self, text: str, skills_categories: Dict[str, List[str]]) -> Dict[str, List[str]]:
        skills_found = {category: [] for category in skills_categories}
        for category, skills in skills_categories.items():
            for skill in skills:
                pattern = r'\b' + re.escape(skill.lower()) + r'\b'
                if re.search(pattern, text):
                    skills_found[category].append(skill.lower())
        return skills_found

In [39]:
# 4. Define Skills Categories
skills_categories = {
    "programming": ["python", "java", "c++", "javascript", "sql"],
    "data_science": ["machine learning", "data analysis", "numpy", "pandas", "scikit-learn"],
    "web_dev": ["django", "flask", "react", "angular", "html", "css"],
    "cloud": ["aws", "azure", "gcp", "docker", "kubernetes"],
    "tools": ["git", "linux", "jupyter", "excel", "tableau"]
}

In [40]:
# 5. Import Your CSV Files
jd_csv_path = '/home/long/Downloads/AAAIIIIIIIII/CV-JD-Matching-System/data/raw/jd/train/train.csv'
cv_csv_path = '/home/long/Downloads/AAAIIIIIIIII/CV-JD-Matching-System/data/raw/cv/test/test.csv'

# Check if files exist
if not os.path.exists(jd_csv_path):
    print(f"JD file not found at {jd_csv_path}. Please check the path.")
if not os.path.exists(cv_csv_path):
    print(f"CV file not found at {cv_csv_path}. Please check the path.")

# Read CSV files
jd_df = pd.read_csv(jd_csv_path)
cv_df = pd.read_csv(cv_csv_path)

# Extract text columns (assuming the text columns are named 'text')
jd_raw_text = jd_df['text'].iloc[0]  # Adjust the index as needed
cv_raw_text = cv_df['text'].iloc[0]  # Adjust the index as needed

EmptyDataError: No columns to parse from file

In [None]:
# 6. Preprocess the Text
# Initialize Preprocessor
preprocessor = TextPreprocessor()

# Clean the texts
cleaned_jd = preprocessor.clean_text(jd_raw_text)
cleaned_cv = preprocessor.clean_text(cv_raw_text)

print("Cleaned Job Description:")
print(cleaned_jd[:500])  # Print first 500 characters

print("\nCleaned CV:")
print(cleaned_cv[:500])  # Print first 500 characters

# Extract Skills
jd_skills = preprocessor.extract_skills(cleaned_jd, skills_categories)
cv_skills = preprocessor.extract_skills(cleaned_cv, skills_categories)

print("JD Skills:", jd_skills)
print("CV Skills:", cv_skills)

In [None]:
# 6. Preprocess the Text

# Initialize Preprocessor
preprocessor = TextPreprocessor()

# Extract text from PDFs
jd_raw_text = extract_text_from_pdf(jd_pdf_path)
cv_raw_text = extract_text_from_pdf(cv_pdf_path)

# Clean the texts
cleaned_jd = preprocessor.clean_text(jd_raw_text)
cleaned_cv = preprocessor.clean_text(cv_raw_text)

print("Cleaned Job Description:")
print(cleaned_jd[:500])  # Print first 500 characters

print("\nCleaned CV:")
print(cleaned_cv[:500])  # Print first 500 characters

In [None]:
# 7.1 Build Vocabulary
def build_vocab(jd_skills: Dict[str, List[str]], cv_skills: Dict[str, List[str]]) -> Dict[str, int]:
    all_skills = []
    for category in skills_categories:
        all_skills.extend(jd_skills.get(category, []))
        all_skills.extend(cv_skills.get(category, []))
    vocab = {skill.lower(): idx+1 for idx, skill in enumerate(sorted(set(all_skills)))}
    return vocab

vocab = build_vocab(jd_skills, cv_skills)
print("Vocabulary:", vocab)

In [None]:
# 7.2 Encode Skills into Numerical Format
def encode_skills(skills: Dict[str, List[str]], vocab: Dict[str, int]) -> List[int]:
    encoded = []
    for category in skills_categories:
        for skill in skills.get(category, []):
            encoded.append(vocab.get(skill.lower(), 0))
    return encoded

jd_encoded = encode_skills(jd_skills, vocab)
cv_encoded = encode_skills(cv_skills, vocab)

print("Encoded JD Skills:", jd_encoded)
print("Encoded CV Skills:", cv_encoded)

In [None]:
# 8. Define the RNN Model
class RNNModel(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int, hidden_dim: int, output_dim: int):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(embedding_dim * 2, hidden_dim, batch_first=True, nonlinearity='relu')
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, jd, cv):
        embedded_jd = self.embedding(jd)  # [batch_size, seq_length, embedding_dim]
        embedded_cv = self.embedding(cv)  # [batch_size, seq_length, embedding_dim]
        combined = torch.cat((embedded_jd, embedded_cv), dim=2)  # [batch_size, seq_length, embedding_dim*2]
        _, hidden = self.rnn(combined)  # hidden: [1, batch_size, hidden_dim]
        out = self.fc(hidden.squeeze(0))  # [batch_size, output_dim]
        out = self.sigmoid(out)
        return out

In [None]:
# 9. Create Dataset and DataLoader
class SkillsDataset(Dataset):
    def __init__(self, jd_skills_list: List[List[int]], cv_skills_list: List[List[int]], labels: List[float], vocab: Dict[str, int]):
        self.jd_skills_list = jd_skills_list
        self.cv_skills_list = cv_skills_list
        self.labels = labels
        self.vocab = vocab
    
    def __len__(self):
        return len(self.jd_skills_list)
    
    def __getitem__(self, idx):
        return (
            torch.tensor(self.jd_skills_list[idx], dtype=torch.long),
            torch.tensor(self.cv_skills_list[idx], dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.float)
        )

# Example with a single sample
jd_skills_list = [jd_encoded]  # List of lists
cv_skills_list = [cv_encoded]  # List of lists
labels = [1.0]  # 1 for a positive match

dataset = SkillsDataset(jd_skills_list, cv_skills_list, labels, vocab)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Verify the Data Structures
print("Type of jd_skills_list:", type(jd_skills_list))
print("Type of cv_skills_list:", type(cv_skills_list))
print("Type of labels:", type(labels))

print("First element in jd_skills_list:", jd_skills_list[0])
print("First element in cv_skills_list:", cv_skills_list[0])
print("First label:", labels[0])

In [82]:
# 10. Initialize Model, Loss, and Optimizer
vocab_size = len(vocab) + 1  # +1 for padding_idx=0
embedding_dim = 100
hidden_dim = 128
output_dim = 1

model = RNNModel(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# 11. Training Loop with Early Stopping
epochs = 50
patience = 5
best_loss = float('inf')
counter = 0
best_model_path = 'best_model.pth'

for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for jd, cv, label in dataloader:
        optimizer.zero_grad()
        outputs = model(jd, cv)
        
        # Ensure labels have shape [batch_size, 1]
        label = label.unsqueeze(1)
        
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    
    avg_loss = epoch_loss / len(dataloader)
    print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')
    
    # Early Stopping
    if avg_loss < best_loss:
        best_loss = avg_loss
        counter = 0
        # Save the best model
        torch.save(model.state_dict(), best_model_path)
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered.")
            break

In [79]:
# 12. Load the Best Model
model.load_state_dict(torch.load(best_model_path))
model.eval()

In [None]:
# 13. Compute Matching Percentage and Analyze Missing Skills
def compute_matching_percentage(jd_skills: Dict[str, List[str]], cv_skills: Dict[str, List[str]]) -> (float, List[str]):
    jd_set = set()
    cv_set = set()
    for category in skills_categories:
        jd_set.update(jd_skills.get(category, []))
        cv_set.update(cv_skills.get(category, []))
    intersection = jd_set.intersection(cv_set)
    percentage = (len(intersection) / len(jd_set)) * 100 if jd_set else 0
    missing_skills = list(jd_set - cv_set)
    return percentage, missing_skills

matching_percentage, missing_skills = compute_matching_percentage(jd_skills, cv_skills)
print(f"Matching Percentage: {matching_percentage:.2f}%")
print(f"Missing Skills: {missing_skills}")

In [None]:
# 14. Recommend Courses Based on Missing Skills
def recommend_courses(missing_skills: List[str]) -> List[str]:
    course_mapping = {
        'python': 'Python for Everybody',
        'machine learning': 'Introduction to Machine Learning',
        'data analysis': 'Data Analysis with Pandas',
        'java': 'Java Programming Masterclass',
        'javascript': 'JavaScript Essentials',
        'c++': 'C++ Fundamentals',
        'django': 'Django for Beginners',
        'flask': 'Flask Web Development',
        'react': 'React - The Complete Guide',
        'aws': 'AWS Certified Solutions Architect',
        'docker': 'Docker for Developers',
        'git': 'Version Control with Git',
        'sql': 'SQL for Data Science',
        'numpy': 'Numerical Computing with NumPy',
        'pandas': 'Data Manipulation with Pandas',
        'scikit-learn': 'Machine Learning with scikit-learn',
        'html': 'HTML & CSS Essentials',
        'css': 'CSS - The Complete Guide',
        'angular': 'Angular - The Complete Guide',
        'gcp': 'Google Cloud Platform Fundamentals',
        'kubernetes': 'Kubernetes for Developers',
        'linux': 'Linux Administration Bootcamp',
        'jupyter': 'Jupyter Notebook for Data Science',
        'excel': 'Mastering Excel for Data Analysis',
        'tableau': 'Tableau for Data Visualization'
        # Add more mappings as needed
    }
    recommended = [course_mapping.get(skill.lower(), f"Course for {skill}") for skill in missing_skills]
    return recommended

recommendations = recommend_courses(missing_skills)
print("\nRecommended Courses:")
for course in recommendations:
    print(f"- {course}")

In [None]:
# 16. Save the Model as Pickle
# Note: Using torch.save is recommended, but here we use pickle as per your requirement
with open('rnn_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("\nModel saved as 'rnn_model.pkl'")

In [None]:
# 15. Evaluate Precision and Recall
# Dummy true labels
true_labels = [1.0]  # Assuming it's a positive match

# Model predictions
with torch.no_grad():
    predictions = model(
        torch.tensor(jd_encoded, dtype=torch.long).unsqueeze(0),
        torch.tensor(cv_encoded, dtype=torch.long).unsqueeze(0)
    )
    predicted_labels = (predictions.squeeze() > 0.5).int().tolist()

print(f"\nPredicted Labels: {predicted_labels}")

# Calculate Precision and Recall
precision = precision_score(true_labels, predicted_labels, zero_division=0)
recall = recall_score(true_labels, predicted_labels, zero_division=0)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

In [None]:
# 15. Evaluate Precision and Recall
# Dummy true labels
true_labels = [1.0]  # Assuming it's a positive match

# Model predictions
with torch.no_grad():
    predictions = model(
        torch.tensor(jd_encoded, dtype=torch.long).unsqueeze(0),
        torch.tensor(cv_encoded, dtype=torch.long).unsqueeze(0)
    )
    predicted_labels = (predictions.squeeze() > 0.5).int().tolist()

print(f"\nPredicted Labels: {predicted_labels}")

# Calculate Precision and Recall
precision = precision_score(true_labels, predicted_labels, zero_division=0)
recall = recall_score(true_labels, predicted_labels, zero_division=0)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

In [None]:
# 16. Save the Model as Pickle
# Note: Using torch.save is recommended, but here we use pickle as per your requirement
with open('rnn_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("\nModel saved as 'rnn_model.pkl'")