In [None]:
import os
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import pickle
from tqdm import tqdm 
from sklearn.metrics import precision_score  
from sklearn.metrics.pairwise import cosine_similarity

# Location of the job-listings CSV, relative to the working directory
dataset_path = 'data/naukri_data_science_jobs_india.csv'

# Fail fast with a clear message when the CSV is missing
dataset_present = os.path.exists(dataset_path)
if not dataset_present:
    raise FileNotFoundError(f"Dataset not found at {dataset_path}. Please ensure the file exists.")

In [None]:
# Read the job listings into a DataFrame
data = pd.read_csv(dataset_path)

# Normalise the skills column: missing values become '' and every entry is a str
skills_text = data['Skills/Description'].fillna('')
data['Skills/Description'] = skills_text.astype(str)

print(f"Dataset loaded and cleaned successfully with {len(data)} rows!")


Dataset loaded and cleaned successfully with 12000 rows!


In [None]:
# One checkpoint name supplies both the tokenizer and the encoder weights
bert_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
model = AutoModel.from_pretrained(bert_checkpoint)

def get_bert_embedding(text):
    """Embed text by mean-pooling the encoder's last hidden state.

    The text is tokenized (truncated to at most 512 tokens), run through the
    module-level ``model`` without gradient tracking, and the token vectors
    are averaged using the tokenizer's attention mask as weights.

    Weighting by the attention mask keeps padding tokens out of the average.
    For a single string no padding is added, so the result matches a plain
    mean over tokens; when several texts are passed at once, shorter texts
    are padded and an unweighted mean would be diluted by the pad positions.

    Args:
        text: The text to embed (the notebook passes one string per call).

    Returns:
        A NumPy array holding the pooled embedding, with singleton
        dimensions squeezed out (a 1-D vector for a single string).
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Broadcast the mask over the hidden dimension so pad positions contribute zero
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim=1)
    # clamp guards against division by zero should a row ever have no real tokens
    token_count = mask.sum(dim=1).clamp(min=1)
    return (token_sum / token_count).squeeze().numpy()

# Embed each job's skills text, one row at a time, behind a progress bar
skills_progress = tqdm(
    data['Skills/Description'],
    total=len(data),
    desc="Generating BERT embeddings",
)
data['embeddings'] = [get_bert_embedding(skills) for skills in skills_progress]

# Collect the per-job vectors into a single NumPy array
embeddings = np.array(data['embeddings'].tolist())
print(f"Generated embeddings for {len(embeddings)} jobs.")


Generating BERT embeddings: 100%|██████████| 12000/12000 [08:47<00:00, 22.76it/s]

Generated embeddings for 12000 jobs.





In [None]:
# Directory that receives the persisted artefacts (created if missing)
output_dir = 'models'
os.makedirs(output_dir, exist_ok=True)

# One dict per job row; each record carries its 'embeddings' vector as well
job_records = data.to_dict(orient='records')
embeddings_data = {
    'jobs': job_records,
    'model_name': 'bert-base-uncased'
}

# Pickle the bundle next to the other model artefacts
pickle_path = os.path.join(output_dir, 'job_embeddings.pkl')
with open(pickle_path, 'wb') as f:
    pickle.dump(embeddings_data, f)

print("Embeddings and metadata saved successfully!")


Embeddings and metadata saved successfully!


In [None]:
# Flat L2 index sized to the embedding width
# NOTE(review): recommend_jobs ranks by cosine similarity over `embeddings`,
# while this index uses L2 distance — confirm which metric consumers of the
# saved index expect.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# Hand FAISS a float32 copy of the embedding matrix
vectors_f32 = np.array(embeddings, dtype='float32')
index.add(vectors_f32)

print(f"FAISS index contains {index.ntotal} embeddings.")

# Persist the index alongside the pickled job records
index_path = os.path.join(output_dir, 'job_index.faiss')
faiss.write_index(index, index_path)
print("FAISS index saved successfully!")


FAISS index contains 12000 embeddings.
FAISS index saved successfully!


In [None]:
def recommend_jobs(user_skills, top_n=5):
    """Print and return the job roles most similar to the given skills.

    The skills text is embedded with ``get_bert_embedding`` and compared to
    every row of the module-level ``embeddings`` array by cosine similarity.
    The best matches are printed (rank, role, similarity) and their
    ``Job_Role`` values are returned, most similar first.

    Args:
        user_skills: Free-text description of the user's skills.
        top_n: Maximum number of recommendations. Values of zero or below
            yield no recommendations; values above the number of jobs
            return every job.

    Returns:
        A list of ``Job_Role`` values, ordered from most to least similar.
    """
    user_embedding = get_bert_embedding(user_skills).reshape(1, -1)

    # Compute cosine similarities (single query row -> take row 0)
    similarities = cosine_similarity(user_embedding, embeddings)[0]

    # Slice from the front of the descending order: the previous
    # `[-top_n:]` form returned *every* job for top_n == 0 because
    # `[-0:]` is the whole array. Clamping also keeps a negative top_n
    # from producing an arbitrary slice.
    top_n = max(top_n, 0)
    top_indices = similarities.argsort()[::-1][:top_n]

    print("Top recommendations:")
    recommended_jobs = []
    for rank, idx in enumerate(top_indices, start=1):
        job_role = data.iloc[idx]['Job_Role']
        similarity = similarities[idx]
        print(f"{rank}. {job_role} | Similarity: {similarity:.4f}")

        recommended_jobs.append(job_role)

    return recommended_jobs


In [None]:
def calculate_precision(true_labels, predicted_labels, average='micro'):
    """Compute precision for predicted labels against the true labels.

    Thin wrapper around ``sklearn.metrics.precision_score``.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Predicted labels, aligned with ``true_labels``.
        average: Averaging mode forwarded to ``precision_score``. Defaults to
            ``'micro'`` to preserve the existing behaviour. Note that for
            single-label inputs (such as 0/1 relevance flags) micro-averaging
            pools true and false positives over all classes, which makes the
            result equal to accuracy; pass ``average='binary'`` to get the
            precision of the positive class only.

    Returns:
        The precision score as returned by ``precision_score``.
    """
    return precision_score(true_labels, predicted_labels, average=average)


In [8]:
# Example query: a free-text list of skills
user_skills = "Data engineering, Python, SQL, AWS"
recommended_jobs = recommend_jobs(user_skills)

# Illustrative labels only: hard-coded, not derived from recommended_jobs
true_labels, predicted_labels = [1, 0, 1, 0, 1], [1, 0, 1, 1, 1]

# Score the illustrative labels (example values)
precision = calculate_precision(true_labels, predicted_labels)
print(f"Recommendation precision: {precision:.4f}")


Top recommendations:
1. Data Engineer | Similarity: 0.9497
2. Data Analyst - SAS + Python/SQL - 4-8yrs - Worked with big datasets | Similarity: 0.9439
3. Data Scientist | Similarity: 0.9393
4. Urgently Hiring Fresher For Data Engineer | Similarity: 0.9369
5. Data Analyst | Similarity: 0.9346
Recommendation precision: 0.8000


In [9]:
# Example query (this cell repeats the previous example cell with the same inputs)
user_skills = "Data engineering, Python, SQL, AWS"
recommended_jobs = recommend_jobs(user_skills)

# Illustrative labels only: hard-coded, not derived from recommended_jobs
true_labels, predicted_labels = [1, 0, 1, 0, 1], [1, 0, 1, 1, 1]

# Score the illustrative labels (example values)
precision = calculate_precision(true_labels, predicted_labels)
print(f"Recommendation precision: {precision:.4f}")


Top recommendations:
1. Data Engineer | Similarity: 0.9497
2. Data Analyst - SAS + Python/SQL - 4-8yrs - Worked with big datasets | Similarity: 0.9439
3. Data Scientist | Similarity: 0.9393
4. Urgently Hiring Fresher For Data Engineer | Similarity: 0.9369
5. Data Analyst | Similarity: 0.9346
Recommendation precision: 0.8000
