# Recommendation Pipeline


In [1]:
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
import ast
from typing import List, Dict, Union, Tuple, Optional
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
import annoy
from pathlib import Path
import time
import networkx as nx
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import pickle
import faiss
from dataclasses import dataclass
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import ollama
import re

# Initialization of job recommendation system components

# Define cache paths: every artefact the pipeline reads or writes lives under
# CACHE_DIR (relative to the notebook's working directory).
CACHE_DIR = Path('recommendation_cache')
CACHE_DIR.mkdir(exist_ok=True)

CACHE_PATHS = {
    'graph': CACHE_DIR / 'final_complete_graph.pkl',
    'dataframe': CACHE_DIR / 'final_graph_dataframe.pkl',
    'annoy_index': CACHE_DIR / 'annoy_index.ann',
    'faiss_index': CACHE_DIR / 'faiss_index.pkl', 
    'graph_metrics': CACHE_DIR / 'graph_metrics.pkl',
    'embeddings_norm': CACHE_DIR / 'normalized_embeddings.pkl',
    'degree_scores': CACHE_DIR / 'degree_scores.pkl',
    'model': CACHE_DIR / 'model_cache.pkl',
    'node_embeddings': CACHE_DIR / 'node_embeddings.pt'
}

# Load node embeddings
print("Loading node embeddings...")
if CACHE_PATHS['node_embeddings'].exists():
    print("Loading node embeddings from cache...")
    # map_location='cpu': the recommender converts this tensor to NumPy, which
    # requires CPU memory, and a tensor saved from a GPU would otherwise fail to
    # deserialize on a CPU-only machine.
    # weights_only=True: restrict unpickling to tensor data instead of running
    # arbitrary pickled code (also silences torch.load's FutureWarning).
    node_embeddings = torch.load(
        CACHE_PATHS['node_embeddings'],
        map_location='cpu',
        weights_only=True,
    )
    print("Node embeddings loaded successfully!")
else:
    raise FileNotFoundError("Node embeddings file not found. Please ensure node embeddings have been generated and saved from step 8.")

# MultiLabelBinarizer setup for job type decoding: fitted on the five known job
# types so that inverse_transform() can turn a multi-hot row back into labels.
mlb = MultiLabelBinarizer()
mlb.fit([['contract'], ['fulltime'], ['internship'], ['parttime'], ['temporary']])

# Geocoder initialization (used to resolve a user-entered postal code to coordinates)
geolocator = Nominatim(user_agent="job_recommender_v1")

# Load or initialize model and tokenizer
def load_or_init_model():
    """Return ``(tokenizer, model, device)`` for the sentence-embedding model.

    The tokenizer/model pair is read from the pickle at ``CACHE_PATHS['model']``
    when that file exists; otherwise both are fetched with ``from_pretrained``
    and pickled there for the next run. The model is moved to CUDA when
    available, else it stays on the CPU.

    Returns:
        tuple: ``(tokenizer, model, device)`` where ``device`` is the string
        ``'cuda'`` or ``'cpu'``.
    """
    print("Loading/initializing language model...")
    cache_file = CACHE_PATHS['model']

    if not cache_file.exists():
        print("Downloading and caching model...")
        checkpoint = 'sentence-transformers/all-MiniLM-L12-v2'
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModel.from_pretrained(checkpoint)

        with open(cache_file, 'wb') as sink:
            pickle.dump({'tokenizer': tokenizer, 'model': model}, sink)
        print("Model downloaded and cached successfully!")
    else:
        print("Loading model from cache...")
        # NOTE: pickle.load runs arbitrary code; only use a cache file you created.
        with open(cache_file, 'rb') as source:
            cached = pickle.load(source)
            tokenizer, model = cached['tokenizer'], cached['model']
        print("Model loaded from cache successfully!")

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")
    model = model.to(device)
    return tokenizer, model, device

# Initialize tokenizer, model, and device
tokenizer, model, device = load_or_init_model()

# Load graph and dataframe
print("Loading graph and dataframe...")
missing_inputs = [
    str(CACHE_PATHS[key])
    for key in ('graph', 'dataframe')
    if not CACHE_PATHS[key].exists()
]
if missing_inputs:
    # Fail fast (as the node-embedding load does): every later cell needs both
    # `graph` and `df`, so continuing would only surface as a NameError later.
    raise FileNotFoundError(
        "Graph or dataframe pickle files do not exist. Please check the paths: "
        + ", ".join(missing_inputs)
    )

# NOTE: pickle.load runs arbitrary code; only load files produced by this project.
with open(CACHE_PATHS['graph'], 'rb') as f:
    graph = pickle.load(f)
with open(CACHE_PATHS['dataframe'], 'rb') as f:
    df = pickle.load(f)
print("Graph and dataframe loaded successfully!")


Loading node embeddings...
Loading node embeddings from cache...
Node embeddings loaded successfully!
Loading/initializing language model...
Loading model from cache...


  node_embeddings = torch.load(CACHE_PATHS['node_embeddings'])


Model loaded from cache successfully!
Using device: cuda
Loading graph and dataframe...
Graph and dataframe loaded successfully!


In [2]:
import traceback
# Function to build Annoy index
def build_ann_index(embeddings_np, n_trees=100):
    """Load the Annoy index from disk, or build and save it from ``embeddings_np``.

    Args:
        embeddings_np: 2-D array; row ``i`` is stored in the index as item ``i``
            and the second dimension fixes the index dimensionality.
        n_trees: Number of trees passed to ``AnnoyIndex.build`` when a new
            index is created (ignored when a cached index is loaded).

    Returns:
        The ``annoy.AnnoyIndex`` (metric ``'angular'``).
    """
    print("\nBuilding/Loading Annoy index...")
    cache_file = CACHE_PATHS['annoy_index']
    use_cache = cache_file.exists()

    print("Loading cached Annoy index..." if use_cache else "Building new Annoy index...")
    index = annoy.AnnoyIndex(embeddings_np.shape[1], 'angular')

    if use_cache:
        index.load(str(cache_file))
        print("Annoy index loaded successfully!")
        return index

    total_items = len(embeddings_np)
    for position, vector in enumerate(embeddings_np):
        # Progress line every 1000 items.
        if position % 1000 == 0:
            print(f"Adding item {position}/{total_items} to Annoy index...")
        index.add_item(position, vector)

    print("Building index with trees...")
    index.build(n_trees)
    print("Saving index to disk...")
    index.save(str(cache_file))
    print("Annoy index built and saved successfully!")
    return index

# Function to build FAISS index
def build_faiss_index(embeddings):
    """Load the pickled FAISS index, or build an inner-product index and pickle it.

    When building, every row of ``embeddings`` is divided by its L2 norm before
    being added to a ``faiss.IndexFlatIP``.

    Args:
        embeddings: 2-D NumPy array of row vectors.

    Returns:
        The FAISS index (cached or freshly built).
    """
    print("\nBuilding/Loading FAISS index...")
    cache_file = CACHE_PATHS['faiss_index']

    if cache_file.exists():
        print("Loading cached FAISS index...")
        # NOTE: pickle.load runs arbitrary code; only use a cache file you created.
        with open(cache_file, 'rb') as source:
            cached_index = pickle.load(source)
        print("FAISS index loaded successfully!")
        return cached_index

    print("Building new FAISS index...")
    index = faiss.IndexFlatIP(embeddings.shape[1])

    print("Normalizing embeddings...")
    row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    unit_rows = embeddings / row_norms

    print("Adding vectors to index...")
    index.add(unit_rows.astype('float32'))

    print("Saving index to disk...")
    with open(cache_file, 'wb') as sink:
        pickle.dump(index, sink)
    print("FAISS index built and saved successfully!")
    return index

# Function to cache normalized embeddings
def cache_normalized_embeddings(embeddings_np):
    """Return row-wise L2-normalised embeddings, using a pickle cache.

    If ``CACHE_PATHS['embeddings_norm']`` exists its contents are returned
    as-is; otherwise the rows of ``embeddings_np`` are divided by their L2
    norms, the result is pickled to that path, and then returned.

    Args:
        embeddings_np: 2-D NumPy array of row vectors.
    """
    print("\nPreparing normalized embeddings...")
    cache_file = CACHE_PATHS['embeddings_norm']

    if not cache_file.exists():
        print("Computing normalized embeddings...")
        unit_rows = embeddings_np / np.linalg.norm(embeddings_np, axis=1, keepdims=True)
        print("Saving normalized embeddings to disk...")
        with open(cache_file, 'wb') as sink:
            pickle.dump(unit_rows, sink)
        print("Normalized embeddings cached successfully!")
        return unit_rows

    print("Loading cached normalized embeddings...")
    # NOTE: pickle.load runs arbitrary code; only use a cache file you created.
    with open(cache_file, 'rb') as source:
        unit_rows = pickle.load(source)
    print("Normalized embeddings loaded successfully!")
    return unit_rows

def compute_pagerank_torch(graph, damping=0.85, max_iter=100, tol=1e-6):
    """Compute PageRank scores by power iteration on a sparse PyTorch matrix.

    The iteration runs on CUDA when available and on the CPU otherwise.
    Edges of an undirected graph are followed in both directions. Rows of
    nodes without outgoing edges are left as zeros, so the rank mass flowing
    into such nodes is not redistributed and the scores may sum to less than 1.

    Args:
        graph: Object exposing ``nodes()`` and ``edges()`` (e.g. a NetworkX
            graph). If it also exposes ``is_directed()`` and that returns
            ``False``, every edge is mirrored.
        damping: Probability of following an edge rather than teleporting.
        max_iter: Maximum number of power-iteration steps.
        tol: L1 change between consecutive iterates below which iteration stops.

    Returns:
        dict: node -> PageRank score (NumPy float32). Empty for an empty graph.
    """
    print("\nComputing PageRank scores...")
    # Create node ID to index mapping
    node_map = {node: idx for idx, node in enumerate(graph.nodes())}
    reverse_map = {idx: node for node, idx in node_map.items()}
    n = len(node_map)
    if n == 0:
        # Nothing to rank; also avoids dividing by n below.
        print("PageRank computation completed!")
        return {}

    # Convert edges using the mapping (explicit integer dtype keeps the
    # zero-edge case valid as sparse-matrix indices).
    edge_pairs = [(node_map[e[0]], node_map[e[1]]) for e in graph.edges()]
    row = np.array([src for src, _ in edge_pairs], dtype=np.int64)
    col = np.array([dst for _, dst in edge_pairs], dtype=np.int64)

    # An undirected graph reports each edge once, in an arbitrary orientation.
    # Mirror every non-self-loop edge so rank can flow both ways.
    is_directed = getattr(graph, "is_directed", None)
    if callable(is_directed) and not is_directed():
        mirrored = row != col
        row, col = (
            np.concatenate([row, col[mirrored]]),
            np.concatenate([col, row[mirrored]]),
        )

    data = np.ones(len(row))
    adj_matrix = csr_matrix((data, (row, col)), shape=(n, n))

    print("Normalizing adjacency matrix...")
    out_degree = np.asarray(adj_matrix.sum(axis=1)).flatten()
    out_degree[out_degree == 0] = 1  # avoid 1/0 for nodes with no out-edges
    D_inv = csr_matrix((1.0 / out_degree, (np.arange(n), np.arange(n))), shape=(n, n))
    stochastic_matrix = D_inv @ adj_matrix

    print("Converting to PyTorch sparse format...")
    # Transpose once here so the loop multiplies by a fixed matrix.
    transposed = stochastic_matrix.T.tocoo()
    indices = torch.from_numpy(np.vstack((transposed.row, transposed.col)).astype(np.int64))
    values = torch.from_numpy(transposed.data.astype(np.float32))
    device = "cuda" if torch.cuda.is_available() else "cpu"
    transition_t = torch.sparse_coo_tensor(indices, values, (n, n)).coalesce().to(device)

    print("Initializing PageRank computation...")
    pagerank_vector = torch.full((n,), 1.0 / n, device=device)
    teleport = (1 - damping) / n

    for i in range(max_iter):
        # squeeze(1) keeps the vector 1-D even when the graph has a single node.
        new_pagerank_vector = teleport + damping * torch.sparse.mm(
            transition_t, pagerank_vector.unsqueeze(1)
        ).squeeze(1)
        delta = torch.norm(new_pagerank_vector - pagerank_vector, p=1)
        # Keep the newest iterate before testing convergence so it is the one returned.
        pagerank_vector = new_pagerank_vector
        if delta < tol:
            print(f"PageRank converged after {i + 1} iterations")
            break

    print("PageRank computation completed!")
    # Convert back to original node IDs
    pagerank_dict = {reverse_map[i]: score for i, score in enumerate(pagerank_vector.cpu().numpy())}
    return pagerank_dict

# Function to cache graph metrics
def cache_graph_metrics(graph):
    """Return degree, PageRank and core-number metrics for ``graph``, with caching.

    A pickle at ``CACHE_PATHS['graph_metrics']`` is returned unchanged when it
    exists. Otherwise the metrics are computed, pickled to that path and
    returned.

    Returns:
        dict with keys ``'degree'`` (node -> degree), ``'max_degree'``,
        ``'pagerank'`` (node -> score from ``compute_pagerank_torch``) and
        ``'core_numbers'`` (node -> ``nx.core_number`` value).
    """
    print("\nComputing/Loading graph metrics...")
    metrics_file = CACHE_PATHS['graph_metrics']

    if metrics_file.exists():
        print("Loading cached graph metrics...")
        # NOTE: pickle.load runs arbitrary code; only use a cache file you created.
        with open(metrics_file, 'rb') as source:
            cached_metrics = pickle.load(source)
        print("Graph metrics loaded successfully!")
        return cached_metrics

    print("Computing graph metrics...")

    print("Computing degree centrality...")
    degree = dict(graph.degree())
    max_degree = max(degree.values())

    print("Computing PageRank...")
    pagerank = compute_pagerank_torch(graph)

    print("Computing core numbers...")
    core_numbers = nx.core_number(graph)

    metrics = dict(
        degree=degree,
        max_degree=max_degree,
        pagerank=pagerank,
        core_numbers=core_numbers,
    )

    print("Saving graph metrics to disk...")
    with open(metrics_file, 'wb') as sink:
        pickle.dump(metrics, sink)
    print("Graph metrics cached successfully!")
    return metrics

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings over dimension 1, counting only unmasked tokens.

    Args:
        model_output: Indexable whose element 0 holds the token embeddings.
        attention_mask: Mask that, after a trailing unsqueeze, is expanded to
            the token-embedding shape; zero entries exclude a token.

    Returns:
        Tensor of masked sums divided by the mask counts (counts are clamped to
        at least 1e-9 so a fully masked row yields zeros rather than NaN).
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = (token_embeddings * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_count

def get_job_description_embedding(text, tokenizer, model, max_chunk_length=512, overlap=50, device='cpu'):
    """Embed a long text by averaging per-chunk sentence embeddings.

    The text is split on whitespace into overlapping windows of
    ``max_chunk_length - 2`` words (consecutive windows share ``overlap``
    words). Each window is tokenized (truncated to ``max_chunk_length``
    tokens), mean-pooled and L2-normalised; the chunk vectors are then averaged.

    Args:
        text: The description to embed.
        tokenizer: Callable tokenizer returning a batch with ``attention_mask``.
        model: Model called with the tokenized batch.
        max_chunk_length: Token limit passed to the tokenizer.
        overlap: Number of words shared by consecutive windows.
        device: Device the tokenized batch is moved to.

    Returns:
        1-D NumPy array; a zero vector of length 384 when ``text`` contains no
        words.
    """
    print("\nGenerating job description embedding...")
    window = max_chunk_length - 2
    stride = window - overlap
    words = text.split()
    chunks = [' '.join(words[start:start + window]) for start in range(0, len(words), stride)]

    total_chunks = len(chunks)
    chunk_vectors = []
    for number, chunk in enumerate(chunks, start=1):
        print(f"Processing chunk {number}/{total_chunks}...")
        batch = tokenizer(
            chunk,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=max_chunk_length,
        ).to(device)

        with torch.no_grad():
            output = model(**batch)

        pooled = mean_pooling(output, batch['attention_mask'])
        unit_vector = F.normalize(pooled, p=2, dim=1)
        chunk_vectors.append(unit_vector.cpu().numpy())

    if not chunk_vectors:
        print("Warning: No embeddings generated, returning zero vector")
        return np.zeros((384,))

    print("Averaging chunk embeddings...")
    return np.mean(chunk_vectors, axis=0).squeeze()

def get_job_title_embedding(text, tokenizer, model, device='cpu'):
    """Embed a job title as a single L2-normalised, mean-pooled vector.

    Args:
        text: Job title to embed.
        tokenizer: Callable tokenizer returning a batch with ``attention_mask``.
        model: Model called with the tokenized batch.
        device: Device the tokenized batch is moved to.

    Returns:
        NumPy array with singleton dimensions squeezed out.
    """
    print(f"\nGenerating embedding for job title: {text}")
    batch = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(**batch)

    pooled = mean_pooling(output, batch['attention_mask'])
    unit_vector = F.normalize(pooled, p=2, dim=1)
    print("Job title embedding generated successfully!")

    return unit_vector.cpu().numpy().squeeze()

def process_job_description_with_LLM(document_text):
    """Summarise a job description with a local Ollama chat model.

    The fixed system prompt below plus ``document_text`` (as the user message)
    are sent through ``ollama.chat``. The reply is then stripped of every
    character other than letters, digits, whitespace, periods and commas, and
    of list-style numbering such as ``1.``.

    Returns:
        The cleaned summary string, or ``None`` if anything in the call or the
        clean-up raises (the error is printed).
    """
    model_name = 'capybarahermes-2.5-mistral-7b.Q5_K_M.gguf:latest'
    prompt = f"""You are an expert in understanding job descriptions and extracting the details and even nuanced requirements for the job. Your goal is to read the input slowly and take time to consider what is written, extract the information and break it down into these 3 aspects:
    1. responsibilites 
    2. qualifications
    3. skills, technical and non-technical
and summarize it in point form line by line.
With each aspect answered, ensure that each of the aspects are properly differentiated and avoid overlaps as much as possible."""

    conversation = [
        {'role': 'system', 'content': prompt},
        {'role': 'user', 'content': document_text},
    ]

    try:
        reply = ollama.chat(model=model_name, messages=conversation)
        raw_text = reply['message']['content']

        # Keep only alphanumerics, whitespace, periods and commas.
        without_symbols = re.sub(r'[^A-Za-z0-9\s.,]', '', raw_text)
        # Drop "<digits>." list markers; a number with digits on both sides of
        # the period (e.g. 2.5) is left alone by the lookarounds.
        return re.sub(r'(?<!\d)(\d+)\.(?!\d)', '', without_symbols).strip()
    except Exception as e:
        print(f"Error processing document: {e}")
        return None

@dataclass
class UserPreferences:
    """Search criteria and scoring weights supplied by the user.

    Every field is optional; ``weights`` falls back to a built-in default
    mapping when it is left as ``None``.
    """
    # (latitude, longitude) of the preferred location
    location: Optional[Tuple[float, float]] = None
    # Text the user entered for the location (kept for display)
    location_name: Optional[str] = None
    job_title: Optional[str] = None
    job_description: Optional[str] = None
    # Distance scale, in kilometres, used when scoring location proximity
    max_distance_km: float = 10.0
    # Score-component name -> weight
    weights: dict = None
    remote_preference: bool = False

    def __post_init__(self):
        # Caller-provided weights are kept untouched.
        if self.weights is not None:
            return
        # A fresh dict per instance, so instances never share the defaults.
        self.weights = dict(
            title_similarity=0.3,
            description_similarity=0.2,
            location_proximity=0.2,
            degree=0.1,
            pagerank=0.1,
            core_number=0.1,
        )
def _resolve_scoring_weights(has_title, has_description, has_location):
    """Return the score-component weights for the inputs the user supplied."""
    location_weight = 0.20 if has_location else 0.0

    if has_title and has_description:
        # All semantic inputs available
        weights = {
            'title_similarity': 0.25,
            'description_similarity': 0.25,
            'pagerank': 0.15,
            'degree': 0.10,
            'core_number': 0.05,
            'location_proximity': location_weight
        }
    elif has_title or has_description:
        # Only one semantic input
        semantic_weight = 0.40
        weights = {
            'title_similarity': semantic_weight if has_title else 0.0,
            'description_similarity': semantic_weight if has_description else 0.0,
            'pagerank': 0.20,
            'degree': 0.15,
            'core_number': 0.05,
            'location_proximity': location_weight
        }
    else:
        # No semantic inputs
        weights = {
            'pagerank': 0.35,
            'degree': 0.35,
            'core_number': 0.10,
            'location_proximity': location_weight
        }

    # Without a location the remaining weights are rescaled to sum to 1.
    if not has_location:
        total_weight = sum(weights.values())
        weights = {k: v / total_weight for k, v in weights.items()}
    return weights


def get_graph_based_recommendations(
    graph: nx.Graph,
    node_embeddings: torch.Tensor,
    preferences: UserPreferences,
    n_hops: int = 2,
    top_k: int = 5,
    n_candidates: int = 1000
) -> List[dict]:
    """Generate recommendations using both FAISS and Annoy for hybrid search.

    Candidates are the union of the FAISS and Annoy nearest neighbours of the
    title embedding (up to ``n_candidates`` from each), or a PageRank-weighted
    random sample when no title is given. Each candidate gets a weighted sum of
    title/description similarity, location proximity and graph metrics, and the
    ``top_k`` best are returned.

    Relies on the notebook-level globals ``df``, ``tokenizer``, ``model``,
    ``device`` and ``mlb``. Graph nodes are addressed as ``"job_<i>"`` where
    ``i`` is the embedding row; this assumes ``df``'s index labels line up with
    the embedding rows — TODO confirm.

    Args:
        graph: Graph whose ``"job_<i>"`` nodes carry ``lat_long``,
            ``job_type_encoding``, ``company`` and ``is_remote`` attributes.
        node_embeddings: 2-D tensor of node embeddings.
        preferences: User criteria. NOTE: ``preferences.weights`` is
            overwritten with weights derived from which inputs are present.
        n_hops: Accepted for interface compatibility; not used.
        top_k: Number of recommendations to return.
        n_candidates: Number of candidates to retrieve/sample before scoring.

    Returns:
        List of recommendation dicts sorted by ``final_score`` (descending).
    """
    print("\nStarting recommendation generation...")
    start_time = time.time()

    print("Preparing embeddings...")
    # detach()/cpu() make the conversion work for GPU or grad-tracking tensors too.
    embeddings_np = node_embeddings.detach().cpu().numpy()
    print(f"Embeddings shape: {embeddings_np.shape}")

    print("\nInitializing search indices...")
    faiss_index = build_faiss_index(embeddings_np)
    annoy_index = build_ann_index(embeddings_np)
    normalized_embeddings = cache_normalized_embeddings(embeddings_np)
    graph_metrics = cache_graph_metrics(graph)

    # Map graph node name -> job id from the dataframe.
    node_to_id = {f"job_{idx}": job_id for idx, job_id in df['id'].items()}

    # One definition of "input provided", used for both candidate selection and
    # weighting, so an empty string never earns a weight without a score.
    has_title = bool(preferences.job_title)
    has_description = bool(preferences.job_description)
    has_location = preferences.location is not None

    print("\nGenerating candidate nodes...")
    title_embedding = None
    if has_title:
        print(f"Finding similar jobs to title: {preferences.job_title}")
        title_embedding = get_job_title_embedding(preferences.job_title, tokenizer, model, device)
        title_embedding = title_embedding / np.linalg.norm(title_embedding)

        # Retrieve a wide candidate pool (n_candidates) so the blended score can
        # re-rank beyond the few nearest titles; never ask for more items than
        # the indices hold.
        n_search = min(max(top_k, n_candidates), len(embeddings_np))

        print("Searching with FAISS...")
        _, I_faiss = faiss_index.search(title_embedding.astype('float32').reshape(1, -1), n_search)
        # FAISS pads missing results with -1; drop those.
        faiss_candidates = {int(i) for i in I_faiss[0] if i >= 0}

        print("Searching with Annoy...")
        annoy_candidates = {
            int(i)
            for i in annoy_index.get_nns_by_vector(title_embedding, n_search, search_k=-1)
        }

        candidate_indices = faiss_candidates | annoy_candidates
        print(f"Found {len(candidate_indices)} candidate indices")
    else:
        print("No job title specified, using PageRank for candidate selection...")
        sampling_probs = np.array(
            [graph_metrics['pagerank'][f"job_{i}"] for i in range(len(df))],
            dtype=np.float64,
        )
        sampling_probs = sampling_probs / sampling_probs.sum()
        sampled = np.random.choice(
            len(df),
            size=min(n_candidates, len(df)),
            p=sampling_probs,
            replace=False
        )
        candidate_indices = {int(i) for i in sampled}

    print("\nComputing job scores...")

    # NOTE(review): this replaces any weights the caller set on `preferences`
    # (including the ones collected interactively) — confirm this is intended.
    preferences.weights = _resolve_scoring_weights(has_title, has_description, has_location)

    # Pre-compute normalisers for the graph metrics ("or 1" avoids dividing by 0).
    print("Normalizing graph metrics...")
    max_pagerank = max(graph_metrics['pagerank'].values(), default=0) or 1
    max_core_number = max(graph_metrics['core_numbers'].values(), default=0) or 1

    # Degree is measured inside the candidate subgraph. If no two candidates
    # are connected every degree is 0, so fall back to 1 instead of raising
    # ZeroDivisionError for every candidate.
    subgraph = graph.subgraph([f"job_{i}" for i in candidate_indices])
    degree_scores = dict(subgraph.degree())
    max_degree = max(degree_scores.values(), default=0) or 1

    # The description embedding does not depend on the candidate: compute it
    # once, and scale it to unit length so the dot product below is a cosine.
    desc_embedding = None
    if has_description:
        desc_embedding = get_job_description_embedding(
            preferences.job_description,
            tokenizer,
            model,
            device=device
        )
        desc_norm = np.linalg.norm(desc_embedding)
        if desc_norm > 0:
            desc_embedding = desc_embedding / desc_norm

    batch_size = 100
    candidate_list = sorted(candidate_indices)
    total_batches = (len(candidate_list) + batch_size - 1) // batch_size
    job_scores = []

    for current_batch, start in enumerate(range(0, len(candidate_list), batch_size), start=1):
        print(f"\nProcessing batch {current_batch}/{total_batches}...")

        for idx in candidate_list[start:start + batch_size]:
            try:
                node = f"job_{idx}"
                attrs = graph.nodes[node]

                score_components = {}

                # Semantic similarities: map cosine from [-1,1] to [0,1] using (x + 1) / 2
                if title_embedding is not None:
                    raw_similarity = np.dot(title_embedding, normalized_embeddings[idx])
                    score_components['title_similarity'] = (raw_similarity + 1) / 2

                if desc_embedding is not None:
                    raw_similarity = np.dot(desc_embedding, normalized_embeddings[idx])
                    score_components['description_similarity'] = (raw_similarity + 1) / 2

                # Location proximity: exponential decay with distance
                if has_location:
                    raw_location = attrs['lat_long']
                    job_location = (
                        ast.literal_eval(raw_location)
                        if isinstance(raw_location, str)
                        else raw_location
                    )
                    distance = geodesic(preferences.location, job_location).kilometers
                    score_components['location_proximity'] = np.exp(-distance / preferences.max_distance_km)

                # Normalize graph metrics
                score_components['degree'] = degree_scores[node] / max_degree
                score_components['pagerank'] = graph_metrics['pagerank'][node] / max_pagerank
                score_components['core_number'] = graph_metrics['core_numbers'][node] / max_core_number

                # Clamp every component into [0,1]; a non-finite value (e.g. from
                # a zero-norm embedding) scores 0 instead of being clamped to 1.
                for component, value in score_components.items():
                    value = float(value)
                    score_components[component] = (
                        min(1.0, max(0.0, value)) if np.isfinite(value) else 0.0
                    )

                # Calculate final score using normalized components and weights
                final_score = sum(
                    score * preferences.weights.get(component, 0)
                    for component, score in score_components.items()
                )

                # Decode the multi-hot job type back into labels.
                job_type_encoded = attrs['job_type_encoding']
                if isinstance(job_type_encoded, str):
                    job_type_encoded = ast.literal_eval(job_type_encoded)
                job_type_encoded = np.array(job_type_encoded)
                if job_type_encoded.ndim == 1:
                    job_type_encoded = job_type_encoded.reshape(1, -1)
                job_type_decoded = mlb.inverse_transform(job_type_encoded)[0]

                job_scores.append({
                    'index': node_to_id[node],
                    'company': attrs['company'],
                    'job_type': job_type_decoded,
                    'location': attrs['lat_long'],
                    'is_remote': attrs['is_remote'],
                    'score_components': score_components,
                    'final_score': final_score
                })

            except Exception as e:
                # Best effort: a malformed job is reported and skipped.
                print(f"Error processing index {idx}: {str(e)}")
                traceback.print_exc()
                continue

    print("\nSorting recommendations...")
    recommendations = sorted(
        job_scores,
        key=lambda x: x['final_score'],
        reverse=True
    )[:top_k]

    # Get URLs and titles for only the top recommendations; keep each row so the
    # similarity step below does not have to look it up again.
    top_rows = []
    for rec in recommendations:
        job_data = df.loc[df['id'] == rec['index']].iloc[0]
        rec['job_url'] = job_data['job_url']
        rec['job_url_direct'] = job_data['job_url_direct']
        rec['title'] = job_data['title']
        top_rows.append(job_data)

    # Add pairwise cosine similarities between the top-k recommendations
    if len(recommendations) > 1:
        print("\nComputing similarity matrix for top recommendations...")

        # Pre-computed embeddings stored in the DataFrame
        title_embeddings = np.array([row['job_title_embedding'] for row in top_rows])
        desc_embeddings = np.array([row['job_description_embedding'] for row in top_rows])

        title_similarity = cosine_similarity(title_embeddings)
        desc_similarity = cosine_similarity(desc_embeddings)

        # Keyed by the other job's title (identical titles share one entry).
        for i, rec in enumerate(recommendations):
            rec['title_similarities'] = {
                recommendations[j]['title']: float(title_similarity[i][j])
                for j in range(len(recommendations))
                if i != j
            }
            rec['description_similarities'] = {
                recommendations[j]['title']: float(desc_similarity[i][j])
                for j in range(len(recommendations))
                if i != j
            }

    print(f"\nRecommendation generation completed in {time.time() - start_time:.2f} seconds")
    return recommendations

def _prompt_for_coordinates():
    """Ask the user for a latitude and a longitude; return them as ``(lat, lon)``."""
    lat = float(input("Enter latitude (e.g., 1.3521 for Singapore): "))
    lon = float(input("Enter longitude (e.g., 103.8198): "))
    return (lat, lon)


def get_personalized_recommendations(preload_preferences=False):
    """Collect user preferences, run the recommender and print the results.

    Args:
        preload_preferences: When True, use the built-in demo preferences and
            skip every interactive prompt; otherwise ask the user via
            ``input()``.

    Returns:
        None. The recommendations are printed, not returned.
    """
    print("\nJob Recommendation System")
    print("------------------------")
    
    preferences = UserPreferences()
    
    if preload_preferences:
        # Preload default preferences
        preferences.location = (1.3521, 103.8198)  # Singapore coordinates
        preferences.location_name = "Singapore"
        preferences.max_distance_km = 10
        preferences.job_title = "Service Engineer"
        preferences.job_description = "Skilled in mechanical work, always hardworking, able to handle odd hours, able to do all jobs"
        preferences.weights = {
            'title_similarity': 0.3,
            'description_similarity': 0.2,
            'location_proximity': 0.2,
            'degree': 0.1,
            'pagerank': 0.1,
            'core_number': 0.1
        }
        print("Using preloaded preferences:")
        print(f"Location: {preferences.location_name}")
        print(f"Job Title: {preferences.job_title}")
        print(f"Job Description: {preferences.job_description}")
        print(f"Max Distance: {preferences.max_distance_km} km")
        print("Weights:", preferences.weights)
    else:
        use_location = input("Would you like to specify a location? (y/n): ").lower() == 'y'
        if use_location:
            # Keep the postal code as text: int() would crash on non-digits and
            # drop leading zeros, and the geocoder takes a string query.
            location_input = input("Enter Postal Code (e.g., 123456): ").strip()
            try:
                print("Geocoding location...")
                location = geolocator.geocode(location_input)
                if location:
                    preferences.location = (location.latitude, location.longitude)
                    preferences.location_name = location_input
                    print(f"Location found: {location.address}")
                else:
                    print("Location not found. Please enter coordinates manually.")
                    preferences.location = _prompt_for_coordinates()
            except Exception as e:
                # Geocoding failed; fall back to manual entry.
                print(f"Error geocoding location: {str(e)}")
                print("Please enter coordinates manually.")
                preferences.location = _prompt_for_coordinates()
            
            preferences.max_distance_km = float(input("Maximum distance in km (default 10): ") or 10)
        
        use_title = input("Would you like to specify a job title? (y/n): ").lower() == 'y'
        if use_title:
            preferences.job_title = input("Enter job title: ")
        
        use_desc = input("Would you like to specify a job description? (y/n): ").lower() == 'y'
        if use_desc:
            preferences.job_description = input("Enter job description: ")
            use_llm = input("Would you like to summarize your job description using our in-built AI matching tool? (y/n): ").lower() == 'y'
            if use_llm:
                print("Processing job description with JD_Matching_Tool...")
                summarized_desc = process_job_description_with_LLM(preferences.job_description)
                # Keep the original text if the summariser returned nothing.
                if summarized_desc:
                    print("\nSummarized job description:\n")
                    print(summarized_desc)
                    preferences.job_description = summarized_desc
        
        # NOTE(review): get_graph_based_recommendations assigns its own weights to
        # preferences.weights, so the values gathered here do not currently
        # influence scoring — confirm the intended behaviour.
        print("\nSet the importance for each of the following factors on a scale from 0 (not important) to 1 (very important):")
        if use_title:
            preferences.weights['title_similarity'] = float(input("How important is it for the job title to match your preferences? (default 0.3): ") or 0.3)
        if use_desc:
            preferences.weights['description_similarity'] = float(input("How important is it for the job description to match your skills and experience? (default 0.2): ") or 0.2)
        if use_location:
            preferences.weights['location_proximity'] = float(input("How important is it for the job to be near your preferred location? (default 0.2): ") or 0.2)

        preferences.weights['degree'] = float(input("How important is it for the job to be popular or well-connected within the network? (default 0.1): ") or 0.1)
        preferences.weights['pagerank'] = float(input("How important is it for the job to be influential within the network? (default 0.1): ") or 0.1)
        preferences.weights['core_number'] = float(input("How important is it for the job to be well-connected within its area or community? (default 0.1): ") or 0.1)
    
    print("\nGenerating recommendations...")
    recommendations = get_graph_based_recommendations(
        graph=graph,
        node_embeddings=node_embeddings,
        preferences=preferences
    )        
    
    print("\nTop Recommendations:")
    print("-------------------")
    for i, rec in enumerate(recommendations, 1):
        print(f"\n{i}. Company: {rec['company']}")
        print(f"   Job Title: {rec['title']}")
        print(f"   Job Type: {', '.join(rec['job_type'])}")
        if preferences.location:
            # The stored location may be a "(lat, lon)" string or already a tuple.
            job_loc = ast.literal_eval(rec['location']) if isinstance(rec['location'], str) else rec['location']
            distance = geodesic(preferences.location, job_loc).kilometers
            print(f"   Distance: {distance:.1f} km")
        print(f"   Remote: {rec['is_remote']}")
        print(f"   Job URL: {rec['job_url']}")
        print(f"   Direct URL: {rec['job_url_direct']}")
        print("   Scores:")
        for component, score in rec['score_components'].items():
            print(f"      - {component}: {score:.3f}")
        print(f"   Final Score: {rec['final_score']:.3f}")
        
        # Print similarities with other recommendations
        if 'title_similarities' in rec:
            print("   Title Similarities:")
            for other_title, sim_score in rec['title_similarities'].items():
                print(f"      - {other_title}: {sim_score:.3f}")
        if 'description_similarities' in rec:
            print("   Description Similarities:")
            for other_title, sim_score in rec['description_similarities'].items():
                print(f"      - {other_title}: {sim_score:.3f}")

In [18]:
# Run the pipeline with the built-in demo preferences (no interactive prompts).
get_personalized_recommendations(preload_preferences=True)


Job Recommendation System
------------------------
Using preloaded preferences:
Location: Singapore
Job Title: Service Engineer
Job Description: Skilled in mechanical work, always hardworking, able to handle odd hours, able to do all jobs
Max Distance: 10 km
Weights: {'title_similarity': 0.3, 'description_similarity': 0.2, 'location_proximity': 0.2, 'degree': 0.1, 'pagerank': 0.1, 'core_number': 0.1}

Generating recommendations...

Starting recommendation generation...
Preparing embeddings...
Embeddings shape: (25610, 384)

Initializing search indices...

Building/Loading FAISS index...
Loading cached FAISS index...
FAISS index loaded successfully!

Building/Loading Annoy index...
Loading cached Annoy index...
Annoy index loaded successfully!

Preparing normalized embeddings...
Loading cached normalized embeddings...
Normalized embeddings loaded successfully!

Computing/Loading graph metrics...
Loading cached graph metrics...
Graph metrics loaded successfully!

Generating candidate no

In [4]:
# Sanity check: embed a sample description and a sample title, then list the
# five closest rows of `df` for each by dot product with the stored embeddings.
input_desc = """
Responsibilities:
        - Analyze complex data sets
        - Build predictive models
        - Present insights to stakeholders
        - Develop data pipelines
        
        Qualifications:
        - Master's degree in Data Science, Statistics or related field
        - 2+ years experience in data science
        - Research background preferred
        - Published work is a plus
        
        Skills:
        Technical:
        - Statistical analysis
        - Deep learning frameworks
        - Natural language processing
        - Big data technologies
        - Python/R programming
        
        Non-technical:
        - Critical thinking
        - Research methodology
        - Business acumen
        - Communication skills
"""
input_embedding = get_job_description_embedding(
    input_desc, tokenizer, model, max_chunk_length=512, overlap=50, device=device
)
stored_desc_embeddings = np.array(df['job_description_embedding'].tolist())
similarities = np.dot(stored_desc_embeddings, input_embedding)

# Positions of the five highest scores, best first.
top_desc_positions = np.argsort(similarities)[::-1][:5]
closest_matches = df.iloc[top_desc_positions]
print(closest_matches[['title', 'description']].to_string())
print("\nCosine Similarity Scores:")
for rank, score in enumerate(similarities[top_desc_positions], start=1):
    print(f"Match {rank}: {score:.4f}")

input_title = "2. Data Scientist"
stored_title_embeddings = np.array(df['job_title_embedding'].tolist())
title_similarities = np.dot(
    stored_title_embeddings,
    get_job_title_embedding(input_title, tokenizer, model, device=device),
)
top_title_positions = np.argsort(title_similarities)[::-1][:5]
title_matches = df.iloc[top_title_positions]
print(f"\nTitle matches for '{input_title}':")
print(title_matches['title'].to_string())
print("\nTitle Similarity Scores:")
for rank, score in enumerate(title_similarities[top_title_positions], start=1):
    print(f"Match {rank}: {score:.4f}")


Generating job description embedding...
Processing chunk 1/1...
Averaging chunk embeddings...
                                                                    title                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [8]:
# Display the jobs DataFrame; the rendering below is truncated with an ellipsis
# row between the first and last rows.
df

Unnamed: 0,title,company,job_type,is_remote,description,address,cleaned_address,lat_long,model_response,id,job_url,job_url_direct,job_type_cleaned,job_type_encoded,job_title_embedding,job_description_embedding
0,Porter,PHOENIX OPCO PTE. LTD.,fulltime,False,Are you currently working in a service based e...,"Tras Street, #9-177 Union Building, 079025","Tras Street, Union Building, 079025","(1.27444651846065, 103.843929515239)",Responsibilities\n Ensure guest experiences...,0005a77c5af02f32,https://sg.indeed.com/viewjob?jk=0005a77c5af02f32,https://www.mycareersfuture.gov.sg/job/custome...,fulltime,"[0, 1, 0, 0, 0]","[-0.032369263, 0.037560612, -0.08516074, -0.04...","[0.036492854, 0.022954665, 0.038240157, 0.0592..."
1,Outlet Executive - Tan Tock Seng Hospital,Kopitiam Investment Pte Ltd,fulltime,False,Outlet Executive - Tan Tock Seng Hospital\nRes...,"1 Joo Koon Cir, #13-01 FairPrice Joo Koon, Sin...","1 Joo Koon Cir, FairPrice Joo Koon, Singapore...","(1.32476879097421, 103.674484690433)",Responsibilities\n Operations\n Support Out...,000989af12dd337f,https://sg.indeed.com/viewjob?jk=000989af12dd337f,https://kopitiam.recruiterpal.com/career/jobs/...,fulltime,"[0, 1, 0, 0, 0]","[-0.059394874, 0.07162735, -0.06818533, 0.0556...","[-0.007846934, 0.04081503, -0.030761063, -0.03..."
2,Sales Promoter,Oomph Pte. Ltd.,fulltime,False,"SALARY UP TO $4,000.00 (subject to experience)...","2 Alexandra Rd, #04-01 Delta House, Singapore ...","2 Alexandra Rd, Delta House, Singapore 159919","(1.27425442821763, 103.803711567804)",Aspect 1 Responsibilities\n Actively promote l...,000a01db7a6ccc16,https://sg.indeed.com/viewjob?jk=000a01db7a6ccc16,https://www.mycareersfuture.gov.sg/job/custome...,fulltime,"[0, 1, 0, 0, 0]","[-0.050802007, -0.00822947, -0.035635166, -0.0...","[-0.0066097165, 0.06563731, -0.025015522, -0.0..."
3,Quantity Surveyor,LBD ENGINEERING PTE. LTD.,fulltime,False,Job Description\n\n\n* Prepare and analyse cos...,"58A Sungei Kadut Loop, LBD Construction Group ...","58A Sungei Kadut Loop, LBD Construction Group ...","(1.40981215298244, 103.742781634928)",Responsibilities\n\n Prepare and analyze cost ...,000bdc93e15d1325,https://sg.indeed.com/viewjob?jk=000bdc93e15d1325,https://www.mycareersfuture.gov.sg/job/buildin...,fulltime,"[0, 1, 0, 0, 0]","[-0.027023997, 0.025126753, 0.0062498134, -0.0...","[-0.0005235376, 0.031939197, 0.05250165, 0.031..."
4,Cleaning Operations Assistant Supervisor,ECOCLEAN MAINTENANCE PTE. LTD.,fulltime,False,**Requirements**\n\n* at least 3 years of work...,"1 Yishun Industrial Street 1, #06-27 A'Posh Bi...","1 Yishun Industrial Street 1, A'Posh BizHub, ...","(1.43732110123747, 103.842085763701)","Responsibilities\n Respond to emergency calls,...",000be27bd990645d,https://sg.indeed.com/viewjob?jk=000be27bd990645d,https://www.mycareersfuture.gov.sg/job/others/...,fulltime,"[0, 1, 0, 0, 0]","[-0.0720216, -0.00010167251, 0.05262413, -0.02...","[0.0046085776, 0.013509961, 0.048739396, -0.04..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25605,AV Engineer (Based in Sri Lanka),Spectrum Audio Visual Pte Ltd,fulltime,False,**Job Objective:**\n\n* Work closely with manu...,"41 Kallang Pudding Rd, #07-00 Golden Wheel Bui...","41 Kallang Pudding Rd, Golden Wheel Building,...","(1.32922242996711, 103.878996925493)","Responsibilities\n\n System configuration, Tes...",fff44a23abbb24b8,https://sg.indeed.com/viewjob?jk=fff44a23abbb24b8,https://www.mycareersfuture.gov.sg/job/enginee...,fulltime,"[0, 1, 0, 0, 0]","[0.017765759, 0.0026095482, 0.021792056, -0.02...","[0.046365045, 0.021310633, 0.044983394, -0.029..."
25606,"Batch Operator, DC Operations",ITCAN Pte Ltd,"fulltime, contract",False,* DC operation\n* Provide Batch Scheduling / c...,"30 Cecil St, #18-08, Singapore 049712","30 Cecil St, , Singapore 049712","(1.27967894092681, 103.84859453785)",Responsibilities\n DC operation\n Provid...,fff7a157262619d5,https://sg.indeed.com/viewjob?jk=fff7a157262619d5,http://sg.indeed.com/job/batch-operator-dc-ope...,"fulltime,contract","[1, 1, 0, 0, 0]","[-0.056008913, 0.03407558, -0.03200176, -0.055...","[-0.077440314, 0.04420748, -0.020601498, -0.02..."
25607,Business Development Consultant (Recruitment),RECRUITPEDIA PTE. LTD.,fulltime,False,**We are a centrally located boutique recruitm...,"146 Robinson Rd, Singapore 068909","146 Robinson Rd, Singapore 068909","(1.27954844710103, 103.848970997768)","Job Highlights\n Startup environment, welcomin...",fff8178deb218454,https://sg.indeed.com/viewjob?jk=fff8178deb218454,https://www.mycareersfuture.gov.sg/job/admin/b...,fulltime,"[0, 1, 0, 0, 0]","[0.010034194, -0.013607246, -0.05233627, 0.050...","[0.00701115, 0.04573657, 0.02808972, 0.0588057..."
25608,TRUCK DRIVER,FE CONSULTANCY,fulltime,False,* Planning routes and travel schedules to ensu...,"21 Bukit Batok Cres, #07-84 Wcega Tower, Singa...","21 Bukit Batok Cres, Wcega Tower, Singapore 6...","(1.33696571508317, 103.759665479168)",Responsibilities\n Planning routes and travel ...,fffa91c5d0e3d752,https://sg.indeed.com/viewjob?jk=fffa91c5d0e3d752,https://www.mycareersfuture.gov.sg/job/general...,fulltime,"[0, 1, 0, 0, 0]","[-0.023009812, 0.056057077, -0.010153614, 0.00...","[0.026069092, -0.024150336, 0.0052029975, 0.07..."
