In [1]:
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from transformers import pipeline
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
import time
import pandas as pd


### Define the core functions used to compute the overall job match score:
###### 1. calculate_cosine_similarity: uses BERT embeddings to measure how semantically similar a job description is to a user's profile.
###### 2. calculate_proximity_score: calculates the geographic proximity between the user's location and the job location, returning a score between 0 and 1.
###### 3. calculate_job_similarity_score: combines all factors (textual similarity, proximity, normalized salary, and job type) into a final weighted match score.


In [2]:
# Loaded (tokenizer, model) pairs keyed by checkpoint name. Scoring many texts in
# a row would otherwise re-read the BERT weights from disk on every single call.
_BERT_CACHE = {}


def _load_bert(model_name):
    """Return the (tokenizer, model) pair for `model_name`, loading it on first use only."""
    if model_name not in _BERT_CACHE:
        _BERT_CACHE[model_name] = (
            BertTokenizer.from_pretrained(model_name),
            BertModel.from_pretrained(model_name),
        )
    return _BERT_CACHE[model_name]


def _embed_text(text, tokenizer, model):
    """Tokenize `text` (truncated to 512 tokens) and mean-pool the last hidden state over dim 1."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        output = model(**inputs)
    return output.last_hidden_state.mean(dim=1).squeeze()


def calculate_cosine_similarity(about_text, description_text, model_name='bert-base-uncased'):
    """Measure how semantically close two texts are using mean-pooled BERT embeddings.

    Each text is tokenized independently (inputs longer than 512 tokens are
    truncated), passed through the model, and its last hidden state is averaged
    over dim 1 to obtain one vector per text.

    Args:
        about_text: First text to compare.
        description_text: Second text to compare.
        model_name: Checkpoint name handed to `from_pretrained`. The tokenizer
            and model are loaded once per name and reused on later calls.

    Returns:
        `1 - cosine_distance` between the two embedding vectors.
    """
    tokenizer, model = _load_bert(model_name)

    embedding_about = _embed_text(about_text, tokenizer, model)
    embedding_description = _embed_text(description_text, tokenizer, model)

    # scipy's `cosine` is a distance, so subtract it from 1 to get a similarity.
    similarity = 1 - cosine(embedding_about.numpy(), embedding_description.numpy())
    return similarity



# Geocoding results keyed by the exact query string, so a location that shows up
# repeatedly (e.g. the same profile location for every job) is looked up once.
_GEOCODE_CACHE = {}

# Pause before every real Nominatim request; the public service's usage policy
# allows at most one request per second.
_GEOCODE_DELAY_SECONDS = 1


def _geocode_cached(geolocator, location):
    """Geocode `location`, reusing the stored result when this string was seen before."""
    if location not in _GEOCODE_CACHE:
        time.sleep(_GEOCODE_DELAY_SECONDS)
        # A "no match" result (None) is stored too, so it is not re-requested.
        _GEOCODE_CACHE[location] = geolocator.geocode(location)
    return _GEOCODE_CACHE[location]


def calculate_proximity_score(job_location, profile_location, max_distance=1000):
    """Score how close a job is to the candidate, from 0 (far or unknown) to 1 (same place).

    Both locations are geocoded with Nominatim and the geodesic distance between
    them is mapped linearly onto [0, 1]: a distance of 0 km scores 1 and anything
    at or beyond `max_distance` scores 0.

    Args:
        job_location: Free-text location of the job.
        profile_location: Free-text location of the candidate.
        max_distance: Distance in kilometers at which the score reaches 0.

    Returns:
        The proximity score, or 0 when either location could not be geocoded
        (a message is printed in that case).

    Raises:
        ValueError: If `max_distance` is not positive.
    """
    if max_distance <= 0:
        raise ValueError("max_distance must be a positive number of kilometers")

    geolocator = Nominatim(user_agent="geo_distance_calculator", timeout=10)

    job_coords = _geocode_cached(geolocator, job_location)
    profile_coords = _geocode_cached(geolocator, profile_location)

    if job_coords is None or profile_coords is None:
        print(f"Could not find coordinates for one of the locations: {job_location}, {profile_location}")
        return 0

    distance = geodesic((profile_coords.latitude, profile_coords.longitude),
                        (job_coords.latitude, job_coords.longitude)).kilometers
    # Linear falloff, clamped so the score stays between 0 and 1.
    proximity_score = max(0, (max_distance - distance) / max_distance)
    return proximity_score


def calculate_job_similarity_score(job_similarity_score, normalized_proximity_score, normalized_salary, job_type):
    """Blend the individual match signals into one weighted job score.

    Weights: text similarity 0.6, proximity 0.2, job type 0.1, salary 0.1.
    The job-type component is 1.0 for "full-time" (compared case-insensitively)
    and 0.5 for every other value.

    Args:
        job_similarity_score: Text similarity between the job and the profile.
        normalized_proximity_score: Proximity score for the job.
        normalized_salary: Salary score for the job.
        job_type: Job type label as a string.

    Returns:
        The weighted sum of the four components.
    """
    job_type_score = 1.0 if job_type.lower() == "full-time" else 0.5

    text_part = job_similarity_score * 0.6
    proximity_part = normalized_proximity_score * 0.2
    type_part = job_type_score * 0.1
    salary_part = normalized_salary * 0.1

    # Summed left to right in this order so the float result stays unchanged.
    return text_part + proximity_part + type_part + salary_part


In [3]:
# Sample profile

# Column-oriented data for a one-row DataFrame: each key maps to a single-element list.
profile_data = {
    'name':        ['John Doe'],
    # The triple-quoted text keeps its leading newline and indentation as part of the string.
    'description': ["""
                    Experienced data scientist with a strong background in machine learning, deep learning, and data analysis.
                    Skilled in Python, R, and SQL, with experience in leading end-to-end data science projects.
                    Looking for opportunities in AI and data-driven solutions."""],
    'location':    ['New York, NY']
}

# Single-row frame holding the sample candidate.
profile_df = pd.DataFrame(profile_data)


# Sample job listings
# Each record carries the same five keys: job_title, job_description,
# job_location (free text), salary (a number), and job_type ("Full-Time" or "Part-Time").
jobs = [
    {"job_title": "Data Scientist", "job_description": "Looking for a data scientist with expertise in machine learning and Python.", "job_location": "New York, NY", "salary": 120000, "job_type": "Full-Time"},
    {"job_title": "Machine Learning Engineer", "job_description": "We need an ML engineer to build and deploy deep learning models.", "job_location": "San Francisco, CA", "salary": 130000, "job_type": "Full-Time"},
    {"job_title": "Data Analyst", "job_description": "Data analyst proficient in SQL and data visualization tools.", "job_location": "Chicago, IL", "salary": 90000, "job_type": "Part-Time"},
    {"job_title": "AI Researcher", "job_description": "Researcher in AI and deep learning applications for healthcare.", "job_location": "Boston, MA", "salary": 140000, "job_type": "Full-Time"},
    {"job_title": "Data Engineer", "job_description": "Expertise in building data pipelines and cloud-based solutions.", "job_location": "Austin, TX", "salary": 115000, "job_type": "Full-Time"},
    {"job_title": "Software Engineer", "job_description": "Developing scalable software solutions with Python and C++.", "job_location": "Seattle, WA", "salary": 110000, "job_type": "Full-Time"},
    {"job_title": "AI Consultant", "job_description": "Consulting firms on deploying AI solutions.", "job_location": "London, UK", "salary": 125000, "job_type": "Full-Time"},
    {"job_title": "Data Visualization Specialist", "job_description": "Specialist in creating visual dashboards and analytics.", "job_location": "Tel Aviv, Israel", "salary": 95000, "job_type": "Part-Time"},
    {"job_title": "Quantitative Analyst", "job_description": "Analyzing financial data and building predictive models.", "job_location": "Hong Kong", "salary": 150000, "job_type": "Full-Time"},
    {"job_title": "Deep Learning Engineer", "job_description": "Focus on training neural networks for NLP tasks.", "job_location": "Paris, France", "salary": 135000, "job_type": "Full-Time"},
    {"job_title": "Data Science Intern", "job_description": "Internship for students interested in data science.", "job_location": "Berlin, Germany", "salary": 30000, "job_type": "Part-Time"},
    {"job_title": "AI Product Manager", "job_description": "Managing AI-based products for consumer applications.", "job_location": "San Jose, CA", "salary": 145000, "job_type": "Full-Time"},
    {"job_title": "Data Architect", "job_description": "Designing and managing complex data architectures.", "job_location": "Toronto, Canada", "salary": 125000, "job_type": "Full-Time"},
    {"job_title": "Business Analyst", "job_description": "Analyzing business processes and recommending data-driven solutions.", "job_location": "Dubai, UAE", "salary": 100000, "job_type": "Full-Time"},
    {"job_title": "AI Developer", "job_description": "Developer with experience in AI tools and platforms.", "job_location": "Mumbai, India", "salary": 85000, "job_type": "Full-Time"},
    {"job_title": "Big Data Engineer", "job_description": "Processing and managing large-scale datasets.", "job_location": "Sydney, Australia", "salary": 120000, "job_type": "Full-Time"},
    {"job_title": "Cloud AI Engineer", "job_description": "Deploying AI solutions in cloud environments.", "job_location": "Tel Aviv, Israel", "salary": 130000, "job_type": "Full-Time"},
    {"job_title": "NLP Engineer", "job_description": "Working on language models and text analysis.", "job_location": "Rome, Italy", "salary": 110000, "job_type": "Full-Time"},
    {"job_title": "Robotics Engineer", "job_description": "Building AI solutions for robotics applications.", "job_location": "Tokyo, Japan", "salary": 140000, "job_type": "Full-Time"},
    {"job_title": "Data Operations Manager", "job_description": "Overseeing operations for data teams.", "job_location": "Bangkok, Thailand", "salary": 105000, "job_type": "Full-Time"},
]


In [4]:
import pandas as pd

# Rebuild the frame from the raw `jobs` records so re-running this cell starts clean.
jobs_df = pd.DataFrame(jobs)

# The single candidate every job is scored against; read once instead of per row.
profile_row = profile_df.iloc[0]
profile_location = profile_row['location']
profile_description = profile_row['description']

# Salary relative to the best-paying listing. If the maximum is not positive the
# ratio is undefined, so fall back to 0 rather than filling the column with NaN/inf.
max_salary = jobs_df['salary'].max()
jobs_df['normalized_salary'] = (jobs_df['salary'] / max_salary) if max_salary > 0 else 0.0

# Proximity of each job to the candidate.
jobs_df['proximity_score'] = jobs_df['job_location'].apply(
    lambda job_location: calculate_proximity_score(job_location, profile_location)
)

# Rescale so the closest job scores 1. When every job scores 0 (all out of range,
# or geocoding failed) the division would be 0/0 = NaN and would turn every final
# score into NaN, so keep the column at 0 in that case.
max_proximity_score = jobs_df['proximity_score'].max()
jobs_df['normalized_proximity_score'] = (
    (jobs_df['proximity_score'] / max_proximity_score) if max_proximity_score > 0 else 0.0
)

# Title and description are embedded together as one text.
jobs_df['full_job_text'] = jobs_df['job_title'] + ' ' + jobs_df['job_description']

jobs_df['cosine_similarity_score'] = jobs_df['full_job_text'].apply(
    lambda job_text: calculate_cosine_similarity(job_text, profile_description)
)

# Combine the four signals into the final weighted score, one row at a time.
jobs_df['job_similarity_score'] = jobs_df.apply(
    lambda row: calculate_job_similarity_score(
        row['cosine_similarity_score'],
        row['normalized_proximity_score'],
        row['normalized_salary'],
        row['job_type'],
    ),
    axis=1,
)

# Best matches first.
jobs_df = jobs_df.sort_values(by='job_similarity_score', ascending=False)

jobs_df.head(10)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Unnamed: 0,job_title,job_description,job_location,salary,job_type,normalized_salary,proximity_score,normalized_proximity_score,full_job_text,cosine_similarity_score,job_similarity_score
0,Data Scientist,Looking for a data scientist with expertise in...,"New York, NY",120000,Full-Time,0.8,1.0,1.0,Data Scientist Looking for a data scientist wi...,0.857836,0.894701
3,AI Researcher,Researcher in AI and deep learning application...,"Boston, MA",140000,Full-Time,0.933333,0.693913,0.693913,AI Researcher Researcher in AI and deep learni...,0.825797,0.827594
12,Data Architect,Designing and managing complex data architectu...,"Toronto, Canada",125000,Full-Time,0.833333,0.448753,0.448753,Data Architect Designing and managing complex ...,0.835844,0.77459
11,AI Product Manager,Managing AI-based products for consumer applic...,"San Jose, CA",145000,Full-Time,0.966667,0.0,0.0,AI Product Manager Managing AI-based products ...,0.811929,0.683824
4,Data Engineer,Expertise in building data pipelines and cloud...,"Austin, TX",115000,Full-Time,0.766667,0.0,0.0,Data Engineer Expertise in building data pipel...,0.840487,0.680959
16,Cloud AI Engineer,Deploying AI solutions in cloud environments.,"Tel Aviv, Israel",130000,Full-Time,0.866667,0.0,0.0,Cloud AI Engineer Deploying AI solutions in cl...,0.817911,0.677413
15,Big Data Engineer,Processing and managing large-scale datasets.,"Sydney, Australia",120000,Full-Time,0.8,0.0,0.0,Big Data Engineer Processing and managing larg...,0.827377,0.676426
1,Machine Learning Engineer,We need an ML engineer to build and deploy dee...,"San Francisco, CA",130000,Full-Time,0.866667,0.0,0.0,Machine Learning Engineer We need an ML engine...,0.805243,0.669812
9,Deep Learning Engineer,Focus on training neural networks for NLP tasks.,"Paris, France",135000,Full-Time,0.9,0.0,0.0,Deep Learning Engineer Focus on training neura...,0.799497,0.669698
8,Quantitative Analyst,Analyzing financial data and building predicti...,Hong Kong,150000,Full-Time,1.0,0.0,0.0,Quantitative Analyst Analyzing financial data ...,0.775121,0.665072
