In [12]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath('..'))

# Data Preprocessing Notebook
 
This notebook focuses on preparing raw job description data for analysis. It includes steps for cleaning, transforming, and structuring the data to make it suitable for machine learning models or other analytical techniques.


# 1. Datasets

In [None]:
import pandas as pd
from data_tools import DataFrameSummarizer

## 1.1 [Kaggle - Data Analyst Job Postings Google Search](https://www.kaggle.com/datasets/lukebarousse/data-analyst-job-postings-google-search)
*gsearch_jobs.csv*

In [None]:
gsearch_df = pd.read_csv('../kaggle_datasets/gsearch_jobs.csv', nrows=100)
gsearch_df.head(3)

Unnamed: 0.1,Unnamed: 0,index,title,company_name,location,via,description,extensions,job_id,thumbnail,...,commute_time,salary_pay,salary_rate,salary_avg,salary_min,salary_max,salary_hourly,salary_yearly,salary_standardized,description_tokens
0,0,0,Data Analyst (Remote),KGS Technology Group,Anywhere,via Built In,Job Description\n\nFull-Time...\n\nWe are look...,"['22 hours ago', 'Work from home', 'Full-time']",eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QgKFJlbW90ZS...,https://encrypted-tbn0.gstatic.com/images?q=tb...,...,,,,,,,,,,"['tableau', 'javascript', 'python', 'power_bi'..."
1,1,1,Data Analyst with BA - Full Time,Talent Group,Anywhere,via LinkedIn,Qualifications :\n• 5+ Work experience as a da...,"['4 hours ago', 'Work from home', 'Full-time']",eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3Qgd2l0aCBCQS...,https://encrypted-tbn0.gstatic.com/images?q=tb...,...,,,,,,,,,,['sql']
2,2,2,Data Analyst,ClarisHealth,Anywhere,via LinkedIn,You may be ideal for this position if...\n• Yo...,"['6 hours ago', 'Work from home', 'Full-time',...",eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,https://encrypted-tbn0.gstatic.com/images?q=tb...,...,,,,,,,,,,"['postgres', 'mysql', 'postgresql', 'mongo', '..."


In [15]:
print(DataFrameSummarizer(gsearch_df[['title', 'description', 'extensions', 'description_tokens']]).get_summary())

DataFrame Overview
Shape: (100, 4)
Size: 400
Number of Columns: 4
Memory Usage: 0.75 MB

Columns:
- title
- description
- extensions
- description_tokens

Column Details

Column: title
  - Data Type: object
  - Number of Missing Values: 0
  - Percentage of Missing Values: 0.00%
  - Number of Unique Values: 55

Column: description
  - Data Type: object
  - Number of Missing Values: 0
  - Percentage of Missing Values: 0.00%
  - Number of Unique Values: 78

Column: extensions
  - Data Type: object
  - Number of Missing Values: 0
  - Percentage of Missing Values: 0.00%
  - Number of Unique Values: 69

Column: description_tokens
  - Data Type: object
  - Number of Missing Values: 0
  - Percentage of Missing Values: 0.00%
  - Number of Unique Values: 61



In [16]:
# This data comes with pre-mined skills for some fraction of the datapoints
for dt in gsearch_df[['description_tokens']][:5].iterrows():
    print(dt[-1][-1])

['tableau', 'javascript', 'python', 'power_bi', 'spss', 'sql', 'r', 'excel', 'sas']
['sql']
['postgres', 'mysql', 'postgresql', 'mongo', 'c', 't-sql', 'sql']
['aws', 'sql', 'sas']
[]


  print(dt[-1][-1])


In [17]:
# The job descriptions seem to be raw job descriptions. No pre-processing.
print(gsearch_df.sample().description.values[0])

Data analyst responsibilities include conducting full lifecycle analysis to include requirements, activities and design. As Data analysts will develop analysis and reporting capabilities.


## 1.2 [Kaggle - LinkedIn Job Postings](https://www.kaggle.com/datasets/arshkon/linkedin-job-postings)
*postings.csv*

In [18]:
linkedin_df = pd.read_csv('../kaggle_datasets/postings.csv', nrows=100)
linkedin_df.head(3)

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,Requirements: \n\nWe are seeking a College or ...,1713398000000.0,,0,FULL_TIME,USD,BASE_SALARY,38480.0,8540.0,34021.0
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,1712858000000.0,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,We are currently accepting resumes for FOH - A...,1713278000000.0,,0,FULL_TIME,USD,BASE_SALARY,55000.0,45202.0,39061.0


In [19]:
print(DataFrameSummarizer(linkedin_df[['title', 'description', 'skills_desc']]).get_summary())

DataFrame Overview
Shape: (100, 3)
Size: 300
Number of Columns: 3
Memory Usage: 0.47 MB

Columns:
- title
- description
- skills_desc

Column Details

Column: title
  - Data Type: object
  - Number of Missing Values: 0
  - Percentage of Missing Values: 0.00%
  - Number of Unique Values: 96

Column: description
  - Data Type: object
  - Number of Missing Values: 0
  - Percentage of Missing Values: 0.00%
  - Number of Unique Values: 100

Column: skills_desc
  - Data Type: object
  - Number of Missing Values: 90
  - Percentage of Missing Values: 90.00%
  - Number of Unique Values: 10
  - Unique Values: ['Requirements: \n\nWe are seeking a College or Graduate Student (can also be completed with school) with a focus in Planning, Architecture, Real Estate Development or Management or General Business. Must be able to work in an extremely fast paced environment and able to multitask and prioritize.'
 nan
 'We are currently accepting resumes for FOH - Asisstant Restaurant Management with a str

In [20]:
# The job descriptions seem to be raw job descriptions. No pre-processing.
print(linkedin_df.sample().description.values[0])

Sentinel Limousine of East Providence RI is a family owned business that was established in 1987. We are seeking a Second Shift Customer Service Representative for our local office. Job Description: The Customer Service / Reservationist is responsible for booking, coordinating and securing ground transportation itineraries for our corporate and retail clients. The Reservationist will work side-by-side with management and the chauffeur staff to ensure all reservations are accurate, assigned and in accordance with the company’s policy. Essential Functions: Responds to ground transportation requests from retail and corporate clients, affiliates, travel agents, and referral networks, concerning reservations arriving by email, telephone, fax, or through a central on-line reservation system. Creates and maintains reservation records, prepares and send confirmation and promptly processes any cancellations and modifications. Knowledge, Skills and Abilities: Advanced customer service and commun

## 1.3 [Kaggle - Data Analyst Jobs](https://www.kaggle.com/datasets/andrewmvd/data-analyst-jobs)
*DataAnalyst.csv* 

In [21]:
data_analyst = pd.read_csv('../kaggle_datasets/DataAnalyst.csv', nrows=100)
data_analyst.head(3)

Unnamed: 0.1,Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Easy Apply
0,0,"Data Analyst, Center on Immigration and Justic...",$37K-$66K (Glassdoor est.),Are you eager to roll up your sleeves and harn...,3.2,Vera Institute of Justice\n3.2,"New York, NY","New York, NY",201 to 500 employees,1961,Nonprofit Organization,Social Assistance,Non-Profit,$100 to $500 million (USD),-1,True
1,1,Quality Data Analyst,$37K-$66K (Glassdoor est.),Overview\n\nProvides analytical and technical ...,3.8,Visiting Nurse Service of New York\n3.8,"New York, NY","New York, NY",10000+ employees,1893,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1,-1
2,2,"Senior Data Analyst, Insights & Analytics Team...",$37K-$66K (Glassdoor est.),We’re looking for a Senior Data Analyst who ha...,3.4,Squarespace\n3.4,"New York, NY","New York, NY",1001 to 5000 employees,2003,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,GoDaddy,-1


In [22]:
print(DataFrameSummarizer(data_analyst[['Job Title', 'Job Description']]).get_summary())

DataFrame Overview
Shape: (100, 2)
Size: 200
Number of Columns: 2
Memory Usage: 0.46 MB

Columns:
- Job Title
- Job Description

Column Details

Column: Job Title
  - Data Type: object
  - Number of Missing Values: 0
  - Percentage of Missing Values: 0.00%
  - Number of Unique Values: 64

Column: Job Description
  - Data Type: object
  - Number of Missing Values: 0
  - Percentage of Missing Values: 0.00%
  - Number of Unique Values: 100



## 1.4 Preliminary Modeling Strategy

Given that all datasets include both job titles and descriptions, we can leverage both for data point representation. Here's a preliminary strategy for how we might approach this:

*   **Job Titles:** Initially, I'm considering using FastText to generate embeddings for job titles.

*   **Job Descriptions:** My plan is to use Gemini to extract key information, particularly skills, from the job descriptions. This extracted information will then inform the creation of a representation for the full description.

A core idea I'm exploring is the concept of **skill-focused embeddings** – embeddings that are specifically trained or fine-tuned to emphasize skills. Several options exist:

*   **BERT:** BERT offers both word and sentence tokenization capabilities. Word embeddings could be computationally intensive, but with access to Colab, I'm exploring attention mechanisms to mitigate this. Sentence embeddings offer a lighter alternative, although I'm still investigating how to effectively guide or steer sentence embeddings towards skill-centric representations.

# 2. Data Preparation
Before mining.

## 2.1 Moving Data to MongoDB


In [23]:
def insert_data_into_mongo():
    import utils.mongo_utils
    reload(utils.mongo_utils)
    # Import job data into MongoDB
    import sys
    import os
    from pathlib import Path

    # Add the parent directory to the Python path
    parent_dir = Path.cwd().parent.absolute()
    sys.path.append(str(parent_dir))


    from utils.mongo_utils import MongoImporter
    import utils.mongo_utils
    reload(utils.mongo_utils)

    # Set up parameters
    db_name = "rl_jobsdb"
    collection_name = "all_jobs"
    db_path = "../mongo_db/"
    directory_path = "../kaggle_datasets/"

    # Create the importer with standard MongoDB URI
    mongo_uri = "mongodb://localhost:27017/"

    # Create the importer
    importer = MongoImporter(
        mongo_uri=mongo_uri,
        db_name=db_name,
        collection_name=collection_name,
        db_path=db_path
    )

    try:
        # Import all files
        results = importer.import_all_files(directory_path)
        print("Import completed. Results:")
        for file_name, count in results.items():
            print(f"{file_name}: {count} documents inserted")
    except Exception as e:
        print(f"Error importing data: {str(e)}")
    finally:
        # Close the connection
        importer.close()
        
# insert_data_into_mongo()
# Run only when needing to import

## 2.2 Backing Database Up

In [24]:
def backup_mongodb():
    """
    Script to backup MongoDB database to a specified directory.
    """

    import sys
    import os
    import logging
    from pathlib import Path
    from utils.mongo_utils import MongoImporter as importer


    from utils.mongo_utils import backup_mongodb
    try:
    # Set up logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        logger = logging.getLogger(__name__)

        """Main function to backup MongoDB database."""
        # Default values
        db_name = "rl_jobsdb"
        backup_path = "../mongo_db/"

        logger.info(f"Backing up database '{db_name}' to {backup_path}")
        mongodump_path = r"C:\Program Files\MongoDB\Server\CMDTools_100.12.0\bin\mongodump.exe"
        # Perform the backup
        success = backup_mongodb(db_name, backup_path, mongodump_path)

        if success:
            logger.info("Backup completed successfully")
    finally:
        # Close the connection
        pass
# Run only when needing to backup
backup_mongodb()

ModuleNotFoundError: No module named 'utils'

## 2.3 Mining Data Using Gemini

## Rate Limits
Reate Limits
| Model                                                      | RPM      | TPM       | RPD    | Context Size                                                                                                                                                                                                               |
| ---------------------------------------------------------- | -------- | --------- | ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Gemini 2.5 Pro Experimental                                | 5        | 1,000,000 | 25     | 1,048,576 tokens[7](https://ai.google.dev/gemini-api/docs/models)                                                                                                                                                          |
| Gemini 2.5 Pro Preview                                     | --       | --        | --     | 1,048,576 tokens[7](https://ai.google.dev/gemini-api/docs/models)                                                                                                                                                          |
| Gemini 2.0 Flash                                           | 15       | 1,000,000 | 1,500  | 1,000,000 tokens[7](https://ai.google.dev/gemini-api/docs/models)                                                                                                                                                          |
| Gemini 2.0 Flash Experimental (including image generation) | 10       | 1,000,000 | 1,500  | 1,000,000 tokens[7](https://ai.google.dev/gemini-api/docs/models)                                                                                                                                                          |
| Gemini 2.0 Flash-Lite                                      | 30 green | 1,000,000 | 1,500  | 1,000,000 tokens[7](https://ai.google.dev/gemini-api/docs/models)                                                                                                                                                          |
| Gemini 2.0 Flash Thinking Experimental 01-21               | 10       | 4,000,000 | 1,500  | Unknown                                                                                                                                                                                                                    |
| Gemini 1.5 Flash                                           | 15       | 1,000,000 | 1,500  | Unknown                                                                                                                                                                                                                    |
| Gemini 1.5 Flash-8B                                        | 15       | 1,000,000 | 1,500  | Unknown                                                                                                                                                                                                                    |
| Gemini 1.5 Pro                                             | 2        | 32,000    | 50     | Up to **128K tokens** (standard), **1M tokens** (enterprise)[3](https://blog.google/technology/ai/google-gemini-next-generation-model-february-2024/)[5](https://blog.google/technology/ai/long-context-window-ai-models/) |
| Imagen 3                                                   | --       | --        | --     | Unknown                                                                                                                                                                                                                    |
| Gemma 3                                                    | 30       | 15,000    | 14,400 | Unknown                                                                                                                                                                                                                    |
| Gemini Embedding Experimental                              | --       | --        | --     | Unknown                                                                                                                                                                                                                    |

In [18]:
import pymongo
import numpy as np

client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["rl_jobsdb"]
collection = db["all_jobs"]

# Get total number of documents in the collection
total_documents = collection.count_documents({})
print(f"Total documents in the collection: {total_documents}")

# Get a list of all fields in the collection
fields = []
for document in collection.find({}, limit=10):  # Limit to 10 documents for faster processing
    for key in document.keys():
        if key not in fields:
            fields.append(key)

print(f"\nFields in the collection: {fields}")

# Get data types of each field
field_data_types = {}
for field in fields:
    # Use aggregation pipeline to get distinct data types for each field
    data_types = collection.aggregate([
        {"$group": {
            "_id": {"$type": f"${field}"}
        }},
        {"$project": {
            "_id": 0,
            "type": "$_id"
        }}
    ])
    
    # Extract the data types from the aggregation result
    field_data_types[field] = [doc["type"] for doc in data_types]

print("\nData types of each field:")
for field, types in field_data_types.items():
    print(f"- {field}: {types}")

# Calculate average length of the "description" field
description_lengths = []
for document in collection.find({}, {"description": 1}):
    try:
        if "description" in document:
            description_lengths.append(len(document["description"]))
    except TypeError as e:
        print(f"Error processing document with id: {document.get('_id', 'Unknown ID')}")
        print(f"Description: {document.get('description', 'No description')}")

if description_lengths:
    average_description_length = np.mean(description_lengths)
    average_description_std = np.std(description_lengths)
    print(f"\nAverage length of the 'description' field: {average_description_length:.2f}")
    print(f"Standard deviation of the 'description' field: {average_description_std:.2f}")
else:
    print("\nNo 'description' fields found in the documents.")

client.close()

Total documents in the collection: 187394

Fields in the collection: ['_id', 'doc_id', 'source_file', 'original_index', 'job_title', 'description', 'metadata']

Data types of each field:
- _id: ['objectId']
- doc_id: ['string']
- source_file: ['string']
- original_index: ['int', 'long']
- job_title: ['string']
- description: ['double', 'string']
- metadata: ['object']
Error processing document with id: 67f2f280e93950df15184bd4
Description: nan
Error processing document with id: 67f2f281e93950df15185248
Description: nan
Error processing document with id: 67f2f284e93950df1518d514
Description: nan
Error processing document with id: 67f2f286e93950df15190ccf
Description: nan
Error processing document with id: 67f2f286e93950df1519102f
Description: nan
Error processing document with id: 67f2f289e93950df1519761e
Description: nan
Error processing document with id: 67f2f289e93950df15197728
Description: nan

Average length of the 'description' field: 3627.95
Standard deviation of the 'description

In [30]:
print(f"""
      Shooting for 500,000 tokens an average of 4 characters per token, yields {500000*4} characters.
      To be on the safe side, {2000000/(3627.95+2*2242.50):2f} would be the ideal numebr of jobs to send per request.
      For a total of {187394/245:2f} total requests.
      At 245 jobs per request, at 30 requests per minute, it would take about {(187394/245)*(1/30):.2f} minutes.
      """)


      Shooting for 500,000 tokens an average of 4 characters per token, yields 2000000 characters.
      To be on the safe side, 246.519453 would be the ideal numebr of jobs to send per request.
      For a total of 764.873469 total requests.
      At 245 jobs per request, at 30 requests per minute, it would take about 25.50 minutes.
      


In [1]:
# Testing a single request
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath('..'))
from utils import SkillExtractor
from pymongo import MongoClient
import os



# MongoDB setup
mongo_uri = os.getenv("MONGO_URI")
client = MongoClient(mongo_uri)
db = client.get_database("rl_jobsdb")
collection = db.get_collection("all_jobs")

# Fetch a single job document
two_jobs = list(collection.find({}, {"description": 1, "_id": 1}).limit(2))

# Convert _id to string for each job in two_jobs
for job in two_jobs:
    if "_id" in job:
        job["_id"] = str(job["_id"])

if two_jobs:
    # Call SkillExtractor with the single job
    print(SkillExtractor().extract_job_data(jobs=two_jobs))
else:
    print("No job found in the collection.")

client.close()

2025-04-06 19:14:12,892 - google_genai.models - INFO - AFC is enabled with max remote calls: 10.
2025-04-06 19:14:17,154 - httpx - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-lite:generateContent "HTTP/1.1 200 OK"
2025-04-06 19:14:17,159 - google_genai.models - INFO - AFC remote call 1 is done.
candidates=[Candidate(content=Content(parts=[Part(video_metadata=None, thought=None, code_execution_result=None, executable_code=None, file_data=None, function_call=None, function_response=None, inline_data=None, text='```json\n{\n    "67f2f276e93950df1517040f": {\n        "technical_skills": [\n            "SQL",\n            "R",\n            "Python",\n            "AWS",\n            "Caspio",\n            "database management",\n            "codebooks",\n            "interactive dashboards",\n            "data quality issues",\n            "Git/GitHub"\n        ],\n        "soft_skills": [\n            "data management skills",\n        

In [7]:
import os
from pymongo import MongoClient

# MongoDB setup
mongo_uri = os.getenv("MONGO_URI")
client = MongoClient(mongo_uri)
db = client.get_database("rl_jobsdb")
collection = db.get_collection("all_jobs")

# Fetch documents with empty technical_skills
empty_skills_docs = list(collection.find({"technical_skills": []}))

# Fetch documents with non-empty technical_skills
skills_exists = list(collection.find({"technical_skills": {"$exists": True}}))
non_empty_skills_docs = list(collection.find({"technical_skills": {"$exists": True, "$not": {"$size": 0}}}))


print(f"Documents with empty technical_skills {len(empty_skills_docs)}")
print(f"Documents with non-empty technical_skills {len(non_empty_skills_docs)}")
print(f"Documents with skills {len(skills_exists)}")

client.close()


Documents with empty technical_skills 967
Documents with non-empty technical_skills 11767
Documents with skills 12734


In [6]:
import os
from pymongo import MongoClient

# MongoDB setup
mongo_uri = os.getenv("MONGO_URI")
client = MongoClient(mongo_uri)
db = client.get_database("rl_jobsdb")
collection = db.get_collection("all_jobs")
# Fetch documents with non-empty technical_skills
no_skills = list(collection.find({"technical_skills": {"$exists": False}}))


print(f"Documents with empty technical_skills {len(no_skills)}")

client.close()


Documents with empty technical_skills 180373


# Vector Representations

## Extracting Skills for Training

Take note of the current structure of the database.

In [1]:
# Testing a single request
import sys
import os
from pymongo import MongoClient
# MongoDB setup
mongo_uri = os.getenv("MONGO_URI")
client = MongoClient(mongo_uri)
db = client.get_database("rl_jobsdb")
collection = db.get_collection("all_jobs")


# MongoDB setup
mongo_uri = os.getenv("MONGO_URI")
client = MongoClient(mongo_uri)
db = client.get_database("rl_jobsdb")
collection = db.get_collection("all_jobs")

# Fetch a single job document
single_job = list(collection.find({}).limit(1))[0]


client.close()

In [2]:
single_job.keys()

dict_keys(['_id', 'doc_id', 'source_file', 'original_index', 'job_title', 'description', 'metadata', 'experience_requirements', 'soft_skills', 'technical_skills'])

 Getting all technical skills from database

In [3]:
# Get all technical skills from database
import sys
import os
from pymongo import MongoClient

# MongoDB setup
mongo_uri = os.getenv("MONGO_URI")
client = MongoClient(mongo_uri)
db = client.get_database("rl_jobsdb")
collection = db.get_collection("all_jobs")

# Query for all documents that have technical_skills field
all_jobs = collection.find({"technical_skills": {"$exists": True}})

# Create set of all unique technical skills
all_technical_skills = set()
for job in all_jobs:
    if job.get('technical_skills'):
        all_technical_skills.update(job['technical_skills'])

print(f"Total unique technical skills found: {len(all_technical_skills)}")

client.close()

Total unique technical skills found: 18035


## Moving Data to Cloud

### Create DB for this

In [None]:
from pymongo import MongoClient

mongo_password = "mongodb_cbradna0920"
user = "cbradna"
db_name = "rl_jobsdb"
collection_name = "test_jobs"

mongo_uri = f"mongodb+srv://{user}:{mongo_password}@cluster0.zqzq6hs.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"

client = MongoClient(mongo_uri)
db = client[db_name]  # Access or create the database
test_collection = db["test_collection"]

# Insert a test document
test_job = {
    "job_title": "Test Engineer",
    "description": "Testing stuff",
    "technical_skills": ["Python", "Selenium"]
}

try:
    result = test_collection.insert_one(test_job)
    print(f"Inserted document with id: {result.inserted_id}")
except Exception as e:
    print(f"An error occurred: {e}")

# List collections to confirm creation
print("Collections in database:")
print(db.list_collection_names())

client.close()


Inserted document with id: 680503347c1e6882be225c18
Collections in database:
['test_collection']


### Move Local Data to Remote

In [3]:
from preprocessing_utils.mongo_utils import move_jobs_with_skills
from pymongo import MongoClient
import certifi

# Remote hosted MongoDB credentials
mongo_password = "mongodb_cbradna0920"
user = "cbradna"
db_name = "rl_jobsdb"
source_collection = "all_jobs"
target_collection = "jobs_text"

# Construct the MongoDB connection string for target
target_mongo_uri = f"mongodb+srv://{user}:{mongo_password}@cluster0.zqzq6hs.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"

# Move from local to remote database
moved_count = move_jobs_with_skills(
    batch_size=100,
    source_mongo_uri="mongodb://localhost:27017/",
    target_mongo_uri=target_mongo_uri,
    db_name=db_name,
    source_collection_name=source_collection,
    target_collection_name=target_collection,
    use_ssl=True
)
print(f"Moved {moved_count} jobs")

2025-04-20 10:42:04,521 - preprocessing_utils.mongo_utils - INFO - Using separate MongoDB connections for source and target
2025-04-20 10:42:04,888 - preprocessing_utils.mongo_utils - INFO - Connected to target MongoDB: mongodb+srv://cbradna:mongodb_cbradna0920@cluster0.zqzq6hs.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0, database: rl_jobsdb, collection: jobs_text
2025-04-20 10:42:04,891 - preprocessing_utils.mongo_utils - INFO - Initialized JobIterator with batch size 100
2025-04-20 10:42:04,891 - preprocessing_utils.mongo_utils - INFO - Connected to source MongoDB: mongodb://localhost:27017/, database: rl_jobsdb, collection: all_jobs
2025-04-20 10:42:04,892 - preprocessing_utils.mongo_utils - INFO - Query: {'technical_skills': {'$exists': True, '$ne': []}}
2025-04-20 10:42:04,895 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs


Batch 1: 100%|██████████| 100/100 [00:04<00:00, 24.75it/s]

2025-04-20 10:42:08,937 - preprocessing_utils.mongo_utils - INFO - Batch 1: Moved 100/100 jobs. Total moved: 100
2025-04-20 10:42:08,941 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 2: 100%|██████████| 100/100 [00:03<00:00, 25.76it/s]

2025-04-20 10:42:12,824 - preprocessing_utils.mongo_utils - INFO - Batch 2: Moved 100/100 jobs. Total moved: 200
2025-04-20 10:42:12,826 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 3: 100%|██████████| 100/100 [00:04<00:00, 24.57it/s]

2025-04-20 10:42:16,898 - preprocessing_utils.mongo_utils - INFO - Batch 3: Moved 100/100 jobs. Total moved: 300
2025-04-20 10:42:16,900 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 4: 100%|██████████| 100/100 [00:03<00:00, 26.33it/s]

2025-04-20 10:42:20,701 - preprocessing_utils.mongo_utils - INFO - Batch 4: Moved 100/100 jobs. Total moved: 400
2025-04-20 10:42:20,703 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 5: 100%|██████████| 100/100 [00:04<00:00, 23.03it/s]

2025-04-20 10:42:25,049 - preprocessing_utils.mongo_utils - INFO - Batch 5: Moved 100/100 jobs. Total moved: 500
2025-04-20 10:42:25,051 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 6: 100%|██████████| 100/100 [00:03<00:00, 25.87it/s]

2025-04-20 10:42:28,919 - preprocessing_utils.mongo_utils - INFO - Batch 6: Moved 100/100 jobs. Total moved: 600
2025-04-20 10:42:28,921 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 7: 100%|██████████| 100/100 [00:03<00:00, 26.17it/s]

2025-04-20 10:42:32,744 - preprocessing_utils.mongo_utils - INFO - Batch 7: Moved 100/100 jobs. Total moved: 700
2025-04-20 10:42:32,746 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 8: 100%|██████████| 100/100 [00:03<00:00, 25.81it/s]

2025-04-20 10:42:36,623 - preprocessing_utils.mongo_utils - INFO - Batch 8: Moved 100/100 jobs. Total moved: 800
2025-04-20 10:42:36,625 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 9: 100%|██████████| 100/100 [00:04<00:00, 24.07it/s]

2025-04-20 10:42:40,783 - preprocessing_utils.mongo_utils - INFO - Batch 9: Moved 100/100 jobs. Total moved: 900
2025-04-20 10:42:40,785 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 10: 100%|██████████| 100/100 [00:03<00:00, 26.19it/s]

2025-04-20 10:42:44,605 - preprocessing_utils.mongo_utils - INFO - Batch 10: Moved 100/100 jobs. Total moved: 1000
2025-04-20 10:42:44,607 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 11: 100%|██████████| 100/100 [00:03<00:00, 25.54it/s]

2025-04-20 10:42:48,524 - preprocessing_utils.mongo_utils - INFO - Batch 11: Moved 100/100 jobs. Total moved: 1100
2025-04-20 10:42:48,526 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 12: 100%|██████████| 100/100 [00:03<00:00, 25.97it/s]

2025-04-20 10:42:52,379 - preprocessing_utils.mongo_utils - INFO - Batch 12: Moved 100/100 jobs. Total moved: 1200
2025-04-20 10:42:52,382 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 13: 100%|██████████| 100/100 [00:03<00:00, 26.21it/s]

2025-04-20 10:42:56,200 - preprocessing_utils.mongo_utils - INFO - Batch 13: Moved 100/100 jobs. Total moved: 1300
2025-04-20 10:42:56,203 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 14: 100%|██████████| 100/100 [00:04<00:00, 24.65it/s]

2025-04-20 10:43:00,262 - preprocessing_utils.mongo_utils - INFO - Batch 14: Moved 100/100 jobs. Total moved: 1400
2025-04-20 10:43:00,264 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 15: 100%|██████████| 100/100 [00:03<00:00, 25.89it/s]

2025-04-20 10:43:04,129 - preprocessing_utils.mongo_utils - INFO - Batch 15: Moved 100/100 jobs. Total moved: 1500
2025-04-20 10:43:04,132 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 16: 100%|██████████| 100/100 [00:03<00:00, 26.23it/s]

2025-04-20 10:43:07,946 - preprocessing_utils.mongo_utils - INFO - Batch 16: Moved 100/100 jobs. Total moved: 1600
2025-04-20 10:43:07,949 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 17: 100%|██████████| 100/100 [00:04<00:00, 24.59it/s]

2025-04-20 10:43:12,017 - preprocessing_utils.mongo_utils - INFO - Batch 17: Moved 100/100 jobs. Total moved: 1700
2025-04-20 10:43:12,019 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 18: 100%|██████████| 100/100 [00:03<00:00, 26.55it/s]

2025-04-20 10:43:15,787 - preprocessing_utils.mongo_utils - INFO - Batch 18: Moved 100/100 jobs. Total moved: 1800
2025-04-20 10:43:15,789 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 19: 100%|██████████| 100/100 [00:03<00:00, 26.19it/s]

2025-04-20 10:43:19,608 - preprocessing_utils.mongo_utils - INFO - Batch 19: Moved 100/100 jobs. Total moved: 1900
2025-04-20 10:43:19,611 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 20: 100%|██████████| 100/100 [00:03<00:00, 25.84it/s]

2025-04-20 10:43:23,484 - preprocessing_utils.mongo_utils - INFO - Batch 20: Moved 100/100 jobs. Total moved: 2000
2025-04-20 10:43:23,486 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 21: 100%|██████████| 100/100 [00:03<00:00, 26.49it/s]

2025-04-20 10:43:27,264 - preprocessing_utils.mongo_utils - INFO - Batch 21: Moved 100/100 jobs. Total moved: 2100
2025-04-20 10:43:27,266 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 22: 100%|██████████| 100/100 [00:03<00:00, 25.63it/s]

2025-04-20 10:43:31,169 - preprocessing_utils.mongo_utils - INFO - Batch 22: Moved 100/100 jobs. Total moved: 2200
2025-04-20 10:43:31,172 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 23: 100%|██████████| 100/100 [00:03<00:00, 25.99it/s]

2025-04-20 10:43:35,023 - preprocessing_utils.mongo_utils - INFO - Batch 23: Moved 100/100 jobs. Total moved: 2300
2025-04-20 10:43:35,025 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 24: 100%|██████████| 100/100 [00:03<00:00, 26.38it/s]

2025-04-20 10:43:38,819 - preprocessing_utils.mongo_utils - INFO - Batch 24: Moved 100/100 jobs. Total moved: 2400
2025-04-20 10:43:38,821 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 25: 100%|██████████| 100/100 [00:04<00:00, 23.99it/s]

2025-04-20 10:43:42,992 - preprocessing_utils.mongo_utils - INFO - Batch 25: Moved 100/100 jobs. Total moved: 2500
2025-04-20 10:43:42,996 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 26: 100%|██████████| 100/100 [00:03<00:00, 26.50it/s]

2025-04-20 10:43:46,770 - preprocessing_utils.mongo_utils - INFO - Batch 26: Moved 100/100 jobs. Total moved: 2600
2025-04-20 10:43:46,773 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 27: 100%|██████████| 100/100 [00:03<00:00, 25.48it/s]

2025-04-20 10:43:50,700 - preprocessing_utils.mongo_utils - INFO - Batch 27: Moved 100/100 jobs. Total moved: 2700
2025-04-20 10:43:50,703 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 28: 100%|██████████| 100/100 [00:03<00:00, 25.79it/s]

2025-04-20 10:43:54,583 - preprocessing_utils.mongo_utils - INFO - Batch 28: Moved 100/100 jobs. Total moved: 2800
2025-04-20 10:43:54,585 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 29: 100%|██████████| 100/100 [00:03<00:00, 26.30it/s]

2025-04-20 10:43:58,389 - preprocessing_utils.mongo_utils - INFO - Batch 29: Moved 100/100 jobs. Total moved: 2900
2025-04-20 10:43:58,393 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 30: 100%|██████████| 100/100 [00:03<00:00, 25.15it/s]

2025-04-20 10:44:02,370 - preprocessing_utils.mongo_utils - INFO - Batch 30: Moved 100/100 jobs. Total moved: 3000
2025-04-20 10:44:02,374 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 31: 100%|██████████| 100/100 [00:04<00:00, 24.49it/s]

2025-04-20 10:44:06,460 - preprocessing_utils.mongo_utils - INFO - Batch 31: Moved 100/100 jobs. Total moved: 3100
2025-04-20 10:44:06,464 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 32: 100%|██████████| 100/100 [00:03<00:00, 25.21it/s]

2025-04-20 10:44:10,433 - preprocessing_utils.mongo_utils - INFO - Batch 32: Moved 100/100 jobs. Total moved: 3200
2025-04-20 10:44:10,436 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 33: 100%|██████████| 100/100 [00:03<00:00, 25.56it/s]

2025-04-20 10:44:14,352 - preprocessing_utils.mongo_utils - INFO - Batch 33: Moved 100/100 jobs. Total moved: 3300
2025-04-20 10:44:14,355 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 34: 100%|██████████| 100/100 [00:04<00:00, 24.86it/s]

2025-04-20 10:44:18,381 - preprocessing_utils.mongo_utils - INFO - Batch 34: Moved 100/100 jobs. Total moved: 3400
2025-04-20 10:44:18,385 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 35: 100%|██████████| 100/100 [00:03<00:00, 25.41it/s]

2025-04-20 10:44:22,322 - preprocessing_utils.mongo_utils - INFO - Batch 35: Moved 100/100 jobs. Total moved: 3500
2025-04-20 10:44:22,327 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 36: 100%|██████████| 100/100 [00:03<00:00, 25.93it/s]

2025-04-20 10:44:26,185 - preprocessing_utils.mongo_utils - INFO - Batch 36: Moved 100/100 jobs. Total moved: 3600
2025-04-20 10:44:26,188 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 37: 100%|██████████| 100/100 [00:04<00:00, 23.18it/s]

2025-04-20 10:44:30,505 - preprocessing_utils.mongo_utils - INFO - Batch 37: Moved 100/100 jobs. Total moved: 3700
2025-04-20 10:44:30,508 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 38: 100%|██████████| 100/100 [00:04<00:00, 23.14it/s]

2025-04-20 10:44:34,832 - preprocessing_utils.mongo_utils - INFO - Batch 38: Moved 100/100 jobs. Total moved: 3800
2025-04-20 10:44:34,836 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 39: 100%|██████████| 100/100 [00:03<00:00, 25.76it/s]

2025-04-20 10:44:38,720 - preprocessing_utils.mongo_utils - INFO - Batch 39: Moved 100/100 jobs. Total moved: 3900
2025-04-20 10:44:38,723 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 40: 100%|██████████| 100/100 [00:03<00:00, 25.51it/s]

2025-04-20 10:44:42,644 - preprocessing_utils.mongo_utils - INFO - Batch 40: Moved 100/100 jobs. Total moved: 4000
2025-04-20 10:44:42,647 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 41: 100%|██████████| 100/100 [00:04<00:00, 21.54it/s]

2025-04-20 10:44:47,292 - preprocessing_utils.mongo_utils - INFO - Batch 41: Moved 100/100 jobs. Total moved: 4100
2025-04-20 10:44:47,296 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 42: 100%|██████████| 100/100 [00:03<00:00, 25.88it/s]

2025-04-20 10:44:51,162 - preprocessing_utils.mongo_utils - INFO - Batch 42: Moved 100/100 jobs. Total moved: 4200
2025-04-20 10:44:51,165 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 43: 100%|██████████| 100/100 [00:03<00:00, 26.35it/s]

2025-04-20 10:44:54,963 - preprocessing_utils.mongo_utils - INFO - Batch 43: Moved 100/100 jobs. Total moved: 4300
2025-04-20 10:44:54,966 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 44: 100%|██████████| 100/100 [00:04<00:00, 24.39it/s]

2025-04-20 10:44:59,068 - preprocessing_utils.mongo_utils - INFO - Batch 44: Moved 100/100 jobs. Total moved: 4400
2025-04-20 10:44:59,073 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 45: 100%|██████████| 100/100 [00:04<00:00, 23.55it/s]

2025-04-20 10:45:03,320 - preprocessing_utils.mongo_utils - INFO - Batch 45: Moved 100/100 jobs. Total moved: 4500
2025-04-20 10:45:03,324 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 46: 100%|██████████| 100/100 [00:04<00:00, 24.60it/s]

2025-04-20 10:45:07,392 - preprocessing_utils.mongo_utils - INFO - Batch 46: Moved 100/100 jobs. Total moved: 4600
2025-04-20 10:45:07,395 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 47: 100%|██████████| 100/100 [00:04<00:00, 24.20it/s]

2025-04-20 10:45:11,531 - preprocessing_utils.mongo_utils - INFO - Batch 47: Moved 100/100 jobs. Total moved: 4700
2025-04-20 10:45:11,535 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 48: 100%|██████████| 100/100 [00:03<00:00, 25.80it/s]

2025-04-20 10:45:15,413 - preprocessing_utils.mongo_utils - INFO - Batch 48: Moved 100/100 jobs. Total moved: 4800
2025-04-20 10:45:15,418 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 49: 100%|██████████| 100/100 [00:03<00:00, 26.51it/s]

2025-04-20 10:45:19,192 - preprocessing_utils.mongo_utils - INFO - Batch 49: Moved 100/100 jobs. Total moved: 4900
2025-04-20 10:45:19,198 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 50: 100%|██████████| 100/100 [00:04<00:00, 24.62it/s]

2025-04-20 10:45:23,262 - preprocessing_utils.mongo_utils - INFO - Batch 50: Moved 100/100 jobs. Total moved: 5000
2025-04-20 10:45:23,266 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 51: 100%|██████████| 100/100 [00:04<00:00, 20.72it/s]

2025-04-20 10:45:28,094 - preprocessing_utils.mongo_utils - INFO - Batch 51: Moved 100/100 jobs. Total moved: 5100
2025-04-20 10:45:28,098 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 52: 100%|██████████| 100/100 [00:04<00:00, 24.51it/s]

2025-04-20 10:45:32,182 - preprocessing_utils.mongo_utils - INFO - Batch 52: Moved 100/100 jobs. Total moved: 5200
2025-04-20 10:45:32,186 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 53: 100%|██████████| 100/100 [00:03<00:00, 25.27it/s]

2025-04-20 10:45:36,146 - preprocessing_utils.mongo_utils - INFO - Batch 53: Moved 100/100 jobs. Total moved: 5300
2025-04-20 10:45:36,150 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 54: 100%|██████████| 100/100 [00:03<00:00, 25.64it/s]

2025-04-20 10:45:40,051 - preprocessing_utils.mongo_utils - INFO - Batch 54: Moved 100/100 jobs. Total moved: 5400
2025-04-20 10:45:40,056 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 55: 100%|██████████| 100/100 [00:03<00:00, 26.46it/s]

2025-04-20 10:45:43,836 - preprocessing_utils.mongo_utils - INFO - Batch 55: Moved 100/100 jobs. Total moved: 5500
2025-04-20 10:45:43,840 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 56: 100%|██████████| 100/100 [00:04<00:00, 24.61it/s]

2025-04-20 10:45:47,906 - preprocessing_utils.mongo_utils - INFO - Batch 56: Moved 100/100 jobs. Total moved: 5600
2025-04-20 10:45:47,910 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 57: 100%|██████████| 100/100 [00:03<00:00, 25.02it/s]

2025-04-20 10:45:51,908 - preprocessing_utils.mongo_utils - INFO - Batch 57: Moved 100/100 jobs. Total moved: 5700
2025-04-20 10:45:51,912 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 58: 100%|██████████| 100/100 [00:04<00:00, 24.37it/s]

2025-04-20 10:45:56,018 - preprocessing_utils.mongo_utils - INFO - Batch 58: Moved 100/100 jobs. Total moved: 5800
2025-04-20 10:45:56,023 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 59: 100%|██████████| 100/100 [00:04<00:00, 23.83it/s]

2025-04-20 10:46:00,223 - preprocessing_utils.mongo_utils - INFO - Batch 59: Moved 100/100 jobs. Total moved: 5900
2025-04-20 10:46:00,229 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 60: 100%|██████████| 100/100 [00:04<00:00, 23.28it/s]

2025-04-20 10:46:04,527 - preprocessing_utils.mongo_utils - INFO - Batch 60: Moved 100/100 jobs. Total moved: 6000
2025-04-20 10:46:04,534 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 61: 100%|██████████| 100/100 [00:04<00:00, 23.16it/s]

2025-04-20 10:46:08,855 - preprocessing_utils.mongo_utils - INFO - Batch 61: Moved 100/100 jobs. Total moved: 6100
2025-04-20 10:46:08,863 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 62: 100%|██████████| 100/100 [00:04<00:00, 21.64it/s]

2025-04-20 10:46:13,486 - preprocessing_utils.mongo_utils - INFO - Batch 62: Moved 100/100 jobs. Total moved: 6200
2025-04-20 10:46:13,493 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 63: 100%|██████████| 100/100 [00:04<00:00, 21.23it/s]

2025-04-20 10:46:18,205 - preprocessing_utils.mongo_utils - INFO - Batch 63: Moved 100/100 jobs. Total moved: 6300
2025-04-20 10:46:18,213 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 64: 100%|██████████| 100/100 [00:04<00:00, 24.00it/s]

2025-04-20 10:46:22,381 - preprocessing_utils.mongo_utils - INFO - Batch 64: Moved 100/100 jobs. Total moved: 6400
2025-04-20 10:46:22,387 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 65: 100%|██████████| 100/100 [00:04<00:00, 24.02it/s]

2025-04-20 10:46:26,552 - preprocessing_utils.mongo_utils - INFO - Batch 65: Moved 100/100 jobs. Total moved: 6500
2025-04-20 10:46:26,557 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 66: 100%|██████████| 100/100 [00:03<00:00, 26.48it/s]

2025-04-20 10:46:30,335 - preprocessing_utils.mongo_utils - INFO - Batch 66: Moved 100/100 jobs. Total moved: 6600
2025-04-20 10:46:30,339 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 67: 100%|██████████| 100/100 [00:03<00:00, 26.39it/s]

2025-04-20 10:46:34,131 - preprocessing_utils.mongo_utils - INFO - Batch 67: Moved 100/100 jobs. Total moved: 6700
2025-04-20 10:46:34,136 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 68: 100%|██████████| 100/100 [00:03<00:00, 26.14it/s]

2025-04-20 10:46:37,965 - preprocessing_utils.mongo_utils - INFO - Batch 68: Moved 100/100 jobs. Total moved: 6800
2025-04-20 10:46:37,973 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 69: 100%|██████████| 100/100 [00:04<00:00, 24.63it/s]

2025-04-20 10:46:42,036 - preprocessing_utils.mongo_utils - INFO - Batch 69: Moved 100/100 jobs. Total moved: 6900
2025-04-20 10:46:42,042 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 70: 100%|██████████| 100/100 [00:03<00:00, 26.10it/s]

2025-04-20 10:46:45,876 - preprocessing_utils.mongo_utils - INFO - Batch 70: Moved 100/100 jobs. Total moved: 7000
2025-04-20 10:46:45,881 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 71: 100%|██████████| 100/100 [00:04<00:00, 24.35it/s]

2025-04-20 10:46:49,990 - preprocessing_utils.mongo_utils - INFO - Batch 71: Moved 100/100 jobs. Total moved: 7100
2025-04-20 10:46:49,996 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 72: 100%|██████████| 100/100 [00:04<00:00, 24.57it/s]

2025-04-20 10:46:54,069 - preprocessing_utils.mongo_utils - INFO - Batch 72: Moved 100/100 jobs. Total moved: 7200
2025-04-20 10:46:54,074 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 73: 100%|██████████| 100/100 [00:03<00:00, 25.83it/s]

2025-04-20 10:46:57,948 - preprocessing_utils.mongo_utils - INFO - Batch 73: Moved 100/100 jobs. Total moved: 7300
2025-04-20 10:46:57,951 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 74: 100%|██████████| 100/100 [00:03<00:00, 25.09it/s]

2025-04-20 10:47:01,939 - preprocessing_utils.mongo_utils - INFO - Batch 74: Moved 100/100 jobs. Total moved: 7400
2025-04-20 10:47:01,943 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 75: 100%|██████████| 100/100 [00:03<00:00, 25.68it/s]

2025-04-20 10:47:05,840 - preprocessing_utils.mongo_utils - INFO - Batch 75: Moved 100/100 jobs. Total moved: 7500
2025-04-20 10:47:05,843 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 76: 100%|██████████| 100/100 [00:03<00:00, 25.49it/s]

2025-04-20 10:47:09,769 - preprocessing_utils.mongo_utils - INFO - Batch 76: Moved 100/100 jobs. Total moved: 7600
2025-04-20 10:47:09,773 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 77: 100%|██████████| 100/100 [00:04<00:00, 22.70it/s]

2025-04-20 10:47:14,181 - preprocessing_utils.mongo_utils - INFO - Batch 77: Moved 100/100 jobs. Total moved: 7700
2025-04-20 10:47:14,184 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 78: 100%|██████████| 100/100 [00:03<00:00, 25.11it/s]

2025-04-20 10:47:18,168 - preprocessing_utils.mongo_utils - INFO - Batch 78: Moved 100/100 jobs. Total moved: 7800
2025-04-20 10:47:18,172 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 79: 100%|██████████| 100/100 [00:03<00:00, 25.16it/s]

2025-04-20 10:47:22,150 - preprocessing_utils.mongo_utils - INFO - Batch 79: Moved 100/100 jobs. Total moved: 7900
2025-04-20 10:47:22,153 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 80: 100%|██████████| 100/100 [00:03<00:00, 25.99it/s]

2025-04-20 10:47:26,003 - preprocessing_utils.mongo_utils - INFO - Batch 80: Moved 100/100 jobs. Total moved: 8000
2025-04-20 10:47:26,007 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 81: 100%|██████████| 100/100 [00:03<00:00, 26.29it/s]

2025-04-20 10:47:29,812 - preprocessing_utils.mongo_utils - INFO - Batch 81: Moved 100/100 jobs. Total moved: 8100
2025-04-20 10:47:29,816 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 82: 100%|██████████| 100/100 [00:03<00:00, 25.80it/s]

2025-04-20 10:47:33,694 - preprocessing_utils.mongo_utils - INFO - Batch 82: Moved 100/100 jobs. Total moved: 8200
2025-04-20 10:47:33,697 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 83: 100%|██████████| 100/100 [00:03<00:00, 25.80it/s]

2025-04-20 10:47:37,577 - preprocessing_utils.mongo_utils - INFO - Batch 83: Moved 100/100 jobs. Total moved: 8300
2025-04-20 10:47:37,581 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 84: 100%|██████████| 100/100 [00:03<00:00, 25.69it/s]

2025-04-20 10:47:41,474 - preprocessing_utils.mongo_utils - INFO - Batch 84: Moved 100/100 jobs. Total moved: 8400
2025-04-20 10:47:41,479 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 85: 100%|██████████| 100/100 [00:03<00:00, 26.12it/s]

2025-04-20 10:47:45,309 - preprocessing_utils.mongo_utils - INFO - Batch 85: Moved 100/100 jobs. Total moved: 8500
2025-04-20 10:47:45,313 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 86: 100%|██████████| 100/100 [00:03<00:00, 25.10it/s]

2025-04-20 10:47:49,298 - preprocessing_utils.mongo_utils - INFO - Batch 86: Moved 100/100 jobs. Total moved: 8600
2025-04-20 10:47:49,302 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 87: 100%|██████████| 100/100 [00:03<00:00, 26.00it/s]

2025-04-20 10:47:53,151 - preprocessing_utils.mongo_utils - INFO - Batch 87: Moved 100/100 jobs. Total moved: 8700
2025-04-20 10:47:53,155 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 88: 100%|██████████| 100/100 [00:03<00:00, 25.35it/s]

2025-04-20 10:47:57,101 - preprocessing_utils.mongo_utils - INFO - Batch 88: Moved 100/100 jobs. Total moved: 8800
2025-04-20 10:47:57,105 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 89: 100%|██████████| 100/100 [00:04<00:00, 23.89it/s]

2025-04-20 10:48:01,291 - preprocessing_utils.mongo_utils - INFO - Batch 89: Moved 100/100 jobs. Total moved: 8900
2025-04-20 10:48:01,296 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 90: 100%|██████████| 100/100 [00:03<00:00, 25.23it/s]

2025-04-20 10:48:05,262 - preprocessing_utils.mongo_utils - INFO - Batch 90: Moved 100/100 jobs. Total moved: 9000
2025-04-20 10:48:05,266 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 91: 100%|██████████| 100/100 [00:04<00:00, 21.00it/s]

2025-04-20 10:48:10,030 - preprocessing_utils.mongo_utils - INFO - Batch 91: Moved 100/100 jobs. Total moved: 9100
2025-04-20 10:48:10,034 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 92: 100%|██████████| 100/100 [00:03<00:00, 25.48it/s]

2025-04-20 10:48:13,962 - preprocessing_utils.mongo_utils - INFO - Batch 92: Moved 100/100 jobs. Total moved: 9200
2025-04-20 10:48:13,964 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 93: 100%|██████████| 100/100 [00:04<00:00, 22.79it/s]

2025-04-20 10:48:18,354 - preprocessing_utils.mongo_utils - INFO - Batch 93: Moved 100/100 jobs. Total moved: 9300
2025-04-20 10:48:18,357 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 94: 100%|██████████| 100/100 [00:03<00:00, 25.46it/s]

2025-04-20 10:48:22,287 - preprocessing_utils.mongo_utils - INFO - Batch 94: Moved 100/100 jobs. Total moved: 9400
2025-04-20 10:48:22,290 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 95: 100%|██████████| 100/100 [00:03<00:00, 25.69it/s]

2025-04-20 10:48:26,185 - preprocessing_utils.mongo_utils - INFO - Batch 95: Moved 100/100 jobs. Total moved: 9500
2025-04-20 10:48:26,189 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 96: 100%|██████████| 100/100 [00:03<00:00, 26.21it/s]

2025-04-20 10:48:30,006 - preprocessing_utils.mongo_utils - INFO - Batch 96: Moved 100/100 jobs. Total moved: 9600
2025-04-20 10:48:30,010 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 97: 100%|██████████| 100/100 [00:03<00:00, 26.41it/s]

2025-04-20 10:48:33,800 - preprocessing_utils.mongo_utils - INFO - Batch 97: Moved 100/100 jobs. Total moved: 9700
2025-04-20 10:48:33,803 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 98: 100%|██████████| 100/100 [00:03<00:00, 26.05it/s]

2025-04-20 10:48:37,645 - preprocessing_utils.mongo_utils - INFO - Batch 98: Moved 100/100 jobs. Total moved: 9800
2025-04-20 10:48:37,648 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 99: 100%|██████████| 100/100 [00:03<00:00, 26.17it/s]

2025-04-20 10:48:41,472 - preprocessing_utils.mongo_utils - INFO - Batch 99: Moved 100/100 jobs. Total moved: 9900
2025-04-20 10:48:41,485 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 100: 100%|██████████| 100/100 [00:03<00:00, 25.08it/s]

2025-04-20 10:48:45,475 - preprocessing_utils.mongo_utils - INFO - Batch 100: Moved 100/100 jobs. Total moved: 10000
2025-04-20 10:48:45,479 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 101: 100%|██████████| 100/100 [00:04<00:00, 22.53it/s]

2025-04-20 10:48:49,920 - preprocessing_utils.mongo_utils - INFO - Batch 101: Moved 100/100 jobs. Total moved: 10100
2025-04-20 10:48:49,923 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 102: 100%|██████████| 100/100 [00:03<00:00, 25.94it/s]

2025-04-20 10:48:53,781 - preprocessing_utils.mongo_utils - INFO - Batch 102: Moved 100/100 jobs. Total moved: 10200
2025-04-20 10:48:53,784 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 103: 100%|██████████| 100/100 [00:04<00:00, 24.26it/s]

2025-04-20 10:48:57,907 - preprocessing_utils.mongo_utils - INFO - Batch 103: Moved 100/100 jobs. Total moved: 10300
2025-04-20 10:48:57,911 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 104: 100%|██████████| 100/100 [00:03<00:00, 25.67it/s]

2025-04-20 10:49:01,809 - preprocessing_utils.mongo_utils - INFO - Batch 104: Moved 100/100 jobs. Total moved: 10400
2025-04-20 10:49:01,812 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 105: 100%|██████████| 100/100 [00:04<00:00, 23.48it/s]

2025-04-20 10:49:06,075 - preprocessing_utils.mongo_utils - INFO - Batch 105: Moved 100/100 jobs. Total moved: 10500
2025-04-20 10:49:06,079 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 106: 100%|██████████| 100/100 [00:04<00:00, 24.66it/s]

2025-04-20 10:49:10,135 - preprocessing_utils.mongo_utils - INFO - Batch 106: Moved 100/100 jobs. Total moved: 10600
2025-04-20 10:49:10,138 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 107: 100%|██████████| 100/100 [00:03<00:00, 26.38it/s]

2025-04-20 10:49:13,931 - preprocessing_utils.mongo_utils - INFO - Batch 107: Moved 100/100 jobs. Total moved: 10700
2025-04-20 10:49:13,934 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 108: 100%|██████████| 100/100 [00:03<00:00, 25.87it/s]

2025-04-20 10:49:17,801 - preprocessing_utils.mongo_utils - INFO - Batch 108: Moved 100/100 jobs. Total moved: 10800
2025-04-20 10:49:17,804 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 109: 100%|██████████| 100/100 [00:03<00:00, 26.00it/s]

2025-04-20 10:49:21,653 - preprocessing_utils.mongo_utils - INFO - Batch 109: Moved 100/100 jobs. Total moved: 10900
2025-04-20 10:49:21,656 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 110: 100%|██████████| 100/100 [00:04<00:00, 24.60it/s]

2025-04-20 10:49:25,722 - preprocessing_utils.mongo_utils - INFO - Batch 110: Moved 100/100 jobs. Total moved: 11000
2025-04-20 10:49:25,725 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 111: 100%|██████████| 100/100 [00:04<00:00, 22.82it/s]

2025-04-20 10:49:30,110 - preprocessing_utils.mongo_utils - INFO - Batch 111: Moved 100/100 jobs. Total moved: 11100
2025-04-20 10:49:30,113 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 112: 100%|██████████| 100/100 [00:03<00:00, 26.22it/s]

2025-04-20 10:49:33,930 - preprocessing_utils.mongo_utils - INFO - Batch 112: Moved 100/100 jobs. Total moved: 11200
2025-04-20 10:49:33,933 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 113: 100%|██████████| 100/100 [00:04<00:00, 24.15it/s]

2025-04-20 10:49:38,076 - preprocessing_utils.mongo_utils - INFO - Batch 113: Moved 100/100 jobs. Total moved: 11300
2025-04-20 10:49:38,080 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 114: 100%|██████████| 100/100 [00:04<00:00, 23.70it/s]

2025-04-20 10:49:42,300 - preprocessing_utils.mongo_utils - INFO - Batch 114: Moved 100/100 jobs. Total moved: 11400
2025-04-20 10:49:42,303 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 115: 100%|██████████| 100/100 [00:04<00:00, 24.61it/s]

2025-04-20 10:49:46,368 - preprocessing_utils.mongo_utils - INFO - Batch 115: Moved 100/100 jobs. Total moved: 11500
2025-04-20 10:49:46,371 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 116: 100%|██████████| 100/100 [00:03<00:00, 25.79it/s]

2025-04-20 10:49:50,250 - preprocessing_utils.mongo_utils - INFO - Batch 116: Moved 100/100 jobs. Total moved: 11600
2025-04-20 10:49:50,253 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs



Batch 117: 100%|██████████| 100/100 [00:04<00:00, 24.59it/s]

2025-04-20 10:49:54,323 - preprocessing_utils.mongo_utils - INFO - Batch 117: Moved 100/100 jobs. Total moved: 11700





2025-04-20 10:49:55,672 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 67 jobs


Batch 118: 100%|██████████| 67/67 [00:02<00:00, 25.86it/s]

2025-04-20 10:49:58,265 - preprocessing_utils.mongo_utils - INFO - Batch 118: Moved 67/67 jobs. Total moved: 11767
2025-04-20 10:49:58,266 - preprocessing_utils.mongo_utils - INFO - No more jobs to fetch
2025-04-20 10:49:58,267 - preprocessing_utils.mongo_utils - INFO - Processing complete. Total jobs moved: 11767
2025-04-20 10:49:58,268 - preprocessing_utils.mongo_utils - INFO - MongoDB connection closed
2025-04-20 10:49:58,286 - preprocessing_utils.mongo_utils - INFO - MongoDB connections closed
Moved 11767 jobs





## Hard Skills Embeddings

### Load Embedding Model

In [6]:
from preprocessing_utils import Sentence2VecEncoder

In [7]:
hard_skills_encoder = Sentence2VecEncoder("../misc_data/all-MiniLM-L6-v2-finetuned/")
hard_skills_encoder.load_model()

2025-04-20 11:09:43,386 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: cuda
2025-04-20 11:09:43,387 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: ../misc_data/all-MiniLM-L6-v2-finetuned/



2025-04-20 11:09:43,824 - preprocessing_utils.sentence2vec_utils - INFO - Successfully loaded model from ../misc_data/all-MiniLM-L6-v2-finetuned/


In [8]:
import os
from pymongo import MongoClient
import certifi

# Set MongoDB username and password
mongo_user = "cbradna"
mongo_password = "mongodb_cbradna0920"

# Construct the MongoDB connection string
mongo_uri = f"mongodb+srv://{mongo_user}:{mongo_password}@cluster0.zqzq6hs.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"

# Connect to MongoDB
client = MongoClient(mongo_uri,
                     tls=True,
                     tlsCAFile=certifi.where())

# Test the connection by listing database names
try:
    print("Available collections:", client.list_database_names())
except Exception as e:
    print(f"An error occurred: {e}")
    
client.close()


Available collections: ['rl_jobsdb', 'sample_mflix', 'admin', 'local']


## Generate Hard Skill Embeddings

In [None]:
from preprocessing_utils import JobIterator
from tqdm import tqdm
from pymongo import MongoClient
import certifi

# Set MongoDB username and password
mongo_user = "cbradna"
mongo_password = "mongodb_cbradna0920"

# Construct the MongoDB connection string
mongo_uri = f"mongodb+srv://{mongo_user}:{mongo_password}@cluster0.zqzq6hs.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"

# Database and collection names
db_name = "rl_jobsdb"
source_collection_name = "jobs_text"
target_collection_name = "job_embeddings"
batch_size = 100

# Field name for separate embeddings

# Initialize the JobIterator with the MongoDB URI
iterator = JobIterator(
    mongo_uri=mongo_uri,
    db_name=db_name,
    collection_name=source_collection_name,
    batch_size=batch_size
)

# Connect to MongoDB
client = MongoClient(mongo_uri,
                     tls=True,
                     tlsCAFile=certifi.where())

# Get the database and collections
db = client[db_name]
target_collection = db[target_collection_name]

# Iterate through the jobs and create embeddings
try:
    for batch in tqdm(iterator, desc="Batches"):
        for job in batch:
            job_id = job["_id"]
            technical_skills = job.get("technical_skills", [])
            tech_skill_embeddings = hard_skills_encoder.encode(technical_skills)
            if tech_skill_embeddings.size > 0:
                tech_skill_embeddings = np.average(tech_skill_embeddings, axis=0).tolist()


            # Create the new object for the job_embeddings collection
            embedding_object = {
                "original_job_id": job_id,
                "tech_skills_vectors": tech_skill_embeddings,
            }

            # Insert the new object into the job_embeddings collection
            try:
                target_collection.insert_one(embedding_object)
            except Exception as insert_error:
                print(f"Error inserting job {job_id}: {insert_error}")

    print("Job embeddings created and saved successfully.")

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    iterator.close()
    client.close()

2025-04-20 18:31:52,338 - preprocessing_utils.mongo_utils - INFO - Initialized JobIterator with batch size 100


Batches: 0it [00:00, ?it/s]

2025-04-20 18:31:52,942 - preprocessing_utils.mongo_utils - INFO - Yielding batch of 100 jobs


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.65it/s]


Error inserting job 67f2f276e93950df1517040f: cannot encode object: array([-1.45378262e-02, -2.25286814e-04, -1.52644338e-02,  3.08099156e-03,
       -2.89011057e-02, -1.76412761e-02,  4.84762862e-02,  1.64538510e-02,
       -4.33668168e-03, -1.82713326e-02, -4.83612530e-06, -4.73975297e-03,
        2.62079332e-02,  2.43065041e-02, -4.81811678e-03, -2.41735782e-02,
       -1.38036795e-02, -5.07508172e-03,  3.48291826e-03,  1.32191507e-02,
       -5.35851680e-02,  7.58652575e-04,  2.55818292e-02,  3.55250239e-02,
        8.72650556e-03, -2.08574086e-02,  4.23419774e-02,  4.12311815e-02,
       -2.15423796e-02, -3.62216122e-02,  3.56624685e-02, -5.91092091e-03,
        2.43025683e-02, -1.35281822e-02, -1.04509871e-02, -3.67500074e-02,
       -3.07399873e-02, -1.71884764e-02, -9.52748023e-03, -1.15138246e-02,
       -1.22025842e-03,  1.94874988e-03,  1.04091745e-02,  4.18880917e-02,
        5.61720226e-03, -2.72765011e-03, -5.18051907e-03,  3.32398862e-02,
        2.41336953e-02,  2.84543

Batches: 100%|██████████| 1/1 [00:00<00:00, 105.16it/s]


Error inserting job 67f2f276e93950df15170410: cannot encode object: array([-5.04721738e-02,  5.77012915e-03, -9.33216419e-03,  1.52056031e-02,
       -6.79192459e-03, -8.57997499e-03,  3.14377062e-02,  4.80135297e-03,
        1.69428382e-02,  1.99777093e-02, -2.23116414e-03,  2.78523806e-02,
        1.94383170e-02,  1.49728078e-02,  2.05292692e-03,  1.29679386e-02,
        9.19628143e-03,  1.61307061e-03, -2.14817766e-02,  8.68148170e-03,
        4.96439496e-03, -7.27933657e-04,  6.33769482e-03,  1.20935198e-02,
        3.36127207e-02,  4.31414694e-03,  1.27061214e-02,  3.58945988e-02,
        3.26725328e-03, -1.42891686e-02,  4.78003128e-03, -2.62796525e-02,
        2.47353911e-02, -1.19215632e-02,  1.76875461e-02, -9.10474453e-03,
        2.47010519e-03, -5.31232078e-03, -1.04057218e-03,  1.24719215e-03,
       -9.54688992e-03, -1.33124152e-02,  1.40161449e-02,  2.35888567e-02,
       -7.26544624e-03,  2.30093393e-03, -1.69450678e-02,  2.08788179e-02,
       -9.52804476e-05,  6.64195

Batches: 100%|██████████| 1/1 [00:00<00:00, 71.01it/s]


Error inserting job 67f2f276e93950df15170411: cannot encode object: array([-2.10070256e-02, -2.48441622e-02, -3.62421311e-02,  1.65763292e-02,
        1.93746295e-02,  2.16551051e-02,  4.10142727e-02,  1.75478682e-02,
       -2.43131332e-02, -3.68325189e-02,  4.28373087e-03, -8.17759719e-04,
        1.62814688e-02,  1.60339382e-02, -6.45126868e-03,  1.04161631e-02,
       -1.07312705e-02,  3.47804883e-03,  2.04046490e-03,  1.12312837e-02,
       -3.79014201e-02,  1.89819522e-02,  1.28635941e-02,  1.21639147e-02,
        2.79732589e-02, -1.86204258e-02,  1.97962634e-02,  1.24636460e-02,
       -2.76769809e-02, -1.40237752e-02, -7.27686891e-03,  5.68897948e-02,
       -8.32463242e-03,  5.01191244e-03, -6.18241308e-03, -3.04781850e-02,
       -2.73007900e-02, -1.61562394e-02,  1.58956759e-02,  6.33266848e-03,
       -3.17101702e-02, -1.62380598e-02,  2.79812701e-03,  3.79283614e-02,
        1.04489345e-02, -6.09857123e-03, -7.58621097e-03,  3.79172266e-02,
        1.88758858e-02,  2.52439

Batches: 100%|██████████| 1/1 [00:00<00:00, 73.83it/s]


Error inserting job 67f2f276e93950df15170412: cannot encode object: array([-2.96578389e-02, -1.84534602e-02, -2.70202328e-02,  1.94759816e-02,
        2.01789495e-02, -1.84243992e-02,  3.51138264e-02, -1.57867707e-02,
       -2.31462903e-03, -1.22820660e-02, -9.36369132e-03,  1.26972208e-02,
        2.12125778e-02, -1.15156341e-02, -6.04507886e-03,  7.94793665e-03,
       -2.03624126e-02,  1.12077603e-02,  7.61674671e-03,  1.20204091e-02,
       -1.47877196e-02,  1.73714943e-02,  2.20636353e-02, -7.10564945e-03,
        3.12056690e-02, -2.33449358e-02,  6.97788149e-02,  3.40404734e-02,
       -4.06365544e-02, -3.08808926e-02, -3.62132350e-03, -9.75372642e-03,
       -6.02111518e-02,  2.78051533e-02, -1.07130166e-02, -3.94014940e-02,
        1.54311638e-02, -7.69750401e-03,  2.19198596e-02, -1.96882766e-02,
        7.48972222e-03, -2.06595436e-02, -3.60734537e-02,  4.30883132e-02,
       -2.39182934e-02, -4.16841060e-02, -3.20700854e-02, -2.42856313e-02,
       -6.33448176e-03,  7.68357

Batches: 100%|██████████| 1/1 [00:00<00:00, 95.07it/s]


Error inserting job 67f2f276e93950df15170413: cannot encode object: array([-5.85277081e-02, -3.22271809e-02, -3.59985270e-02,  5.88639127e-03,
       -2.23764237e-02, -4.56967466e-02,  6.01106472e-02, -4.73883143e-03,
       -8.07847362e-03, -2.95504406e-02, -6.74571982e-03,  1.56907216e-02,
        4.83692944e-04,  1.96621697e-02, -3.79709527e-02,  3.11642513e-03,
       -2.03989688e-02,  1.69779640e-02, -1.24589279e-02,  8.16174690e-03,
       -4.64044623e-02,  8.50196928e-03,  1.48511184e-02,  4.07100245e-02,
        2.50785947e-02, -3.49910632e-02,  8.66898801e-03,  2.14414243e-02,
       -4.35605496e-02, -4.71862070e-02, -8.62800796e-03,  1.79656018e-02,
       -7.64085620e-04, -6.66765869e-03,  8.64539761e-03, -4.51178849e-02,
       -3.47394496e-02, -3.82404737e-02,  1.46741921e-03,  2.09297836e-02,
        2.27156579e-02, -2.02236120e-02,  8.65238532e-03,  1.54861808e-02,
       -6.05780780e-02, -3.11400671e-03,  8.85204133e-03,  1.48567334e-02,
        3.85531671e-02,  6.06997

Batches: 100%|██████████| 1/1 [00:00<00:00, 86.68it/s]


Error inserting job 67f2f276e93950df15170414: cannot encode object: array([-4.21636477e-02,  1.43520925e-02, -7.57055869e-03,  3.43581708e-03,
       -1.11062322e-02, -2.09882595e-02,  3.90240848e-02,  1.53665547e-03,
       -2.14441214e-04, -1.41730011e-02, -9.21517331e-03, -6.12849556e-03,
       -9.19380970e-03,  2.00253054e-02, -4.54684757e-02, -1.97114237e-03,
       -1.41712949e-02,  2.88021751e-04, -1.78414211e-02, -2.27792189e-04,
       -3.83122005e-02, -2.52851117e-02, -1.21569475e-02,  1.02093294e-02,
        3.07526533e-02, -1.04715321e-02, -9.52399056e-03,  1.33570889e-03,
       -2.99406648e-02, -1.65732168e-02, -2.22716611e-02, -1.79276019e-02,
        2.81882510e-02,  1.10069634e-02, -1.74411759e-02, -2.10729931e-02,
       -5.57305384e-03, -2.10252609e-02,  2.12736148e-03,  1.00476164e-02,
       -2.61445269e-02,  1.35055808e-02, -1.91504974e-03,  3.06105390e-02,
       -5.17404033e-03,  2.35264413e-02,  1.43854190e-02,  5.53130638e-03,
        1.51480292e-03,  2.30559

Batches: 100%|██████████| 1/1 [00:00<00:00, 117.50it/s]


Error inserting job 67f2f276e93950df15170415: cannot encode object: array([-1.85299981e-02, -5.59857674e-02, -5.56431413e-02,  2.25518644e-02,
       -3.35946083e-02, -3.90998907e-02,  6.90324157e-02,  5.96070662e-03,
       -1.04512125e-02, -3.76096666e-02, -2.47045625e-02,  1.34776374e-02,
       -9.01376363e-03,  2.86512729e-02, -3.25472429e-02, -4.92345504e-02,
       -3.34886052e-02,  6.06945483e-03,  2.27684155e-02,  4.27289158e-02,
       -2.58452576e-02,  3.63734015e-03,  3.26254554e-02,  3.50729711e-02,
        1.54871596e-02, -4.69545424e-02,  3.46068107e-02,  3.99361132e-03,
       -4.28666286e-02, -2.93746740e-02,  2.19945312e-02,  2.98871081e-02,
       -2.04827376e-02, -2.47559547e-02,  2.15496123e-02, -3.69765125e-02,
       -1.49781359e-02,  2.05385294e-02, -1.12720234e-02,  1.48002617e-02,
        1.56512167e-02, -2.51880828e-02,  2.32779980e-02,  3.57097387e-03,
       -2.86109317e-02, -2.36021858e-02,  2.86666881e-02,  9.78846941e-03,
        2.75729541e-02,  1.87485

Batches: 100%|██████████| 1/1 [00:00<00:00, 116.70it/s]


Error inserting job 67f2f276e93950df15170416: cannot encode object: array([-2.64334176e-02, -3.59342210e-02, -2.51278281e-02,  3.03556072e-03,
        5.30186389e-03, -4.61505726e-04,  2.69084238e-02, -8.70802812e-03,
       -3.10452301e-02, -2.51515862e-02, -3.58582214e-02,  1.18930377e-02,
       -9.50480253e-03,  1.96974184e-02, -7.46723497e-03, -9.00865905e-03,
       -1.39571298e-02, -4.64162137e-03,  1.18713640e-02,  6.12218166e-03,
       -3.17101106e-02,  1.17387073e-02, -2.40860763e-03,  5.56113720e-02,
        6.93524778e-02, -1.19040962e-02,  3.41170579e-02,  2.76566260e-02,
       -5.29502518e-02,  9.37057100e-03, -2.00054352e-03,  3.12815979e-02,
       -1.52849043e-02,  8.21354613e-03,  1.58976987e-02, -3.69653851e-02,
       -4.11841683e-02, -2.90454784e-03,  4.25395705e-02, -1.26017183e-02,
        7.59725273e-03, -3.20636779e-02,  4.56642695e-02,  4.36729565e-02,
        2.58894973e-02,  2.67024450e-02, -1.20361485e-02, -2.19710488e-02,
       -1.94115937e-02, -6.45304

Batches: 100%|██████████| 1/1 [00:00<00:00, 71.23it/s]


Error inserting job 67f2f276e93950df15170417: cannot encode object: array([-4.15202156e-02, -3.23949717e-02, -1.46847963e-02,  1.25958417e-02,
        2.11429242e-02,  1.37777533e-03,  5.04236519e-02, -9.27868485e-03,
       -1.32531933e-02, -3.74543443e-02,  8.96579307e-03,  1.43760676e-02,
       -1.47157330e-02,  3.04046497e-02, -2.48140655e-02, -7.25562638e-03,
       -2.53739692e-02,  4.28547151e-03, -3.57162543e-02,  3.91728990e-03,
       -3.30922231e-02,  1.86305344e-02,  7.80997565e-03,  4.07287814e-02,
        3.66369821e-02, -1.10710915e-02,  1.17440093e-02,  2.06973590e-03,
       -1.19343977e-02, -4.61331531e-02,  2.25926414e-02,  1.09270737e-02,
        1.11688236e-02, -1.03379069e-02, -1.49512757e-02, -2.74278522e-02,
       -3.20716016e-02, -1.44647155e-02, -1.73479621e-03,  8.80707987e-03,
       -1.48552330e-02, -2.71517951e-02,  1.24308402e-02,  1.80789828e-02,
       -1.90330409e-02,  7.22068315e-03, -6.47666631e-03,  6.36347756e-03,
        3.34126577e-02,  9.15513

Batches: 100%|██████████| 1/1 [00:00<00:00, 95.06it/s]


Error inserting job 67f2f276e93950df15170418: cannot encode object: array([-4.73796427e-02, -3.35409008e-02, -3.38179171e-02,  6.36400795e-03,
        2.36185938e-02, -4.16741893e-03,  8.49929675e-02,  2.36288253e-02,
       -7.62166968e-03, -3.65233831e-02,  2.95689721e-02,  1.48450462e-02,
        1.02737276e-02,  2.19047349e-02, -5.28714759e-03, -3.32371029e-03,
       -4.30925190e-03,  1.60041992e-02, -3.22114117e-02, -4.21379087e-03,
       -1.99261885e-02, -1.34843895e-02,  1.29095428e-02,  4.48402874e-02,
        6.70586526e-02, -4.72538359e-02,  1.82007067e-02,  4.11414243e-02,
       -2.57791858e-02, -2.51462478e-02,  2.41818633e-02,  5.50129823e-02,
        1.44051714e-02, -9.26842261e-03,  1.63504742e-02, -5.44074662e-02,
       -3.30764093e-02,  8.09428573e-04,  9.44237038e-03,  1.29785948e-02,
       -2.09718440e-02, -1.73456538e-02,  5.88181019e-02,  2.83092707e-02,
        4.53541667e-04, -2.63579772e-03, -1.12069910e-02,  3.29849385e-02,
        7.65433675e-03,  4.27211

Batches: 100%|██████████| 1/1 [00:00<00:00, 36.78it/s]


Error inserting job 67f2f276e93950df15170419: cannot encode object: array([-6.85840321e-04,  1.86613966e-02, -3.53076644e-02, -4.28468222e-03,
        1.13814604e-02,  1.56154046e-02,  1.38962204e-02,  1.03798890e-02,
       -2.24920604e-02,  1.97623260e-02, -1.66623257e-02, -1.53856883e-02,
        1.94550324e-02,  2.59339903e-02, -2.27280725e-02, -9.83378943e-03,
       -4.72072549e-02, -1.35901524e-02, -3.03573925e-02,  2.91959532e-02,
       -5.89740509e-03,  7.80292379e-04,  4.85875783e-03,  3.68691497e-02,
        1.48912882e-02,  4.31059040e-02,  1.56350583e-02,  1.57822799e-02,
       -1.33576766e-02,  1.78104229e-02, -4.39556577e-04, -8.05284362e-03,
        4.97899950e-02, -1.31548420e-02,  6.13183249e-03, -2.53182626e-03,
        4.54088626e-03,  5.83735155e-03, -8.43852758e-05,  2.79449299e-03,
       -1.66605972e-02, -1.12162065e-02,  1.65863093e-02,  3.63739841e-02,
        6.96251541e-03, -6.62547629e-03, -9.47461836e-03,  8.03963374e-03,
        9.94823035e-03, -3.19185

Batches: 100%|██████████| 1/1 [00:00<00:00, 73.87it/s]


Error inserting job 67f2f276e93950df1517041a: cannot encode object: array([-2.33536586e-02,  3.65662319e-03, -3.23123708e-02,  8.70958529e-03,
       -2.98657990e-03,  2.59516132e-03, -5.00515662e-03,  5.52386045e-03,
       -2.73741167e-02, -8.36365670e-03, -2.73377472e-03,  1.94477639e-03,
        1.09502366e-02,  1.04111824e-02, -2.20966320e-02,  6.34754030e-03,
       -9.16016847e-03,  9.20365378e-03, -1.37882344e-02, -1.38875041e-02,
       -3.72399762e-02,  9.33961011e-03,  1.06578907e-02,  2.71348306e-03,
        3.31141390e-02, -9.45855025e-03,  1.04700020e-02,  2.36337772e-03,
       -2.81424187e-02, -9.30492952e-03, -2.11223587e-02,  6.15892746e-03,
       -2.56107352e-03, -1.27300955e-02, -5.72661497e-03, -2.20741350e-02,
       -3.66713898e-03, -6.54493365e-03, -3.85410618e-03,  1.78822526e-03,
       -7.52240745e-03, -5.29974606e-03,  3.61302774e-03,  1.39771150e-02,
       -4.22349246e-03,  5.28948242e-03,  1.44063737e-02,  6.42720424e-03,
        8.67750775e-03,  7.84129

Batches: 100%|██████████| 1/1 [00:00<00:00, 110.71it/s]


Error inserting job 67f2f276e93950df1517041b: cannot encode object: array([ 6.85734861e-03, -1.52902938e-02,  2.45559569e-02,  3.34635563e-02,
       -5.86964339e-02,  4.69055511e-02,  1.68983750e-02, -8.64768960e-03,
        3.41768488e-02, -5.89754060e-03, -4.55007516e-02,  5.67836165e-02,
        2.08030287e-02,  1.93999428e-02, -4.07589450e-02,  6.05854020e-02,
        1.04234740e-03,  8.11103731e-04,  4.68706004e-02,  8.24020952e-02,
        1.07515603e-02, -4.10460755e-02, -4.70460951e-02,  5.44858761e-02,
        2.22626347e-02,  1.86594203e-03, -1.56066678e-02,  6.45460859e-02,
       -9.68765002e-03, -1.93488747e-02, -9.30488668e-03, -1.48811880e-02,
        5.13247624e-02,  1.96222998e-02,  3.47665921e-02, -6.21129870e-02,
        1.67116448e-02,  5.54328598e-02, -1.43935950e-02,  8.13502446e-03,
       -5.56541234e-02,  2.04484239e-02,  1.76009331e-02,  3.05517670e-02,
        1.93342529e-02, -1.77501775e-02, -9.52230953e-03,  1.01428293e-02,
        6.52131904e-03,  6.82014

Batches: 100%|██████████| 1/1 [00:00<00:00, 94.96it/s]


Error inserting job 67f2f276e93950df1517041c: cannot encode object: array([-2.78511327e-02, -4.01702709e-04, -5.84887248e-03,  1.30828395e-02,
        5.71703166e-03, -4.23317812e-02,  2.40779649e-02, -1.67492498e-02,
       -9.72226076e-03, -1.62009560e-02, -2.34083198e-02,  3.34327146e-02,
        1.85596617e-03,  4.75684367e-03, -3.59622687e-02,  3.73627469e-02,
       -1.90446489e-02,  1.73123498e-02, -2.61433842e-03,  1.99174471e-02,
       -8.91756173e-03,  1.68580422e-03,  4.06247471e-03,  2.67981309e-02,
        3.38849574e-02,  6.49460964e-03, -1.01805469e-02,  1.34994071e-02,
       -9.45524871e-03, -2.13799309e-02, -1.43852010e-02, -5.20288758e-03,
        1.98756326e-02,  1.98239507e-03,  2.12160964e-03, -1.23720700e-02,
       -2.89671868e-03, -5.84164960e-03,  5.18613309e-03,  2.53466852e-02,
        3.63745354e-03, -4.24935669e-02,  1.32131586e-02,  1.19663286e-03,
       -1.90722216e-02,  1.07302750e-02, -1.59480199e-02, -7.08372379e-03,
        2.31292360e-02,  1.33555

Batches: 100%|██████████| 1/1 [00:00<00:00, 82.58it/s]


Error inserting job 67f2f276e93950df1517041d: cannot encode object: array([ 9.8164771e-03,  8.8101462e-04,  2.5090199e-02,  4.3541320e-02,
       -3.3969849e-02,  4.3494530e-02, -2.8622808e-02,  1.0531108e-02,
        3.7885573e-02, -1.7057324e-02, -3.8258687e-02,  2.4658393e-02,
       -2.9893983e-02, -2.6043244e-02, -3.5673752e-02,  3.4743242e-02,
        6.0718793e-02,  1.2853925e-02,  2.0914068e-03,  7.9145595e-02,
        8.0234900e-02, -1.8528953e-02, -4.3802369e-02,  3.9577577e-04,
        5.8759812e-02,  7.4532176e-03, -4.2117648e-02,  4.2400457e-02,
       -1.4992505e-02,  5.4603792e-03, -3.5808716e-02, -1.9426392e-02,
        3.0451959e-02,  2.5843639e-02,  3.7092075e-02, -3.4199033e-02,
        1.3406174e-03,  8.5836994e-03,  5.3391987e-03,  4.9536400e-03,
       -4.1601066e-02,  3.1743016e-02,  2.5341842e-02, -1.7981360e-02,
        1.7780233e-02,  8.8431872e-05,  5.0897826e-02, -1.1289236e-02,
        9.3147578e-04,  1.0548939e-02,  1.6610874e-02, -7.7510528e-02,
       -6

Batches: 100%|██████████| 1/1 [00:00<00:00, 76.64it/s]
Batches: 0it [00:03, ?it/s]

2025-04-20 18:31:55,828 - preprocessing_utils.mongo_utils - INFO - MongoDB connection closed





KeyboardInterrupt: 

# Generating Candidate Profiles

## Candidate Bio

# Actually did this in Gemini web. I can't use 2.5 Pro API calls which are preferred for complex tax like these.