In [4]:
import requests
import json
import urllib.parse
from dotenv import load_dotenv
import os

# Read the .env file before looking up any settings
load_dotenv()

# API token for the Jina reader; None when the JINA_API variable is not set
JINA_API = os.environ.get("JINA_API")

def google_jobs(query, location, date_posted, radius, num_pages, timeout=30):
    """Fetch Google job-search result pages through the r.jina.ai reader endpoint.

    One Google search URL is built per page (``start`` advances by 10 per page),
    prefixed with ``https://r.jina.ai/`` and requested with ``requests.get``.
    The ``results`` list of every JSON response is appended to the return value.
    A page that fails (network error, non-200 status, body that is not JSON) is
    reported with ``print`` and skipped; the remaining pages are still fetched.

    Args:
        query: Search text; URL-quoted before being placed in ``q=``.
        location: Value for the ``uule=`` parameter; URL-quoted.
        date_posted: Value placed after ``chips=date_posted:`` (inserted as-is).
        radius: Value for the ``lrad=`` parameter (inserted as-is).
        num_pages: Number of pages to request.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the notebook indefinitely.

    Returns:
        list: Every item found under the ``results`` key of the fetched pages
        (empty when no page yielded any).
    """
    headers = {
        "Accept": "application/json",
        "X-Target-Selector": "div[jsname='Cpkphb'] div[jsname='bF1uUb']"  # Example CSS selector to target job listings
    }
    # Only attach credentials when a key was loaded; otherwise the header
    # would carry the literal string "Bearer None".
    if JINA_API:
        headers["Authorization"] = f"Bearer {JINA_API}"

    job_data = []
    base_url = "https://www.google.com/search?q={query}&ibp=htl;jobs&uule={location}&hl=en&gl=us&chips=date_posted:{date_posted}&lrad={radius}&start={start}"
    jina_url_prefix = "https://r.jina.ai/"

    for page in range(num_pages):
        start = page * 10
        url = base_url.format(
            query=urllib.parse.quote(query),
            location=urllib.parse.quote(location),
            date_posted=date_posted,
            radius=radius,
            start=start
        )
        jina_url = jina_url_prefix + url
        print(f"{jina_url}")

        try:
            response = requests.get(jina_url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Connection errors and timeouts skip this page instead of aborting the run.
            print(f"Failed to fetch page {page + 1}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch page {page + 1}: {response.status_code}")
            continue

        try:
            json_response = response.json()
        except ValueError as exc:
            # A 200 response is not guaranteed to carry a JSON body.
            print(f"Failed to parse page {page + 1} as JSON: {exc}")
            continue

        if not isinstance(json_response, dict):
            print(f"Unexpected JSON payload on page {page + 1}: {type(json_response).__name__}")
            continue

        # NOTE(review): the recorded run reported 0 results; confirm that
        # 'results' is the key the reader endpoint actually returns.
        job_data.extend(json_response.get('results') or [])

    return job_data

# Example usage: parameters for a single search run
query = "Data Scientist"
location = "w+CAIQICINVW5pdGVkIFN0YXRlcw"  # Encoded location for United States
date_posted = "3days"  # Jobs posted in the last 3 days
radius = 6000  # Search within a 6000 mile radius
num_pages = 1  # Total number of pages to scrape

job_results = google_jobs(
    query=query,
    location=location,
    date_posted=date_posted,
    radius=radius,
    num_pages=num_pages,
)

# Persist whatever was collected as pretty-printed JSON
with open('google_job_results.json', 'w') as f:
    f.write(json.dumps(job_results, indent=4))

print(f"Job results have been saved to google_job_results.json. Total results: {len(job_results)}")


https://r.jina.ai/https://www.google.com/search?q=Data%20Scientist&ibp=htl;jobs&uule=w%2BCAIQICINVW5pdGVkIFN0YXRlcw&hl=en&gl=us&chips=date_posted:3days&lrad=6000&start=0
Job results have been saved to google_job_results.json. Total results: 0


In [5]:
import json

# Load the JSON data from the file.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default (which is not UTF-8 on every system).
with open('search.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

def extract_job_details(job):
    """Return a new dict holding only the listing fields used downstream.

    Each field is read with ``job.get``, so a key that is absent from ``job``
    appears in the result with the value ``None``.
    """
    wanted_fields = (
        'title',
        'company_name',
        'location',
        'via',
        'description',
        'job_highlights',
        'related_links',
        'thumbnail',
        'extensions',
        'detected_extensions',
    )
    return {field: job.get(field) for field in wanted_fields}

# Reduce every entry under 'jobs_results' (if any) to the fields of interest
job_listings = list(map(extract_job_details, data.get('jobs_results', [])))

# Print each listing, numbered from 1, followed by a separator line
for i, job in enumerate(job_listings, 1):
    print(f"Job {i}:")
    print(f"Title: {job['title']}")
    print(f"Company: {job['company_name']}")
    print(f"Location: {job['location']}")
    print(f"Via: {job['via']}")
    print(f"Description: {job['description']}")
    print(f"Job Highlights: {job['job_highlights']}")
    print(f"Related Links: {job['related_links']}")
    print(f"Thumbnail: {job['thumbnail']}")
    print(f"Extensions: {job['extensions']}")
    print(f"Detected Extensions: {job['detected_extensions']}")
    print("=" * 40)

# Write the reduced listings out as pretty-printed JSON
with open('processed_jobs.json', 'w') as f:
    f.write(json.dumps(job_listings, indent=4))

print(f"Processed job data has been saved to processed_jobs.json. Total jobs: {len(job_listings)}")


Job 1:
Title: Lead Data Scientist | Columbia, SC, USA | Remote
Company: S&P Global
Location:  Anywhere 
Via: via EFinancialCareers
Description: Lead Data Scientist

About the Role...
Grade Level (for internal use):
11 About the Role:

The Team: The data science team is responsible for extracting insights and knowledge from complex datasets to create new products and solve strategic business problems. What sets this team apart is its emphasis on empirical evidence and continuous learning, valuing a balance between technical rigor, creativity in problem-solving, and a strong commitment to producing tangible value through data-driven solutions. Currently, the team is focusing on the implementation of a few GenAI-powered enterprise-scale projects.

The Responsibilities & Impact: A lead data scientist significantly contributes to the business by leading advanced GenAI-powered projects and mentoring junior team members. This work positively impacts clients by delivering more accurate predict

In [12]:
import json
from transformers import AutoTokenizer, TFAutoModelForTokenClassification, pipeline
import numpy as np

# Load the JSON data from the file.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default (which is not UTF-8 on every system).
with open('search.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Checkpoint used for token classification.
# NOTE(review): the recorded output only shows PER/ORG/LOC/MISC labels, so
# confirm this checkpoint is suitable for extracting technical skills.
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForTokenClassification.from_pretrained(model_name)

# Initialize the NER pipeline; the loop that consumes its output reads the
# 'word' and 'entity_group' keys of each returned entity.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

def extract_entities(description):
    """Run the module-level ``ner_pipeline`` on ``description`` and return its output unchanged."""
    entities = ner_pipeline(description)
    return entities

def serialize(obj):
    """``default=`` hook for ``json.dump`` that converts NumPy values to built-ins.

    Handles every NumPy floating, integer and boolean scalar type (not only
    the 32/64-bit ones) as well as arrays, which are converted to nested lists.

    Args:
        obj: The object ``json`` could not serialize on its own.

    Returns:
        A ``float``, ``int``, ``bool`` or ``list`` equivalent of ``obj``.

    Raises:
        TypeError: If ``obj`` is not a supported NumPy value.
    """
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

# Set to True to write the annotated data to job_entities.json.
# Defaults to False because the save step was disabled in this notebook.
SAVE_ENTITIES = False

# Extract entities from each job description.
# .get() is used for every lookup so a missing key does not raise KeyError.
for job in data.get('jobs_results', []):
    # A missing or null description is treated as empty text.
    description = job.get('description') or ''
    job['entities'] = extract_entities(description)
    print(f"\nExtracted entities for job: {job.get('title')}")
    for entity in job['entities']:
        print(f"Entity: {entity['word']} ({entity['entity_group']})")

# Save the updated job data with extracted entities (only when enabled), and
# report what actually happened instead of always claiming a file was written.
if SAVE_ENTITIES:
    with open('job_entities.json', 'w') as file:
        json.dump(data, file, indent=4, default=serialize)
    print("Entity extraction completed and saved to job_entities.json")
else:
    print("Entity extraction completed (results not saved; set SAVE_ENTITIES = True to write job_entities.json)")


All PyTorch model weights were used when initializing TFBertForTokenClassification.

All the weights of TFBertForTokenClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.



Extracted entities for job: Lead Data Scientist | Columbia, SC, USA | Remote
Entity: GenA (MISC)
Entity: GenAI (MISC)
Entity: AI (MISC)
Entity: LLM (MISC)
Entity: LLM (MISC)
Entity: Gene (MISC)
Entity: AI (MISC)
Entity: S & P Global (ORG)

Extracted entities for job: Sr Data Scientist
Entity: Citizens ’ Enterprise Data & Analytics (ORG)
Entity: ED & A (ORG)
Entity: Scientist (ORG)
Entity: AI (MISC)
Entity: AI (MISC)
Entity: AI (MISC)
Entity: Python (MISC)
Entity: SQL (MISC)
Entity: Amazon (ORG)
Entity: Sagemaker (MISC)

Extracted entities for job: Data Scientist
Entity: O (PER)
Entity: I (ORG)
Entity: R (ORG)
Entity: Federal (ORG)
Entity: Treasury (ORG)
Entity: IRS (ORG)
Entity: I (ORG)
Entity: RPP (ORG)
Entity: , Applied Analytics and Statistics Data Management (ORG)
Entity: RAAS (ORG)

Extracted entities for job: Chief Data Scientist - Generative AI
Entity: U. S. (LOC)
Entity: P (ORG)
Entity: ##NNL (ORG)
Entity: PNNL (ORG)
Entity: P (ORG)
Entity: ##NNL (ORG)
Entity: National Securit

In [2]:
import time
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException

# Build the Chrome options: command-line switches first, then a custom user agent
chrome_options = Options()
for flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "start-maximized",
    "disable-infobars",
    "--disable-extensions",
):
    chrome_options.add_argument(flag)

# Set custom headers
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

# Start Chrome using the driver binary that webdriver_manager installs
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

def get_job_ids(url, num_pages):
    """Collect job IDs from the results list of a job-search page.

    Opens ``url`` with the module-level ``driver``, then for each of
    ``num_pages`` rounds reads every ``base-card`` element in the results list
    and takes the last ``:``-separated part of its ``data-entity-urn``
    attribute as the job ID. Between rounds the "See more jobs" button is
    clicked to load further cards.

    The results list is re-read in full on every round, so IDs already
    collected are skipped to keep the returned list free of duplicates.

    Args:
        url: Search page to open.
        num_pages: Number of collection rounds.

    Returns:
        list[str]: Unique job IDs in the order they were first seen.
    """
    job_ids = []
    seen_ids = set()  # IDs already collected, for O(1) duplicate checks
    driver.get(url)
    time.sleep(3)  # Allow time for the page to load

    for page in range(num_pages):
        print(f"Collecting job IDs from page {page + 1}")

        # Find all job elements in the left pane. If the list is missing,
        # stop and return what was collected so far instead of raising.
        try:
            job_list_section = driver.find_element(By.CSS_SELECTOR, "section.two-pane-serp-page__results-list ul.jobs-search__results-list")
        except NoSuchElementException as e:
            print(f"Error locating job list: {e}")
            break
        job_elements = job_list_section.find_elements(By.CLASS_NAME, "base-card")

        for job_element in job_elements:
            try:
                urn = job_element.get_attribute("data-entity-urn")
            except NoSuchElementException as e:
                print(f"Error collecting job ID: {e}")
                continue
            if not urn:
                # get_attribute returned nothing usable for this card.
                print("Error parsing job ID: card has no data-entity-urn attribute")
                continue
            # Extract the job ID from the data-entity-urn attribute
            job_id = urn.split(":")[-1]
            if job_id not in seen_ids:
                seen_ids.add(job_id)
                job_ids.append(job_id)

        if page == num_pages - 1:
            # Nothing further will be read, so skip the extra click and wait.
            break

        # Click the "See more jobs" button to load more jobs
        try:
            see_more_button = driver.find_element(By.XPATH, "//button[contains(@aria-label, 'See more jobs')]")
            driver.execute_script("arguments[0].click();", see_more_button)
            time.sleep(3)  # Allow time for more jobs to load
        except NoSuchElementException as e:
            print(f"Error loading more jobs: {e}")
            break

    return job_ids

def scrape_job_details(base_url, job_ids):
    """Open each job on the search page and scrape its details.

    For every ID, ``currentJobId=<id>`` is appended to ``base_url`` and the
    page is loaded with the module-level ``driver``. Title, company, location
    and description (inner HTML) are read from the detail pane. Salary is
    optional and recorded as ``"Not listed"`` when its element is absent.
    A job whose required elements cannot be found is reported with ``print``
    and skipped.

    Args:
        base_url: Search URL to which the job ID parameter is appended.
        job_ids: Iterable of job IDs.

    Returns:
        list[dict]: One dict per scraped job with the keys ``title``,
        ``company``, ``location``, ``description``, ``salary`` and ``job_id``.
    """
    job_data = []
    # Append with '&' when base_url already has a query string, otherwise start one.
    separator = "&" if "?" in base_url else "?"

    for job_id in job_ids:
        job_url = f"{base_url}{separator}currentJobId={job_id}"
        driver.get(job_url)
        print(job_url)
        time.sleep(2)  # Allow time for the job description to load

        try:
            # Locate the job details section correctly
            job_detail_section = driver.find_element(By.CLASS_NAME, "base-serp-page__content")

            title_element = job_detail_section.find_element(By.CSS_SELECTOR, "h2.top-card-layout__title")
            company_element = job_detail_section.find_element(By.CSS_SELECTOR, "a.topcard__org-name-link")
            location_element = job_detail_section.find_element(By.CSS_SELECTOR, "span.topcard__flavor--bullet")
            description_element = job_detail_section.find_element(By.CLASS_NAME, "description__text--rich")
            description = description_element.get_attribute("innerHTML")

            try:
                # By.CLASS_NAME accepts a single class only; an element that
                # carries both classes has to be matched with a CSS selector.
                salary_element = job_detail_section.find_element(By.CSS_SELECTOR, ".salary.compensation__salary")
                salary = salary_element.text
            except NoSuchElementException:
                salary = "Not listed"

            job_data.append({
                'title': title_element.text,
                'company': company_element.text,
                'location': location_element.text,
                'description': description,
                'salary': salary,
                'job_id': job_id
            })
        except NoSuchElementException as e:
            print(f"Error scraping job details: {e}")
        except Exception as e:
            # Best-effort scrape: report anything else and move on to the next job.
            print(f"Unexpected error: {e}")

    return job_data

# Example usage
url = "https://www.linkedin.com/jobs/search/?keywords=Data%20Scientist&location=United%20States&geoId=103644278&position=1&pageNum=0&source=post_page-----4988c7da87ee--------------------------------"
num_pages = 1

# The browser is closed in `finally` so a failure while scraping does not
# leave a headless Chrome process running.
try:
    # Get job IDs
    job_ids = get_job_ids(url, num_pages)

    # Scrape job details
    jobs = scrape_job_details(url, job_ids)
finally:
    driver.quit()

# Save the job data to a JSON file
with open('linkedin_jobs.json', 'w') as file:
    json.dump(jobs, file, indent=4)

print(f"Scraped job data has been saved to linkedin_jobs.json. Total jobs: {len(jobs)}")


Collecting job IDs from page 1
https://www.linkedin.com/jobs/search/?keywords=Data%20Scientist&location=United%20States&geoId=103644278&position=1&pageNum=0&source=post_page-----4988c7da87ee--------------------------------&currentJobId=3966714827
https://www.linkedin.com/jobs/search/?keywords=Data%20Scientist&location=United%20States&geoId=103644278&position=1&pageNum=0&source=post_page-----4988c7da87ee--------------------------------&currentJobId=3959049355
Error scraping job details: Message: no such element: Unable to locate element: {"method":"css selector","selector":".base-serp-page__content"}
  (Session info: chrome-headless-shell=126.0.6478.127); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x006FC1C3+27395]
	(No symbol) [0x00693DC4]
	(No symbol) [0x00591B7F]
	(No symbol) [0x005D2C65]
	(No symbol) [0x005D2D3B]
	(No symbol) [0x0060EC82]
	(No symbol)

: 