<a href="https://colab.research.google.com/github/chikilivighneshshastry/colab_files/blob/main/jobright_data_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extract single-page info from Jobright


In [None]:
import aiohttp
import asyncio

async def extract_job_data(url):
    """Fetch ``url`` with a throwaway aiohttp session and echo the page body.

    The body is read and printed while the session is still open; the
    response object itself is handed back to the caller.
    """
    async with aiohttp.ClientSession() as http:
        response = await http.get(url)
        page_text = await response.text()
        print(page_text)
        return response

# Job page to download with extract_job_data.
url = 'https://jobright.ai/jobs/info/685a54b5be2d7e56476268d'
# url = 'https://jobright.ai/jobs/info'
# Top-level await: valid in a notebook/IPython cell, not in a plain Python script.
response = await extract_job_data(url)

In [None]:
# Bare expression: evaluated, but not shown because it is not the last line of the cell.
response.status
# Read the body of the response returned by extract_job_data as text and echo it.
html_data = await response.text()
print(html_data)

In [None]:
# prompt: parse html_data using bs4 and get the text of the element with a given id

from bs4 import BeautifulSoup
import json
soup = BeautifulSoup(html_data, 'html.parser')

# The JSON payload is looked up by element id '__NEXT_DATA__'
# (presumably the Next.js page-data script -- confirm against the page source).
element_with_id = soup.find(id='__NEXT_DATA__')

if element_with_id is None:
  # Stop here with a clear message. Previously the cell only printed a notice
  # and then failed with a NameError (or silently re-parsed a stale value left
  # over from an earlier run) when json.loads was reached.
  raise ValueError("Element with the specified ID not found.")

detailed_json_data = element_with_id.get_text()
print(detailed_json_data)

# Decode the embedded JSON text into Python objects for the cells below.
data = json.loads(detailed_json_data)
print(data)


In [None]:
# Inspect the top-level layout of the parsed payload.
data.keys()
print(data['props'].keys())
for top_level_key in ('page', 'query', 'buildId', 'isFallback', 'gssp', 'scriptLoader'):
  print(data[top_level_key])

In [None]:
# Print selected entries of the page props, one per line.
for field_name in (
    'baseSalary',
    'jobLocation',
    'logined',
    'jobHashedId',
    '_sentryTraceData',
    '_sentryBaggage',
):
  print(data['props']['pageProps'][field_name])

In [None]:
# Last expression of the cell, so the notebook displays the 'dataSource' entry of the page props.
data['props']['pageProps']['dataSource']

In [None]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import random

# --- Configuration ---
# parse_and_extract_links compares each link's netloc to this value exactly,
# so subdomains and URLs with an explicit port are filtered out.
TARGET_SITE_DOMAIN = "example.com" # To keep crawling within the site
# First URL placed on the frontier by main().
INITIAL_SEED_URL = f"http://{TARGET_SITE_DOMAIN}"

# get_proxy() picks one entry at random per request; an empty list means
# requests go out directly.
PROXY_LIST = [
    "http://198.46.172.102:12345",
    "http://103.85.103.1:5678",
    # ... more proxies
]

# --- Database (Conceptual - replace with actual DB interaction) ---
# In a real scenario, use libraries like psycopg2 (PostgreSQL), mysql.connector, sqlite3, or an ORM like SQLAlchemy
# Read by db_url_exists() and written by db_add_url().
DATABASE_URLS_SEEN = set() # Simple in-memory set for this example; use a real DB!

async def db_url_exists(url):
    """Report whether ``url`` is already recorded in the in-memory URL store."""
    # Stand-in for a real database lookup.
    already_seen = url in DATABASE_URLS_SEEN
    return already_seen

async def db_add_url(url):
    """Record ``url`` in the in-memory URL store and log the insertion."""
    # Stand-in for a real database insert.
    DATABASE_URLS_SEEN.add(url)
    log_line = f"[DB] Added: {url}"
    print(log_line)

# --- Crawler Components ---
# Shared queue of URLs waiting to be crawled; workers both consume from it and add to it.
url_frontier = asyncio.Queue()
# Incremented by a worker after each page whose fetch returned content.
processed_urls_count = 0
# Once the counter reaches this value, workers stop queueing newly found links.
MAX_URLS_TO_CRAWL = 100 # Example limit

async def fetch(session, url, proxy):
    """Download ``url`` through ``session`` and return the body text.

    Args:
        session: aiohttp client session used to issue the GET request.
        url: Absolute URL to download.
        proxy: Proxy URL to route the request through, or ``None`` for a
            direct connection.

    Returns:
        The response body as text when the server answers HTTP 200,
        otherwise ``None`` (non-200 status, network failure, timeout, or a
        URL/body that cannot be handled).
    """
    print(f"[FETCHING] {url} via proxy {proxy if proxy else 'DIRECT'}")
    # ClientTimeout is the documented timeout type; plain numbers are only
    # accepted for backward compatibility.
    request_timeout = aiohttp.ClientTimeout(total=10)
    try:
        # ssl=False skips certificate verification (kept for potential local
        # SSL issues) -- do not rely on it on untrusted networks.
        async with session.get(url, proxy=proxy, timeout=request_timeout, ssl=False) as response:
            if response.status != 200:
                print(f"[ERROR] HTTP {response.status} for {url}")
                return None
            return await response.text()
    except (aiohttp.ClientError, asyncio.TimeoutError, ValueError) as e:
        # ClientError: connection/proxy/protocol failures; TimeoutError: the
        # 10 s budget elapsed; ValueError: malformed URL or undecodable body.
        print(f"[ERROR] Failed to fetch {url}: {e}")
        return None

def get_proxy():
    """Pick a random proxy from ``PROXY_LIST``; ``None`` when the list is empty."""
    return random.choice(PROXY_LIST) if PROXY_LIST else None

def parse_and_extract_links(html_content, base_url):
    """Collect the same-site links found in an HTML document.

    Every ``<a href=...>`` is resolved against ``base_url``, stripped of its
    fragment, and kept only when its network location equals
    ``TARGET_SITE_DOMAIN``. Falsy ``html_content`` yields an empty set.
    """
    if not html_content:
        return set()

    document = BeautifulSoup(html_content, 'lxml')  # 'html.parser' is a built-in alternative
    # Absolute, fragment-free form of every anchor target on the page.
    candidates = (
        urlparse(urljoin(base_url, anchor['href']))._replace(fragment="").geturl()
        for anchor in document.find_all('a', href=True)
    )
    # Only URLs on the target domain are worth crawling.
    return {
        candidate
        for candidate in candidates
        if urlparse(candidate).netloc == TARGET_SITE_DOMAIN
    }

async def worker(name, session):
    """Consume URLs from ``url_frontier`` until the crawl limit is reached.

    For each dequeued URL the worker skips it if it is already recorded,
    otherwise records it, fetches it through a proxy chosen by
    ``get_proxy()`` and queues the unseen same-site links found on the page.

    Args:
        name: Label used as a prefix in this worker's log lines.
        session: aiohttp client session shared by all workers.

    The loop ends on its own once ``MAX_URLS_TO_CRAWL`` pages have been
    processed and the queue is empty; otherwise it runs until cancelled.
    """
    global processed_urls_count
    while True:
        # Kept outside the try block: if get() itself fails or is cancelled,
        # no item was taken, so task_done() must not be called.
        current_url = await url_frontier.get()
        try:
            if processed_urls_count >= MAX_URLS_TO_CRAWL:
                # Limit reached: drain leftover URLs without fetching them so
                # the limit is honoured and url_frontier.join() can return.
                continue

            print(f"[{name}] Processing: {current_url}")

            if await db_url_exists(current_url):
                print(f"[{name}] Already processed/in DB: {current_url}")
                continue

            await db_add_url(current_url) # Add to DB before fetching (or mark as being processed)

            # TODO: Implement robots.txt check here

            html = await fetch(session, current_url, get_proxy())

            if html:
                for link in parse_and_extract_links(html, current_url):
                    if processed_urls_count >= MAX_URLS_TO_CRAWL:
                        break  # stop queueing once the limit is hit
                    if not await db_url_exists(link):
                        await url_frontier.put(link)
                        print(f"[{name}] Queued new link: {link}")

                processed_urls_count += 1
                if processed_urls_count >= MAX_URLS_TO_CRAWL:
                    print(f"[{name}] Reached max URL limit. Draining queue...")
        except Exception as e:
            # Best effort: one bad URL must not kill the worker.
            print(f"[{name}] Error in worker: {e}")
            continue # Continue to next URL
        finally:
            # Exactly one task_done() per successful get(), on every path
            # (success, skip, error, cancellation). A second call would raise
            # ValueError and break url_frontier.join().
            url_frontier.task_done()

        if processed_urls_count >= MAX_URLS_TO_CRAWL and url_frontier.empty():
            break # Exit worker if limit reached and queue is empty

        await asyncio.sleep(1) # Be respectful: add a small delay

async def main():
    """Seed the frontier, run the worker pool and report the crawl totals.

    The seed URL is only queued, not registered as seen up front. Workers
    skip any URL that is already recorded, so registering the seed here would
    make the first worker discard it and end the crawl without fetching
    anything.
    """
    await url_frontier.put(INITIAL_SEED_URL)

    # You might want a ClientSession per proxy type or a more sophisticated setup
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: # ssl=False for local dev; use proper SSL context in prod
        num_workers = 5 # Number of concurrent crawlers
        tasks = [
            asyncio.create_task(worker(f"Worker-{i+1}", session))
            for i in range(num_workers)
        ]

        try:
            # Returns once every queued URL has been taken and marked done.
            await url_frontier.join()

            if processed_urls_count >= MAX_URLS_TO_CRAWL:
                print("Max URL limit reached. Cancelling worker tasks...")
        finally:
            # Workers loop forever on an empty queue, so they are always
            # cancelled -- also when join() itself is interrupted.
            for task in tasks:
                task.cancel()
            await asyncio.gather(*tasks, return_exceptions=True) # Wait for tasks to be cancelled

    print("Crawling finished.")
    print(f"Total unique URLs seen (from in-memory set): {len(DATABASE_URLS_SEEN)}")

if __name__ == "__main__":
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # Plain script: no event loop is running yet, so start one.
        asyncio.run(main())
    else:
        # Notebook kernels (Jupyter/Colab) already run an event loop, where
        # asyncio.run() raises RuntimeError. Schedule the crawl on that loop
        # instead; `await crawl_task` in a later cell to wait for it.
        crawl_task = asyncio.ensure_future(main())

In [None]:
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy

# Deep-crawl strategy object built from crawl4ai's BFSDeepCrawlStrategy;
# the inline notes describe each limit.
strategy = BFSDeepCrawlStrategy(
    max_depth=2,               # Crawl initial page + 2 levels deep
    include_external=False,    # Stay within the same domain
    max_pages=50,              # Maximum number of pages to crawl (optional)
    score_threshold=0.3,       # Minimum score for URLs to be crawled (optional)
)

In [None]:
!pip install crawl4ai

In [None]:
!crawl4ai-setup

In [None]:
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.deep_crawling.filters import (
    FilterChain,
    DomainFilter,
    URLPatternFilter,
    ContentTypeFilter
)
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer

async def run_advanced_crawler():
    """Deep-crawl jobright.ai with a best-first strategy and summarise the results.

    Results are consumed as they are streamed; depth, score and URL are
    printed for each page, followed by the total, the average score and a
    per-depth page count.

    Returns:
        list: The crawl results in the order they were received.
    """
    # Restrict the crawl to jobright.ai and to HTML responses.
    filter_chain = FilterChain([
        # Domain boundaries
        DomainFilter(
            allowed_domains=["jobright.ai"]
            # blocked_domains=["old.docs.example.com"]
        ),

        # URL patterns to include
        # URLPatternFilter(patterns=["*guide*", "*tutorial*", "*blog*"]),

        # Content type filtering
        ContentTypeFilter(allowed_types=["text/html"])
    ])

    # `deep_crawl_strategy` used to be passed twice (an unconfigured
    # BFSDeepCrawlStrategy and the strategy below) with no comma in between,
    # which is a syntax error. Only the fully configured strategy is kept.
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=2,
            include_external=False,
            filter_chain=filter_chain
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True
    )

    # Execute the crawl; the awaited arun() result is iterated asynchronously.
    results = []
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun("https://jobright.ai/jobs/info", config=config):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"Depth: {depth} | Score: {score:.2f} | {result.url}")

    # Analyze the results
    print(f"Crawled {len(results)} high-value pages")
    if results:
        average_score = sum(r.metadata.get('score', 0) for r in results) / len(results)
        print(f"Average score: {average_score:.2f}")
    else:
        # Avoid dividing by zero when the crawl produced nothing.
        print("Average score: n/a (no pages crawled)")

    # Group by depth
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1

    print("Pages crawled by depth:")
    for depth, count in sorted(depth_counts.items()):
        print(f"  Depth {depth}: {count} pages")
    return results
# Top-level await: valid in a notebook/IPython cell, not in a plain Python script.
if __name__ == "__main__":
    results = await run_advanced_crawler()


In [None]:
# Sketch of the planned pipeline: build one listing URL per
# (category, parameter) pair, collect its job-post URLs and persist them.
# NOTE(review): prepare_url, get_all_jobpost_urls and save_to_db are not
# defined in the visible part of this notebook -- confirm before filling the lists.
base_url =''
end_word_categorys = []
parmeters = []
result_urls = []
# Iterate the list defined above (the loop previously referenced the
# undefined name `end_categorys`, which raised NameError).
for name in end_word_categorys:
  for para in parmeters:
    url = prepare_url(base_url,name,para)
    result_urls = get_all_jobpost_urls(url)
    save_to_db(result_urls)



In [None]:
!pip install scrapy

In [None]:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings # Useful for default settings

# This is often needed if running in Jupyter/IPython to avoid "ReactorNotRestartable"
import nest_asyncio
nest_asyncio.apply()

# --- Your Spider Definition ---
class AllLinksSpider(CrawlSpider):
    """Crawl jobright.ai and record every in-domain link found on each page.

    Each followed page is handed to ``parse_page_links``, which yields one
    ``{'source_page_url', 'extracted_link'}`` dict per in-domain anchor and
    also appends it to the list passed as the ``output_list`` keyword.
    """
    name = 'all_links_scraper_cell' # Changed name slightly to avoid clashes if you have the other one
    allowed_domains = ['jobright.ai']
    # start_urls = ['https://jobright.ai/jobs/backenddeveloper/']
    start_urls = ['https://jobright.ai/jobs/backenddeveloper'] # More general starting point
    # Counter for processed URLs or items
    # NOTE(review): these two attributes are declared but never read or
    # updated inside this class.
    processed_url_count = 0
    MAX_URLS_TO_PROCESS = 100
    # Custom settings for this spider if needed (can also be passed to CrawlerProcess)
    custom_settings = {
        'LOG_LEVEL': 'INFO', # 'DEBUG' for more verbosity
        'DOWNLOAD_DELAY': 0.25,  # Be respectful
        'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
        # 'DEPTH_LIMIT': 2 # Uncomment to limit crawl depth
    }

    rules = (
        Rule(
            LinkExtractor(
                allow_domains=['jobright.ai'],
                deny=(
                    r'/login', r'/register', r'/password', # Example patterns to avoid
                    r'mailto:', r'tel:', # Avoid mail and tel links
                )
            ),
            callback='parse_page_links',
            follow=True
        ),
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Get the list passed from the CrawlerProcess or default to an empty list
        self.collected_links_list = kwargs.get('output_list', [])

    def _is_allowed_link(self, url):
        """Return True when ``url``'s host is an allowed domain or a subdomain of one."""
        from urllib.parse import urlparse  # local import keeps this cell self-contained

        host = (urlparse(url).hostname or '').lower()
        # Exact or dot-suffix match; a plain substring test would also accept
        # hosts such as 'notjobright.ai' or 'jobright.ai.example.org'.
        return any(
            host == domain or host.endswith('.' + domain)
            for domain in (self.allowed_domains or ())
        )

    def parse_page_links(self, response):
        """Yield one record per in-domain link found on ``response``."""
        self.logger.info(f"Processing page: {response.url}")

        for link_href in response.css('a::attr(href)').getall():
            absolute_link = response.urljoin(link_href)
            # Re-check the domain in case a link slipped past the LinkExtractor.
            # The stdlib is used here; the previous check depended on
            # scrapy.utils.url.get_domain (NOTE(review): this does not appear
            # to be part of Scrapy's API -- confirm).
            if not self._is_allowed_link(absolute_link):
                continue
            link_data = {
                'source_page_url': response.url,
                'extracted_link': absolute_link
            }
            # Append to the list provided during initialization
            self.collected_links_list.append(link_data)
            # Still yield if you want to use Scrapy's feed exporters or other pipelines
            yield link_data

In [None]:
# Shared list that the spider fills in while it crawls.
scraped_links_data = []

# --- Configure and Run the Crawler ---
# Start from the project settings (defaults when there is no settings.py),
# then override what this run needs.
settings = get_project_settings()
settings.set('USER_AGENT', 'MyCustomBot/1.0 (+http://mywebsite.com/botinfo)')
# To also write a file through Scrapy's feed exporters:
# settings.set('FEEDS', {
#     'output_links.json': {'format': 'json', 'overwrite': True},
# })

# CrawlerProcess accepts either a Settings object or a plain dictionary.
process = CrawlerProcess(settings=settings)

# 'output_list' reaches the spider's __init__ through its keyword arguments.
process.crawl(AllLinksSpider, output_list=scraped_links_data)

# start() blocks until the whole crawl has finished.
print("Starting Scrapy process...")
process.start()
print("Scrapy process finished.")

# --- Report what was collected ---
print(f"\n--- Collected {len(scraped_links_data)} link entries: ---")
for item in scraped_links_data:
    print(f"From: {item['source_page_url']} -> Found: {item['extracted_link']}")
unique_extracted_links = {item['extracted_link'] for item in scraped_links_data}

# Flat, sorted list of the unique URLs; also used for the printout below.
final_unique_links_list = sorted(unique_extracted_links)

print(f"\n--- {len(unique_extracted_links)} Unique Extracted Links: ---")
for link in final_unique_links_list:
    print(link)

# print("\nFinal flat list of unique links:")
# print(final_unique_links_list)

# Extract job posts from Jobright through an API-like URL

In [None]:
!pip install nest_asyncio

In [12]:
category = {
  "job_category": {
    "Software & IT": [
      "Backend Engineer",
      "Java Engineer",
      "Python Engineer",
      ".Net Engineer",
      "C/C++ Engineer",
      "Golang Engineer",
      "Full Stack Engineer",
      "Blockchain Engineer",
      "Salesforce Developer",
      "Frontend Software Engineer",
      "React Developer",
      "UI/UX Developer",
      "iOS/Swift Developer",
      "Android Developer",
      "Flutter Developer",
      "Unity Developer",
      "Unreal Engine Developer",
      "AR/VR Developer",
      "Game Developer",
      "Software Testing/Quality Assurance Engineer",
      "Automation Test Engineer",
      "QA Manager",
      "Network Security Engineer",
      "Cloud Security Engineer",
      "Cyber Security Analyst",
      "Cyber Security Engineer",
      "Network Engineer",
      "Systems Engineer",
      "Site Reliability Engineer (SRE)",
      "DevOps",
      "SoC Analyst",
      "IT Support Specialist",
      "Help Desk Technician/Desktop Support Technician",
      "System Administrator",
      "Network Support Specialist",
      "Salesforce Administrator",
      "Database Administrator",
      "Machine Learning Engineer",
      "AI Engineer",
      "LLM Engineer",
      "Machine Learning/AI Researcher",
      "Machine Learning, Deep Learning",
      "Machine Learning, Model Training and Inference",
      "Machine Learning, Search System",
      "Machine Learning, Ads",
      "Machine Learning, Operations (ML Ops)",
      "Machine Learning, Infrastructure",
      "Machine Learning, Computer Vision",
      "Data Annotation/AI Tutor",
      "Sales Engineer",
      "Developer Relations",
      "Solutions Architect",
      "Technical Writing",
      "Data Analyst",
      "Data Scientist",
      "Data Engineer",
      "ETL Developer",
      "Data Warehouse Engineer",
      "Business/BI Analyst",
      "Power BI Developer",
      "Engineering Manager",
      "Software Architect",
      "Engineering Director/VP",
      "CTO",
      "Project/Program Manager",
      "Technical Project Manager",
      "Scrum Master"
    ],
    "Hardware & Electrical Engineering": [
      "Electronics Engineer",
      "Hardware Engineer",
      "Embedded Software Engineer",
      "ASIC Engineer",
      "FPGA Engineer",
      "RF (Radio Frequency) Engineer",
      "PCB Engineer",
      "Systems Integration Engineer",
      "IC Design Engineer",
      "Digital IC Verification Engineer",
      "Analog IC Design Engineer",
      "Electrical Engineer",
      "Automation Engineer",
      "Electromechanical Engineer",
      "Robotics Engineer",
      "Controls Engineer",
      "Electrical Test Engineer",
      "Hardware Test Engineer",
      "Project/Program Manager",
      "Telecommunications Engineer",
      "Network Engineer",
      "Wireless/Antenna Engineer",
      "Battery Engineer",
      "Motor Engineer",
      "Aerospace Engineer",
      "Sales Engineer",
      "Solutions Architect"
    ],
    "Mechanical & Industrial Engineering": [
      "Mechanical Engineer",
      "Manufacturing Engineer",
      "Process Engineer",
      "Industrial Engineer",
      "Mechatronics Engineer",
      "Operations Manager/Director",
      "Safety Engineer",
      "Chemical Engineer",
      "Laboratory Technician",
      "Automotive Engineer",
      "Powertrain Engineer",
      "Autonomous Driving System Engineer",
      "Quality Assurance Specialist",
      "EHS (Environment, Health, Safety) Engineer",
      "Project/Program Manager"
    ],
    "Product Management": [
      "Product Analyst",
      "Product Manager",
      "Technical Product Manager",
      "Product Manager, Consumer Software",
      "Product Manager, B2B/SaaS",
      "Product Manager, Hardware/Robotics/IoT",
      "AI Product Manager",
      "Game Designer"
    ],
    "Customer Service & Success": [
      "Customer Service Representative",
      "Customer Service Manager",
      "Customer Support",
      "Customer Success"
    ],
    "Sales": [
      "Sales Development Representative",
      "Inside Sales Representative",
      "Account Executive, SMB",
      "Field Sales Representative",
      "Enterprise Sales",
      "Channel Sales",
      "Business Development",
      "Partnership",
      "Sales Manager",
      "Regional Sales Manager",
      "Sales Director/VP",
      "Automotive Sales",
      "Real Estate Sales",
      "Leasing Manager",
      "Retail Sales",
      "Store Manager",
      "Medical Sales",
      "Medical Device Sales",
      "Financial Advisor",
      "Insurance Sales",
      "Sales Support",
      "Sales Operations Specialist"
    ],
    "HR, Admin & Legal": [
      "Human Resource Specialist",
      "Recruiter/Sourcer",
      "Recruiting Coordinator",
      "Payroll Specialist",
      "HR Business Partner",
      "Human Resource Manager/Director",
      "Administrative Assistant",
      "Executive Assistant",
      "Chief of Staff",
      "Office Manager",
      "Receptionist",
      "Data Entry Clerk",
      "Corporate Counsel",
      "Paralegal",
      "Legal Assistant",
      "Litigation Lawyer",
      "Intellectual Property Lawyer",
      "Criminal Lawyer",
      "Family Lawyer",
      "Immigration Lawyer",
      "Compliance Specialist",
      "Risk Analyst",
      "Court Clerk",
      "Case Manager",
      "Legal Operations Manager"
    ],
    "Finance & Accounting": [
      "Accountant",
      "Controller",
      "Tax Specialist",
      "Auditor",
      "Corporate Finance Analyst",
      "Treasury",
      "Financial Analyst",
      "Risk Analyst",
      "Securities Trader",
      "Quantitative Analyst/Researcher",
      "Investment Manager",
      "Equity Analyst",
      "Asset Manager",
      "Portfolio Manager",
      "Commercial Banker",
      "Investment Banker",
      "Credit Analyst",
      "Loan Officer",
      "Investment Analyst/Associate",
      "Investment Direct/VP",
      "Investment Partner",
      "Portfolio Operations Manager",
      "Fundraising Manager",
      "Investor Relations Manager",
      "Actuary",
      "Underwriter"
    ],
    "Design & Creative": [
      "Graphic Designer",
      "UI Designer",
      "UX Designer",
      "UX Researcher",
      "3D Designer",
      "Animator",
      "Illustrator",
      "Video Editor",
      "Creative/Art Director",
      "Motion Designer",
      "Interior Designer",
      "Landscape Designer",
      "Industrial Designer"
    ],
    "Real Estate & Construction": [
      "Leasing Consultant",
      "Property Manager",
      "Architect",
      "Landscape Architect",
      "Urban Planner",
      "Construction Project Manager",
      "Civil Engineer",
      "Structural Engineer"
    ],
    "Marketing & Communications": [
      "Content Marketing/Strategy",
      "SEO",
      "Social Media Management",
      "Copywriter",
      "Product Marketing",
      "Brand Manager",
      "Public Relations",
      "Community Manager",
      "Event Marketing Specialist",
      "Growth Marketing",
      "Advertising Specialist",
      "Performance Marketing",
      "Lifecycle Marketing",
      "Email Marketing"
    ],
    "Supply Chain & Operations": [
      "Supply Chain Manager",
      "Inventory Manager",
      "Logistics Manager",
      "Warehouse Manager",
      "Distribution Center Manager",
      "Procurement Manager",
      "Facilities Manager"
    ],
    "Consulting": [
      "IT Consultant",
      "Business Analyst",
      "Data Consultant",
      "Cyber Security Consultant",
      "Business Strategy Consultant",
      "Market Research Analyst",
      "Change Management Consultant",
      "Operations Consultant",
      "Financial Consultant",
      "Risk Management Consultant",
      "Mergers & Acquisitions (M&A) Consultant"
    ],
    "Energy & Environmental": [
      "Energy Engineer",
      "Renewable Energy Engineer",
      "Nuclear Engineer",
      "Power Systems Engineer",
      "Environmental Engineer",
      "Environmental Scientist"
    ],
    "Education & Training": [
      "K-12 Teaching",
      "Higher Education Teaching",
      "Corporate Training and Development",
      "Educational Administration",
      "Academic Dean"
    ],
    "Healthcare & Life Sciences": [
      "Healthcare Data Analyst",
      "Healthcare Data Scientist",
      "Healthcare IT Specialist",
      "EHR (Electronic Health Records) System Administrator",
      "Biomedical Engineer",
      "Clinical Engineer",
      "Biomedical Equipment Technician",
      "Biologist",
      "Pharmacologist",
      "Chemist",
      "Biochemist",
      "Formulation Scientist",
      "Toxicologist",
      "DMPK Scientist",
      "Clinical Research Scientist",
      "Clinical Research Associate",
      "Biostatistician",
      "Regulatory Affairs Specialist",
      "Medical Writer",
      "Health Product Manager",
      "Clinical Operations Manager",
      "Healthcare Compliance Manager",
      "Healthcare Quality Improvement Specialist"
    ],
    "Government & Non-Profit": [
      "Government Relations Manager",
      "Policy Analyst",
      "Program Manager",
      "Fundraising Coordinator",
      "Volunteer Coordinator"
    ]
  }
}

In [14]:
# Flatten the per-sector lists of job titles into one list, keeping their order.
job_cate_list = [
    title
    for titles in category['job_category'].values()
    for title in titles
]
print(job_cate_list)
print(len(job_cate_list))

['Backend Engineer', 'Java Engineer', 'Python Engineer', '.Net Engineer', 'C/C++ Engineer', 'Golang Engineer', 'Full Stack Engineer', 'Blockchain Engineer', 'Salesforce Developer', 'Frontend Software Engineer', 'React Developer', 'UI/UX Developer', 'iOS/Swift Developer', 'Android Developer', 'Flutter Developer', 'Unity Developer', 'Unreal Engine Developer', 'AR/VR Developer', 'Game Developer', 'Software Testing/Quality Assurance Engineer', 'Automation Test Engineer', 'QA Manager', 'Network Security Engineer', 'Cloud Security Engineer', 'Cyber Security Analyst', 'Cyber Security Engineer', 'Network Engineer', 'Systems Engineer', 'Site Reliability Engineer (SRE)', 'DevOps', 'SoC Analyst', 'IT Support Specialist', 'Help Desk Technician/Desktop Support Technician', 'System Administrator', 'Network Support Specialist', 'Salesforce Administrator', 'Database Administrator', 'Machine Learning Engineer', 'AI Engineer', 'LLM Engineer', 'Machine Learning/AI Researcher', 'Machine Learning, Deep Lea

In [28]:
import requests

cookies = {
    '_hjSessionUser_6388958': 'eyJpZCI6IjQxOGUwNzcwLWE3YzAtNTBmOC05NDE4LTAxYjQwNzFkNDYwZiIsImNyZWF0ZWQiOjE3NTA4Mjk4MjE5MTQsImV4aXN0aW5nIjpmYWxzZX0=',
    '_hjSession_6388958': 'eyJpZCI6IjYzNDlmYTVmLTlkNTEtNDU2YS04MDFiLWEwNTljNTRiYmY2NyIsImMiOjE3NTA4Mjk4MjE5MTUsInMiOjAsInIiOjAsInNiIjowLCJzciI6MCwic2UiOjAsImZzIjoxfQ==',
    '_uetsid': '6b81f720518611f0981ea504cec8251d',
    '_uetvid': '6b823070518611f09fcff1af0cf89f08',
    '_gcl_au': '1.1.1167952771.1750829822',
    '_clck': 'w0flwz%7C2%7Cfx2%7C0%7C2002',
    '_ga': 'GA1.1.1812497396.1750829822',
    '_ga_ETKKWETCJD': 'GS2.1.s1750829822$o1$g0$t1750829822$j60$l0$h928065022',
    '_tt_enable_cookie': '1',
    '_ttp': '01JYJSDX1B6KS3XCK7GN8QCHB8_.tt.1',
    'ttcsid': '1750829823024::gtewOAeumkry7EaMvHXD.1.1750829823024',
    '_clsk': 'wpanqj%7C1750829823973%7C1%7C1%7Cb.clarity.ms%2Fcollect',
    'ttcsid_CM0IJ53C77U0797CAP10': '1750829823023::jSB6o9Ve1MM-ssaV1TPo.1.1750829823975',
}

headers = {
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'en-US,en;q=0.9,en-IN;q=0.8',
    'baggage': 'sentry-environment=production,sentry-release=pigeon_production%40v0.0.819,sentry-public_key=5f46138160b2461b9e0fb4bb1cc803bc,sentry-trace_id=81177d03883d49d0861aa611e3092220,sentry-sample_rate=0.01,sentry-transaction=%2Fjobs%2F%5Bvisit%5D,sentry-sampled=false',
    'content-type': 'application/json',
    'dnt': '1',
    'origin': 'https://jobright.ai',
    'priority': 'u=1, i',
    'referer': 'https://jobright.ai/jobs/back',
    'sec-ch-ua': '"Not)A;Brand";v="8", "Chromium";v="138", "Microsoft Edge";v="138"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'sentry-trace': '81177d03883d49d0861aa611e3092220-b7f6993eb0a7f9ee-0',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0',
    'x-client-type': 'web',
    # 'cookie': '_hjSessionUser_6388958=eyJpZCI6IjQxOGUwNzcwLWE3YzAtNTBmOC05NDE4LTAxYjQwNzFkNDYwZiIsImNyZWF0ZWQiOjE3NTA4Mjk4MjE5MTQsImV4aXN0aW5nIjpmYWxzZX0=; _hjSession_6388958=eyJpZCI6IjYzNDlmYTVmLTlkNTEtNDU2YS04MDFiLWEwNTljNTRiYmY2NyIsImMiOjE3NTA4Mjk4MjE5MTUsInMiOjAsInIiOjAsInNiIjowLCJzciI6MCwic2UiOjAsImZzIjoxfQ==; _uetsid=6b81f720518611f0981ea504cec8251d; _uetvid=6b823070518611f09fcff1af0cf89f08; _gcl_au=1.1.1167952771.1750829822; _clck=w0flwz%7C2%7Cfx2%7C0%7C2002; _ga=GA1.1.1812497396.1750829822; _ga_ETKKWETCJD=GS2.1.s1750829822$o1$g0$t1750829822$j60$l0$h928065022; _tt_enable_cookie=1; _ttp=01JYJSDX1B6KS3XCK7GN8QCHB8_.tt.1; ttcsid=1750829823024::gtewOAeumkry7EaMvHXD.1.1750829823024; _clsk=wpanqj%7C1750829823973%7C1%7C1%7Cb.clarity.ms%2Fcollect; ttcsid_CM0IJ53C77U0797CAP10=1750829823023::jSB6o9Ve1MM-ssaV1TPo.1.1750829823975',
}

params = {
    'sortCondition': '0',
    'count': '2000',
    'position': '0',
}

json_data = {
    'jobTitle': 'Backend Engineer',
    'city': 'Within US',
    'jobTypes': [
        1,
        2,
        3,
        4,
    ],
    'seniority': [
        1,
        2,
        3,
        4,
        5,
        6,
    ],
    'workModel': [
        1,
        2,
        3,
    ],
    'radiusRange': 1000,
    'position': 0,
    'count': 1000,
}

# POST the search filters (json_data) with the paging query string (params).
# requests never times out on its own, so cap the wait; otherwise a stalled
# connection would leave this cell hanging indefinitely.
response = requests.post(
    'https://jobright.ai/swan/recommend/visitor-list/jobs',
    params=params,
    # cookies=cookies,
    headers=headers,
    json=json_data,
    timeout=30,
)

print(response)
# Note: json_data will not be serialized by requests
# exactly as it was in the original request.
#data = '{"jobTitle":"Back","city":"Within US","jobTypes":[1,2,3,4],"seniority":[1,2,3,4,5,6],"workModel":[1,2,3],"radiusRange":50,"position":20,"count":20}'
#response = requests.post(
#    'https://jobright.ai/swan/recommend/visitor-list/jobs',
#    params=params,
#    cookies=cookies,
#    headers=headers,
#    data=data,
#)

<Response [200]>


In [29]:
import json
# Decode the JSON body into a plain dict and echo it back as compact JSON.
payload = response.json()
data = dict(payload)
print(json.dumps(data))



In [30]:
# Print how many job entries came back; the id of the first one is the
# cell's last expression, so the notebook displays it.
job_list = data['result']['jobList']
print(len(job_list))
job_list[0]['jobResult']['jobId']


1557


'685db0ade59184cc073340c7'