<a href="https://colab.research.google.com/github/chikilivighneshshastry/colab_files/blob/main/jobright_data_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extract single-page info from Jobright


In [None]:
import aiohttp
import asyncio

async def extract_job_data(url):
  """Fetch *url*, print the page body and return the aiohttp response.

  Args:
    url: Address of the page to download.

  Returns:
    The response object, with its body already read inside this function.
    NOTE(review): later cells call ``response.text()`` again on the returned
    object; this relies on aiohttp keeping the body it has already read --
    confirm against the aiohttp version in use.
  """
  async with aiohttp.ClientSession() as session:
    # Use the request as a context manager so the connection is released
    # even if reading or decoding the body raises.
    async with session.get(url) as response:
      html = await response.text()
      print(html)
      return response

# Job page to download; the commented-out line below is an alternative target.
url = 'https://jobright.ai/jobs/info/685a54b5be2d7e56476268d'
# url = 'https://jobright.ai/jobs/info'
# Top-level ``await``: this statement only works where an event loop is already
# running (e.g. a notebook cell); it is a syntax error in a plain script.
response = await extract_job_data(url)

In [None]:
# Bare expression whose value is not assigned or printed.
# NOTE(review): a notebook presumably shows only a cell's last expression, so this displays nothing.
response.status
# Read the body from the response returned by extract_job_data() and keep it for parsing.
html_data = await response.text()
print(html_data)

In [None]:
# prompt: parse html_data using bs4 and get with text in a id

from bs4 import BeautifulSoup
import json
soup = BeautifulSoup(html_data, 'html.parser')

# The page's embedded JSON payload is looked up by the element id '__NEXT_DATA__'.
element_with_id = soup.find(id='__NEXT_DATA__')

if not element_with_id:
  # Stop here with a clear message; continuing would only fail later with a
  # NameError because there is no JSON text to parse.
  raise ValueError("Element with the specified ID not found.")

detailed_json_data = element_with_id.get_text()
print(detailed_json_data)

# Decode the element's text into Python objects for the inspection cells below.
data = json.loads(detailed_json_data)
print(data)


In [None]:
# Explore the top-level layout of the parsed payload.
data.keys()
print(data['props'].keys())
# Print each remaining top-level entry in a fixed order.
for top_level_key in ('page', 'query', 'buildId', 'isFallback', 'gssp', 'scriptLoader'):
  print(data[top_level_key])

In [None]:
# Print selected fields stored under props -> pageProps, one per line.
for page_prop_key in (
  'baseSalary',
  'jobLocation',
  'logined',
  'jobHashedId',
  '_sentryTraceData',
  '_sentryBaggage',
):
  print(data['props']['pageProps'][page_prop_key])

In [None]:
# Evaluate the 'dataSource' entry of pageProps as the cell's only expression.
# NOTE(review): presumably relies on the notebook echoing the value -- nothing is printed explicitly.
data['props']['pageProps']['dataSource']

In [None]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import random

# --- Configuration ---
# Placeholder domain. parse_and_extract_links() keeps only links whose netloc
# equals this string exactly.
TARGET_SITE_DOMAIN = "example.com" # To keep crawling within the site
# First URL that main() places on the frontier.
INITIAL_SEED_URL = f"http://{TARGET_SITE_DOMAIN}"

# Candidate proxies. get_proxy() picks one at random for each request and
# returns None when this list is empty. The second entry embeds credentials
# in the URL.
PROXY_LIST = [
    "http://proxy1.com:8080",
    "http://user:pass@proxy2.com:3128",
    # ... more proxies
]

# --- Database (Conceptual - replace with actual DB interaction) ---
# In a real scenario, use libraries like psycopg2 (PostgreSQL), mysql.connector, sqlite3, or an ORM like SQLAlchemy
# Stand-in store: db_url_exists() reads it and db_add_url() writes to it.
DATABASE_URLS_SEEN = set() # Simple in-memory set for this example; use a real DB!

async def db_url_exists(url):
    """Return True when *url* is already present in ``DATABASE_URLS_SEEN``.

    Declared ``async`` although it awaits nothing, so callers must ``await`` it.
    """
    # Simulate DB check
    return url in DATABASE_URLS_SEEN

async def db_add_url(url):
    """Record *url* in ``DATABASE_URLS_SEEN`` and print a ``[DB] Added`` line.

    Adding a URL that is already present leaves the set unchanged but still
    prints the message.
    """
    # Simulate DB add
    DATABASE_URLS_SEEN.add(url)
    print(f"[DB] Added: {url}")

# --- Crawler Components ---
# Queue of URLs waiting to be crawled; main() seeds it and worker() consumes and refills it.
url_frontier = asyncio.Queue()
# Number of pages fetched successfully so far; worker() increments it through ``global``.
processed_urls_count = 0
# worker() stops queueing new links once processed_urls_count reaches this value.
MAX_URLS_TO_CRAWL = 100 # Example limit

def _redact_proxy_credentials(proxy):
    """Return *proxy* with any ``user:password@`` part replaced by ``***@``."""
    parts = urlparse(proxy)
    if "@" not in parts.netloc:
        return proxy
    # Everything after the last '@' is host[:port]; everything before it is userinfo.
    host_and_port = parts.netloc.rpartition("@")[2]
    return parts._replace(netloc=f"***@{host_and_port}").geturl()


async def fetch(session, url, proxy):
    """Download *url* through *session* and return the body text, or None on failure.

    Args:
        session: Object whose ``get(url, proxy=..., timeout=..., ssl=...)`` returns
            an async context manager yielding a response with ``status`` and ``text()``.
        url: Address to download.
        proxy: Proxy URL handed to ``session.get`` unchanged, or None for a direct request.

    Returns:
        The response text when the status is 200; None for any other status or
        when the request raises.
    """
    try:
        # Log the proxy without its credentials so secrets do not end up in output.
        shown_proxy = _redact_proxy_credentials(proxy) if proxy else 'DIRECT'
        print(f"[FETCHING] {url} via proxy {shown_proxy}")
        async with session.get(url, proxy=proxy, timeout=10, ssl=False) as response: # Added ssl=False for potential local SSL issues
            if response.status == 200:
                return await response.text()
            print(f"[ERROR] HTTP {response.status} for {url}")
            return None
    except Exception as e:
        # Deliberately broad: one bad URL must not stop the calling worker.
        print(f"[ERROR] Failed to fetch {url}: {e}")
        return None

def get_proxy():
    """Return a randomly chosen entry of ``PROXY_LIST``, or None when the list is empty."""
    return random.choice(PROXY_LIST) if PROXY_LIST else None

def parse_and_extract_links(html_content, base_url):
    """Collect the on-site links found in *html_content*.

    Args:
        html_content: HTML text of a page; a falsy value yields an empty set.
        base_url: URL the page was loaded from, used to resolve relative hrefs.

    Returns:
        A set of absolute URLs with the fragment removed, limited to those
        whose netloc equals ``TARGET_SITE_DOMAIN``.
    """
    if not html_content:
        return set()

    document = BeautifulSoup(html_content, 'lxml') # 'html.parser' is a built-in alternative

    # Resolve every href against the page URL and drop the '#fragment' part.
    candidates = (
        urlparse(urljoin(base_url, anchor['href']))._replace(fragment="").geturl()
        for anchor in document.find_all('a', href=True)
    )

    # Keep only links that stay on the target domain.
    return {
        candidate
        for candidate in candidates
        if urlparse(candidate).netloc == TARGET_SITE_DOMAIN
    }

async def worker(name, session):
    """Consume URLs from ``url_frontier`` until the crawl limit is reached and the queue is empty.

    For each dequeued URL the worker skips it if it is already recorded,
    otherwise records it, fetches it through a proxy chosen by ``get_proxy()``,
    and queues the on-site links found in the page. Each successful fetch
    increments the module-level ``processed_urls_count``.

    Args:
        name: Label used as the prefix of this worker's log lines.
        session: HTTP session object passed through to ``fetch``.

    Every dequeued URL is matched by exactly one ``task_done()`` call (in the
    ``finally`` clause), so ``url_frontier.join()`` cannot hang on a missed
    call or fail on a doubled one.
    """
    global processed_urls_count
    while True:
        current_url = await url_frontier.get()
        try:
            if processed_urls_count >= MAX_URLS_TO_CRAWL:
                # Limit already reached: discard what is still queued instead
                # of fetching it, so the crawl does not overshoot the limit.
                print(f"[{name}] Limit reached, skipping: {current_url}")
                continue

            print(f"[{name}] Processing: {current_url}")

            if await db_url_exists(current_url):
                print(f"[{name}] Already processed/in DB: {current_url}")
                continue

            await db_add_url(current_url) # Add to DB before fetching (or mark as being processed)

            # TODO: Implement robots.txt check here

            html = await fetch(session, current_url, get_proxy())

            if html:
                for link in parse_and_extract_links(html, current_url):
                    if processed_urls_count < MAX_URLS_TO_CRAWL and not await db_url_exists(link):
                        await url_frontier.put(link)
                        print(f"[{name}] Queued new link: {link}")

                processed_urls_count += 1
                if processed_urls_count >= MAX_URLS_TO_CRAWL:
                    print(f"[{name}] Reached max URL limit. Draining queue...")
        except Exception as e:
            # One failing URL must not stop the worker; move on to the next one.
            print(f"[{name}] Error in worker: {e}")
            continue
        finally:
            url_frontier.task_done()

        if processed_urls_count >= MAX_URLS_TO_CRAWL and url_frontier.empty():
            break # Exit worker if limit reached and queue is empty

        await asyncio.sleep(1) # Be respectful: add a small delay

async def main():
    """Seed the frontier, run the worker tasks until the queue is processed, then report.

    Starts five ``worker`` tasks sharing one aiohttp session, waits for
    ``url_frontier.join()``, cancels the workers and prints how many unique
    URLs were recorded.
    """
    # Only enqueue the seed here. worker() records each URL itself right before
    # fetching it; recording the seed up front makes the worker treat it as
    # already processed, so nothing would ever be crawled.
    await url_frontier.put(INITIAL_SEED_URL)

    # You might want a ClientSession per proxy type or a more sophisticated setup
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: # ssl=False for local dev; use proper SSL context in prod
        num_workers = 5 # Number of concurrent crawlers
        tasks = [
            asyncio.create_task(worker(f"Worker-{i+1}", session))
            for i in range(num_workers)
        ]

        # Returns once every queued URL has been matched by a task_done() call.
        await url_frontier.join()

        if processed_urls_count >= MAX_URLS_TO_CRAWL:
            print("Max URL limit reached. Cancelling worker tasks...")

        # Workers still blocked on an empty queue never return by themselves.
        for task in tasks:
            task.cancel()

        # return_exceptions=True keeps the cancellations from propagating out of gather().
        await asyncio.gather(*tasks, return_exceptions=True)

    print("Crawling finished.")
    print(f"Total unique URLs seen (from in-memory set): {len(DATABASE_URLS_SEEN)}")

if __name__ == "__main__":
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running (plain script): let asyncio create and drive one.
        asyncio.run(main())
    else:
        # A loop is already running (e.g. inside a notebook), where
        # asyncio.run() raises RuntimeError. Schedule the crawl on that loop
        # instead; ``await crawl_task`` waits for it to finish.
        crawl_task = running_loop.create_task(main())

In [None]:
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy

# Stand-alone example of a breadth-first deep-crawl strategy object.
# Nothing else in this notebook reads ``strategy``.
# NOTE: this cell imports crawl4ai, but the cell that installs it comes later in the notebook.
strategy = BFSDeepCrawlStrategy(
    max_depth=2,               # Crawl initial page + 2 levels deep
    include_external=False,    # Stay within the same domain
    max_pages=50,              # Maximum number of pages to crawl (optional)
    score_threshold=0.3,       # Minimum score for URLs to be crawled (optional)
)

In [None]:
!pip install crawl4ai

In [None]:
!crawl4ai-setup

In [None]:
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.deep_crawling.filters import (
    FilterChain,
    DomainFilter,
    URLPatternFilter,
    ContentTypeFilter
)
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer

async def run_advanced_crawler():
    """Deep-crawl jobright.ai from its job-info page and summarise the results.

    Streams results from ``AsyncWebCrawler.arun``, prints one line per page,
    then prints the page count, the average score and a per-depth breakdown.

    Returns:
        The list of crawl results, in the order they were streamed.
    """
    # Restrict the crawl to jobright.ai pages served as HTML.
    filter_chain = FilterChain([
        # Domain boundaries
        DomainFilter(
            allowed_domains=["jobright.ai"]
            # blocked_domains=["old.docs.example.com"]
        ),

        # URL patterns to include
        # URLPatternFilter(patterns=["*guide*", "*tutorial*", "*blog*"]),

        # Content type filtering
        ContentTypeFilter(allowed_types=["text/html"]),
    ])

    # Exactly one deep-crawl strategy may be given. The fully configured
    # best-first strategy is kept; a second bare ``deep_crawl_strategy``
    # keyword would be a syntax error.
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=2,
            include_external=False,
            filter_chain=filter_chain,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
    )

    # Execute the crawl, collecting results as they arrive.
    results = []
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun("https://jobright.ai/jobs/info", config=config):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"Depth: {depth} | Score: {score:.2f} | {result.url}")

    # Analyze the results
    print(f"Crawled {len(results)} high-value pages")
    if results:
        average_score = sum(r.metadata.get('score', 0) for r in results) / len(results)
        print(f"Average score: {average_score:.2f}")
    else:
        # Avoid dividing by zero when the crawl produced nothing.
        print("Average score: n/a (no pages crawled)")

    # Group by depth
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1

    print("Pages crawled by depth:")
    for depth, count in sorted(depth_counts.items()):
        print(f"  Depth {depth}: {count} pages")
    return results
if __name__ == "__main__":
    # Top-level ``await``: only valid where an event loop is already running
    # (e.g. a notebook cell); it is a syntax error in a plain script.
    results = await run_advanced_crawler()


In [None]:
# Sketch of the planned URL-collection loop: for every category/parameter pair,
# build a listing URL, collect the job-post URLs behind it and store them.
# prepare_url, get_all_jobpost_urls and save_to_db are not defined in this
# notebook yet; with the empty lists below the loop body never runs.
base_url =''
end_word_categorys = []
parmeters = []
result_urls = []
# Iterate the list declared above (the loop previously named an undefined variable).
for name in end_word_categorys:
  for para in parmeters:
    url = prepare_url(base_url,name,para)
    result_urls = get_all_jobpost_urls(url)
    save_to_db(result_urls)



In [None]:
!pip install scrapy

In [None]:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings # Useful for default settings

# NOTE(review): applying nest_asyncio does not prevent "ReactorNotRestartable" --
# the last cell of this notebook applies it and its captured output still ends
# with that error. Presumably the runtime must be restarted before a second
# crawl in the same session; confirm.
import nest_asyncio
nest_asyncio.apply()

# --- Your Spider Definition ---
# --- Your Spider Definition ---
class AllLinksSpider(CrawlSpider):
    """Crawl jobright.ai and record every on-site link found on each followed page.

    Each recorded entry is a dict with the keys ``source_page_url`` and
    ``extracted_link``. Entries are appended to the list passed as the
    ``output_list`` keyword argument and are also yielded as items.
    """

    name = 'all_links_scraper_cell' # Changed name slightly to avoid clashes if you have the other one
    allowed_domains = ['jobright.ai']
    # start_urls = ['https://jobright.ai/jobs/backenddeveloper/']
    start_urls = ['https://jobright.ai/jobs/backenddeveloper'] # More general starting point
    # Counter for processed URLs or items (not read anywhere in this class)
    processed_url_count = 0
    MAX_URLS_TO_PROCESS = 100
    # Custom settings for this spider if needed (can also be passed to CrawlerProcess)
    custom_settings = {
        'LOG_LEVEL': 'INFO', # 'DEBUG' for more verbosity
        'DOWNLOAD_DELAY': 0.25,  # Be respectful
        'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
        # 'DEPTH_LIMIT': 2 # Uncomment to limit crawl depth
    }

    rules = (
        Rule(
            LinkExtractor(
                allow_domains=['jobright.ai'],
                deny=(
                    r'/login', r'/register', r'/password', # Example patterns to avoid
                    r'mailto:', r'tel:', # Avoid mail and tel links
                )
            ),
            callback='parse_page_links',
            follow=True
        ),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider and bind the caller-supplied result list.

        Args:
            output_list (keyword): List that collected link dicts are appended
                to; a fresh list is used when it is not given.
        """
        super().__init__(*args, **kwargs)
        # Get the list passed from the CrawlerProcess or default to an empty list
        self.collected_links_list = kwargs.get('output_list', [])

    def parse_page_links(self, response):
        """Yield one dict per on-site link found in *response*.

        Args:
            response: The page response handed over by the crawl rule.

        Yields:
            ``{'source_page_url': ..., 'extracted_link': ...}`` for every
            anchor whose absolute URL belongs to ``allowed_domains``.
        """
        # Imported here so the method does not depend on ``scrapy.utils.url``
        # being reachable as an attribute of the ``scrapy`` package.
        from scrapy.utils.url import url_is_from_any_domain

        self.logger.info(f"Processing page: {response.url}")

        for link_href in response.css('a::attr(href)').getall():
            absolute_link = response.urljoin(link_href)
            # Match on the URL's host rather than on a substring, so hosts that
            # merely contain the domain text are not accepted.
            if self.allowed_domains and url_is_from_any_domain(absolute_link, self.allowed_domains):
                link_data = {
                    'source_page_url': response.url,
                    'extracted_link': absolute_link
                }
                # Append to the list provided during initialization
                self.collected_links_list.append(link_data)
                # Still yield if you want to use Scrapy's feed exporters or other pipelines
                yield link_data

In [None]:
# Shared result list: the spider appends one dict per link it finds.
scraped_links_data = []

# --- Configure and Run the Crawler ---
# Start from the project settings (defaults when there is no settings.py) and
# override the user agent. Feed exports could be enabled the same way, e.g.
# settings.set('FEEDS', {'output_links.json': {'format': 'json', 'overwrite': True}})
settings = get_project_settings()
settings.set('USER_AGENT', 'MyCustomBot/1.0 (+http://mywebsite.com/botinfo)')

# The spider's __init__ picks 'output_list' up from the keyword arguments.
process = CrawlerProcess(settings=settings)
process.crawl(AllLinksSpider, output_list=scraped_links_data)

# start() blocks until the crawl has finished.
print("Starting Scrapy process...")
process.start()
print("Scrapy process finished.")

# --- Report every collected entry, gathering the distinct targets on the way ---
print(f"\n--- Collected {len(scraped_links_data)} link entries: ---")
unique_extracted_links = set()
for entry in scraped_links_data:
    source_page, found_link = entry['source_page_url'], entry['extracted_link']
    print(f"From: {source_page} -> Found: {found_link}")
    unique_extracted_links.add(found_link)

# Flat, sorted list of the unique extracted URLs.
final_unique_links_list = sorted(unique_extracted_links)

print(f"\n--- {len(unique_extracted_links)} Unique Extracted Links: ---")
for link in final_unique_links_list:
    print(link)

In [None]:
!pip install nest_asyncio

In [21]:
import nest_asyncio
# NOTE(review): applied before the Scrapy imports, yet the captured output of
# this cell still ends with "ReactorNotRestartable", so this call does not make
# the crawl re-runnable within one session.
nest_asyncio.apply() # <--- ADD THIS LINE AT THE TOP

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# --- Your Spider Definition (AllLinksSpider or similar) ---
class AllLinksSpider(CrawlSpider):
    """Crawl jobright.ai from its jobs page and record the on-site links of each followed page.

    Each recorded entry is a dict with the keys ``source_page_url`` and
    ``extracted_link``. Entries are appended to the list passed as the
    ``output_list`` keyword argument and are also yielded as items, which is
    what the ``CLOSESPIDER_ITEMCOUNT`` setting counts.
    """

    name = 'all_links_scraper_cell'
    allowed_domains = ['jobright.ai']
    start_urls = ['https://jobright.ai/jobs/']

    custom_settings = {
        'LOG_LEVEL': 'INFO',
        'DOWNLOAD_DELAY': 0.25,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
        'CLOSESPIDER_ITEMCOUNT': 20 # Example limit: stop after 20 items
    }

    rules = (
        Rule(
            LinkExtractor(
                allow_domains=['jobright.ai'],
                deny=(r'/login', r'/register', r'/password', r'mailto:', r'tel:')
            ),
            callback='parse_page_links',
            follow=True
        ),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider and bind the caller-supplied result list.

        Args:
            output_list (keyword): List that collected link dicts are appended
                to; a fresh list is used when it is not given.
        """
        super().__init__(*args, **kwargs)
        self.collected_links_list = kwargs.get('output_list', [])

    def parse_page_links(self, response):
        """Yield one dict per on-site link found in *response*.

        Args:
            response: The page response handed over by the crawl rule.

        Yields:
            ``{'source_page_url': ..., 'extracted_link': ...}`` for every
            anchor whose absolute URL belongs to ``allowed_domains``.
        """
        # Imported here so the method does not depend on ``scrapy.utils.url``
        # being reachable as an attribute of the ``scrapy`` package.
        from scrapy.utils.url import url_is_from_any_domain

        # self.logger.info(f"Processing page: {response.url}") # Keep if needed
        for link_href in response.css('a::attr(href)').getall():
            absolute_link = response.urljoin(link_href)
            # Match on the URL's host rather than on a substring, so hosts that
            # merely contain the domain text are not accepted.
            if self.allowed_domains and url_is_from_any_domain(absolute_link, self.allowed_domains):
                link_data = {
                    'source_page_url': response.url,
                    'extracted_link': absolute_link
                }
                self.collected_links_list.append(link_data)
                yield link_data # Yielding is good for CLOSESPIDER_ITEMCOUNT

# --- Script to Run the Spider ---
# The spider appends its findings to this list through the 'output_list' keyword.
scraped_links_data = []

settings = get_project_settings()
settings.set('USER_AGENT', 'MyCustomBot/1.0 (+http://mywebsite.com/botinfo)')

process = CrawlerProcess(settings=settings)
process.crawl(AllLinksSpider, output_list=scraped_links_data)

print("Starting Scrapy process...")
# NOTE: the captured output of this cell ends with "ReactorNotRestartable", so
# this call fails when a crawl has already been run in the same session.
process.start()
print("Scrapy process finished.")

# Show at most the first ten collected entries, then a marker if more exist.
print(f"\n--- Collected {len(scraped_links_data)} link entries (first 10 shown if many): ---")
for entry in scraped_links_data[:10]:
    print(f"From: {entry['source_page_url']} -> Found: {entry['extracted_link']}")
if len(scraped_links_data) > 10:
    print("... and more.")

# Same preview for the distinct link targets, in sorted order.
unique_extracted_links = {entry['extracted_link'] for entry in scraped_links_data}
print(f"\n--- {len(unique_extracted_links)} Unique Extracted Links (first 10 shown if many): ---")
ordered_unique_links = sorted(unique_extracted_links)
for link in ordered_unique_links[:10]:
    print(link)
if len(ordered_unique_links) > 10:
    print("... and more.")

INFO:scrapy.utils.log:Scrapy 2.13.2 started (bot: scrapybot)
2025-06-24 15:39:24 [scrapy.utils.log] INFO: Scrapy 2.13.2 started (bot: scrapybot)
INFO:scrapy.utils.log:Versions:
{'lxml': '5.4.0',
 'libxml2': '2.13.8',
 'cssselect': '1.3.0',
 'parsel': '1.10.0',
 'w3lib': '2.3.1',
 'Twisted': '25.5.0',
 'Python': '3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]',
 'pyOpenSSL': '25.1.0 (OpenSSL 3.3.2 3 Sep 2024)',
 'cryptography': '43.0.3',
 'Platform': 'Linux-6.1.123+-x86_64-with-glibc2.35'}
2025-06-24 15:39:24 [scrapy.utils.log] INFO: Versions:
{'lxml': '5.4.0',
 'libxml2': '2.13.8',
 'cssselect': '1.3.0',
 'parsel': '1.10.0',
 'w3lib': '2.3.1',
 'Twisted': '25.5.0',
 'Python': '3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]',
 'pyOpenSSL': '25.1.0 (OpenSSL 3.3.2 3 Sep 2024)',
 'cryptography': '43.0.3',
 'Platform': 'Linux-6.1.123+-x86_64-with-glibc2.35'}
INFO:scrapy.addons:Enabled addons:
[]
2025-06-24 15:39:24 [scrapy.addons] INFO: Enabled addons:
[]
DEBUG:scrapy.utils.log:Usin

Starting Scrapy process...


ReactorNotRestartable: 