<a href="https://colab.research.google.com/github/danielbehargithub/LinkedIn_Salary/blob/main/Salary_Scrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook setup (IPython shell escapes): install the Playwright Python package,
# download the browser binaries Playwright drives, and install the
# anticaptchaofficial package.
# NOTE(review): anticaptchaofficial is not used by the scraping cell visible
# here -- confirm it is still needed.
!pip install playwright
!playwright install
!pip install anticaptchaofficial


In [3]:
from playwright.async_api import async_playwright
import pandas as pd
import asyncio

async def scrape_payscale(
    url: str = "https://www.payscale.com/research/US/Employer=Intel_Corporation/Salary",
    output_csv: str = "intel_salaries.csv",
) -> None:
    """Scrape the salary table from a PayScale employer page into a CSV file.

    Opens the page in headless Chromium, writes two debugging artifacts to the
    current working directory (``page_debug.html`` and ``screenshot.png``),
    reads every ``tr.data-table__row`` row that has at least three ``td``
    cells, and saves the first three cells of each row as the columns
    "Job Title", "Salary Range" and "Salary Average".

    Progress is reported with ``print``. If the page does not load with HTTP
    status 200, or no usable rows are found, nothing is written to
    ``output_csv``.

    Args:
        url: Page to scrape. Defaults to the Intel Corporation salary page.
        output_csv: Path of the CSV file to write. Defaults to
            ``intel_salaries.csv``.
    """
    async with async_playwright() as p:
        # Launching a headless Chromium browser
        browser = await p.chromium.launch(headless=True)
        data = []
        try:
            # Creating a browser context with a custom User-Agent
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            )
            # Default timeout (milliseconds) for all operations in this context
            context.set_default_timeout(10000)
            page = await context.new_page()

            # Navigating to the target URL
            print("Navigating to URL...")
            response = await page.goto(url)
            # goto() may return None for some navigations, so guard before
            # reading the status code.
            if response is None:
                print("Failed to load page. No response received.")
                return
            if response.status != 200:
                print(f"Failed to load page. Status code: {response.status}")
                return

            # Fixed delay to give the page time to finish loading before the
            # content is captured.
            print("Waiting for page to load...")
            await asyncio.sleep(5)

            # Saving the page's HTML content for debugging purposes
            html = await page.content()
            with open("page_debug.html", "w", encoding="utf-8") as f:
                f.write(html)
            print("Saved page content to page_debug.html")

            # Taking a screenshot of the page for verification
            await page.screenshot(path="screenshot.png", full_page=True)
            print("Screenshot saved as screenshot.png")

            # Extracting table rows using a specific selector
            print("Extracting table rows...")
            rows = await page.query_selector_all("tr.data-table__row")
            print(f"Found {len(rows)} rows in the table.")

            # Iterating through each table row to extract job and salary information
            for index, row in enumerate(rows, start=1):
                print(f"Processing row {index}...")
                cols = await row.query_selector_all("td")
                if len(cols) < 3:  # Rows without all three cells are skipped
                    print(f"Skipping row {index} due to insufficient columns.")
                    continue
                job_title = await cols[0].inner_text()
                salary_range = await cols[1].inner_text()
                salary_average = await cols[2].inner_text()
                data.append((job_title.strip(), salary_range.strip(), salary_average.strip()))
        finally:
            # Close the browser on every exit path: success, the early
            # returns above, and any exception (e.g. a timeout).
            await browser.close()

        # Saving the extracted data into a CSV file
        if data:
            df = pd.DataFrame(data, columns=["Job Title", "Salary Range", "Salary Average"])
            df.to_csv(output_csv, index=False)
            print(f"Data saved to {output_csv}")
        else:
            print("No data found to save.")

# Running the async scraping function.
# A bare top-level `await` works here because this is a notebook cell (IPython
# runs it on its own event loop); in a plain .py script use
# `asyncio.run(scrape_payscale())` instead.
await scrape_payscale()


Navigating to URL...
Waiting for page to load...
Saved page content to page_debug.html
Screenshot saved as screenshot.png
Extracting table rows...
Found 7 rows in the table.
Processing row 1...
Processing row 2...
Processing row 3...
Processing row 4...
Processing row 5...
Processing row 6...
Processing row 7...
Data saved to intel_salaries.csv
