In [1]:
import math
import os
import random
import time
from datetime import datetime
from urllib.parse import urlencode

import pandas as pd
import requests
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from openpyxl import Workbook
from openpyxl.styles import Font
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
def build_indeed_url(query="data analyst", location="Bengaluru", days=3, job_type="fulltime", sort="date", page=0):
    """Build an in.indeed.com job-search URL for one page of results.

    Args:
        query: Free-text search terms (sent as ``q``).
        location: Location filter (sent as ``l``).
        days: Value for the ``fromage`` parameter.
        job_type: Value for the ``jt`` parameter.
        sort: Value for the ``sort`` parameter.
        page: Zero-based page index; converted to the ``start`` offset.

    Returns:
        The full search URL as a string.
    """
    base_url = "https://in.indeed.com/jobs"
    params = {
        "q": query,
        "l": location,
        "fromage": days,
        "jt": job_type,
        "sort": sort,
        # The offset advances by 10 per page index.
        "start": page * 10,
    }
    # urlencode() uses quote_plus, so spaces still become "+", but characters
    # such as "&", "+", "#" and "=" are percent-escaped instead of silently
    # breaking the query string (e.g. "R&D analyst", "c++ developer").
    return f"{base_url}?{urlencode(params)}"

def get_total_pages_from_meta(driver, jobs_per_page):
    """Estimate the number of result pages from the page's meta description.

    Looks up the ``<meta name="description">`` tag on the page currently
    loaded in ``driver``, keeps only the digits of the first
    whitespace-separated token of its ``content`` attribute (so "1,200+"
    becomes 1200) and divides that job count by ``jobs_per_page``, rounding up.

    Args:
        driver: Object exposing ``find_element(by, value)``.
        jobs_per_page: Divisor used to convert the job count into pages.

    Returns:
        The computed page count, or 1 if any step raises an ``Exception``
        (tag missing, first token has no digits, zero divisor, ...).
    """
    try:
        description = driver.find_element(
            By.XPATH, "//meta[@name='description']"
        ).get_attribute("content")

        # e.g. "90 Data Analyst jobs available..." -> "90"
        leading_token = description.split()[0]
        digits_only = "".join(ch for ch in leading_token if ch.isdigit())
        job_count = int(digits_only)

        total_pages = math.ceil(job_count / jobs_per_page)
        print(f"📊 Found {job_count} jobs → Scrape {total_pages} pages")
        return total_pages
    except Exception as e:
        # Deliberately best-effort: any failure degrades to a single page.
        print(f"⚠️ Could not extract job count from meta tag. Defaulting to 1 page. Error: {e}")
        return 1
        
def _text_or_na(root, xpath):
    """Return the stripped text of the first element under ``root`` matching ``xpath``, or "N/A" if the lookup fails."""
    try:
        return root.find_element(By.XPATH, xpath).text.strip()
    except Exception:
        # Best-effort field: a missing element must not discard the whole card.
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works.
        return "N/A"


def get_indeed_jobs_reliable(query="data analyst", location="Bengaluru", days=3,
                             job_type="fulltime", sort="date", jobs_per_page=15):
    """Scrape Indeed search results into a DataFrame by clicking through each job card.

    Opens a Chrome window via undetected_chromedriver, loads the first result
    page, derives the page count from the meta description, then for every
    page clicks each ``tapItem`` card and reads the description panel.

    Args:
        query: Search terms passed to ``build_indeed_url``.
        location: Location passed to ``build_indeed_url``.
        days: ``days`` argument for ``build_indeed_url`` (default 3).
        job_type: ``job_type`` argument for ``build_indeed_url`` (default "fulltime").
        sort: ``sort`` argument for ``build_indeed_url`` (default "date").
        jobs_per_page: Divisor handed to ``get_total_pages_from_meta`` (default 15).

    Returns:
        pandas.DataFrame with one row per successfully read card and the
        columns ``title``, ``company``, ``description`` and ``Link``. The frame
        has no columns at all when nothing was scraped.

    Notes:
        Per-card failures are printed and skipped. Errors outside the card loop
        (browser start-up, page navigation) propagate, but the browser is
        always closed first.

        NOTE(review): the page count assumes ``jobs_per_page`` (15) results per
        page while ``build_indeed_url`` advances ``start`` by 10 per page, so
        the last results may be missed or pages may overlap — confirm against
        the site's real pagination before changing either number.
    """
    options = uc.ChromeOptions()
    options.add_argument("--start-maximized")
    driver = uc.Chrome(options=options)
    jobs = []

    try:
        wait = WebDriverWait(driver, 10)

        # Load the first page once: it supplies both the job count and page 1's cards.
        driver.get(build_indeed_url(query, location, days=days, job_type=job_type, sort=sort, page=0))
        time.sleep(10)
        pages = get_total_pages_from_meta(driver, jobs_per_page)

        for page in range(pages):
            url = build_indeed_url(query, location, days=days, job_type=job_type, sort=sort, page=page)
            print(f"🔄 Page {page + 1}: {url}")
            if page > 0:
                # Page 0 is already on screen; only later pages need a navigation.
                driver.get(url)
                time.sleep(5)  # Give time to load new page

            # Cards must be looked up again after every navigation.
            job_cards = driver.find_elements(By.CLASS_NAME, "tapItem")

            for index, card in enumerate(job_cards):
                try:
                    # Scroll the card into view, then click it to open its description panel.
                    driver.execute_script("arguments[0].scrollIntoView(true);", card)
                    card.click()
                    time.sleep(2)

                    description_panel = wait.until(
                        EC.presence_of_element_located((By.ID, "jobDescriptionText"))
                    )
                    description = description_panel.text.strip()

                    title = _text_or_na(card, ".//h2[contains(@class,'jobTitle')]")
                    # Company is taken from the nearest <a> preceding the description panel.
                    company = _text_or_na(driver, "//div[@id='jobDescriptionText']/preceding::a[1]")

                    try:
                        job_key = card.find_element(By.XPATH, ".//a[@data-jk]").get_attribute("data-jk")
                        job_url = f"https://in.indeed.com/viewjob?jk={job_key}"
                    except Exception as e:
                        job_url = "N/A"
                        print(f"❌ Could not extract job key: {e}")

                    jobs.append({
                        "title": title,
                        "company": company,
                        "description": description,
                        "Link": job_url,
                    })

                except Exception as e:
                    # One bad card (timeout, click failure, ...) must not abort the page.
                    print(f"❌ Error on card {index + 1} (Page {page + 1}): {e}")
                    continue

            time.sleep(random.uniform(3, 5))  # Slow down between pages
    finally:
        # Always release the browser, even when scraping fails part-way.
        driver.quit()

    return pd.DataFrame(jobs)

In [3]:
# Scrape Indeed listings for "data analyst" in Bengaluru into a DataFrame.
# This launches a Chrome window and prints one progress line per result page.
df = get_indeed_jobs_reliable("data analyst", "Bengaluru")

📊 Found 68 jobs → Scrape 5 pages
🔄 Page 1: https://in.indeed.com/jobs?q=data+analyst&l=Bengaluru&fromage=3&jt=fulltime&sort=date&start=0
🔄 Page 2: https://in.indeed.com/jobs?q=data+analyst&l=Bengaluru&fromage=3&jt=fulltime&sort=date&start=10
🔄 Page 3: https://in.indeed.com/jobs?q=data+analyst&l=Bengaluru&fromage=3&jt=fulltime&sort=date&start=20
🔄 Page 4: https://in.indeed.com/jobs?q=data+analyst&l=Bengaluru&fromage=3&jt=fulltime&sort=date&start=30
🔄 Page 5: https://in.indeed.com/jobs?q=data+analyst&l=Bengaluru&fromage=3&jt=fulltime&sort=date&start=40


In [6]:
# Export the scraped jobs in `df` to a timestamped Excel workbook.
wb = Workbook()
ws = wb.active
ws.title = "Jobs"

# Header row first, then one worksheet row per scraped job.
ws.append(df.columns.tolist())
for row in df.itertuples(index=False):
    ws.append(list(row))

# Make the URLs in the "Link" column clickable. The guard covers an empty
# scrape, where `df` has no columns and `get_loc("Link")` would raise KeyError.
if "Link" in df.columns:
    link_col = df.columns.get_loc("Link") + 1  # Excel columns are 1-based

    for row in ws.iter_rows(min_row=2, min_col=link_col, max_col=link_col):
        for cell in row:
            url = cell.value
            # Rows whose job key could not be read hold the "N/A" placeholder;
            # leave those as plain text instead of creating a dead hyperlink.
            if isinstance(url, str) and url.startswith("http"):
                cell.hyperlink = url  # the URL itself stays as the display text
                cell.font = Font(color="0000FF", underline="single")

# Relative filename: the workbook is written to the current working directory.
wb.save(filename=f"indeed_jobs_{datetime.now().strftime('%Y-%m-%d_%H-%M')}.xlsx")
print("✅ File saved")

✅ File saved


In [5]:
# Display the current working directory; the export cell saves the workbook
# under a relative filename, so this is the folder it is written to.
os.getcwd()

'C:\\Users\\DELL'