In [4]:
"""
Resume Screening AI - Data Scraper
Sources: LinkedIn Jobs, Indeed Jobs, Kaggle Datasets
Output: JSON files in /data folder
"""

import os
import json
import time
import random
import requests
import logging
from datetime import datetime
from pathlib import Path

# ===================== SETUP =====================

# Configure root logging once for the whole script: timestamp, level, message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s'
)
log = logging.getLogger(__name__)

# Output folders (relative to the current working directory), one per source.
BASE_DIR = Path("data")
LINKEDIN_DIR = BASE_DIR / "linkedin"
INDEED_DIR = BASE_DIR / "indeed"
KAGGLE_DIR = BASE_DIR / "kaggle"

# NOTE: runs at import time, so importing this module creates ./data/*.
for d in [LINKEDIN_DIR, INDEED_DIR, KAGGLE_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Request headers, chosen to look like a real desktop browser.
# Shared by the requests session and (User-Agent only) the Chrome driver.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

def random_delay(min_sec=2, max_sec=5):
    """Pause for a random interval to reduce the risk of being rate-limited.

    The pause length is drawn uniformly between ``min_sec`` and ``max_sec``
    (both in seconds).
    """
    pause_seconds = random.uniform(min_sec, max_sec)
    time.sleep(pause_seconds)

def save_json(data, filepath):
    """Write ``data`` to ``filepath`` as pretty-printed UTF-8 JSON.

    The payload is first written to a sibling ``<name>.tmp`` file and then
    moved into place with ``os.replace``. A serialization error or an
    interrupted run therefore never leaves a truncated JSON file behind and
    never clobbers a previously saved good file.

    Args:
        data: Any JSON-serializable object.
        filepath: Destination path (``str`` or ``Path``). Missing parent
            directories are created.

    Raises:
        TypeError / ValueError: If ``data`` is not JSON-serializable.
        OSError: If the file cannot be written.
    """
    target = Path(filepath)
    target.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = target.with_name(target.name + ".tmp")

    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        # Atomic on the same filesystem: readers see the old or the new file.
        os.replace(tmp_path, target)
    except BaseException:
        # Clean up the partial temp file, then let the caller see the error.
        if tmp_path.exists():
            tmp_path.unlink()
        raise

    log.info(f"Saved: {filepath}")


# ===================== 1. LINKEDIN SCRAPER =====================

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

def get_driver(headless=True):
    """Create a Chrome WebDriver configured to resemble a regular browser.

    Args:
        headless: Run Chrome without a visible window when True.

    Returns:
        A ready-to-use ``selenium.webdriver.Chrome`` instance. The caller is
        responsible for calling ``quit()`` on it.

    Raises:
        Exception: Whatever Selenium raises if Chrome cannot be started or
            configured; the half-initialised driver is quit before re-raising.
    """
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--disable-blink-features=AutomationControlled")
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option("useAutomationExtension", False)
    opts.add_argument(f"user-agent={HEADERS['User-Agent']}")

    driver = webdriver.Chrome(options=opts)
    try:
        # A plain execute_script() only patches the document that is loaded
        # right now (the initial blank page) and is lost on the first
        # driver.get(). Registering the script through the DevTools protocol
        # makes Chrome run it in every new document before page scripts.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {
                "source": (
                    "Object.defineProperty(navigator, 'webdriver', "
                    "{get: () => undefined})"
                )
            },
        )
    except Exception:
        # Do not leak a running Chrome process if setup fails.
        driver.quit()
        raise
    return driver


# Search keywords queried on LinkedIn; scrape_linkedin() runs one search per
# entry and writes one JSON file per keyword (spaces become underscores).
LINKEDIN_JOBS = [
    "software engineer",
    "data scientist",
    "machine learning engineer",
    "python developer",
    "backend developer",
    "frontend developer",
    "devops engineer",
    "data analyst",
    "full stack developer",
    "AI engineer",
]

def _linkedin_card_field(card, selector, attribute=None):
    """Return one field of a LinkedIn result card, or "" when it is absent.

    Args:
        card: Selenium element for a single search-result card.
        selector: CSS selector of the child element to read.
        attribute: If given, return this HTML attribute instead of the text.
    """
    try:
        element = card.find_element(By.CSS_SELECTOR, selector)
    except NoSuchElementException:
        return ""
    if attribute is None:
        return element.text.strip()
    return element.get_attribute(attribute) or ""


def _linkedin_parse_card(card, keyword):
    """Build the job record (dict) for one LinkedIn search-result card."""
    return {
        "title": _linkedin_card_field(card, "h3.base-search-card__title"),
        "company": _linkedin_card_field(card, "h4.base-search-card__subtitle"),
        "location": _linkedin_card_field(card, "span.job-search-card__location"),
        "url": _linkedin_card_field(card, "a.base-card__full-link", "href"),
        "posted": _linkedin_card_field(card, "time", "datetime"),
        "keyword": keyword,
        "source": "linkedin",
        "scraped_at": datetime.now().isoformat(),
    }


def scrape_linkedin(max_per_keyword=100):
    """Scrape LinkedIn's public job search (no login) for every keyword.

    For each entry of ``LINKEDIN_JOBS`` the result pages are walked 25 jobs
    at a time until ``max_per_keyword`` jobs are collected, a page yields
    nothing new, or an error/timeout occurs. Descriptions are fetched for a
    limited number of jobs per page via ``_linkedin_get_descriptions``.

    Side effects:
        Writes ``<keyword>.json`` per keyword and ``all_linkedin_jobs.json``
        into ``LINKEDIN_DIR``.

    Args:
        max_per_keyword: Upper bound of jobs kept per keyword.

    Returns:
        List of job dicts across all keywords.
    """
    from urllib.parse import quote

    log.info("=== LinkedIn Scraper Start ===")
    driver = get_driver(headless=True)
    all_jobs = []

    try:
        for keyword in LINKEDIN_JOBS:
            log.info(f"LinkedIn: Searching '{keyword}'")
            keyword_jobs = []
            seen_urls = set()
            start = 0

            while len(keyword_jobs) < max_per_keyword:
                url = (
                    "https://www.linkedin.com/jobs/search?"
                    f"keywords={quote(keyword)}"
                    f"&location=&start={start}"
                )

                try:
                    driver.get(url)
                    random_delay(3, 6)

                    # Wait until the results container is present.
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "jobs-search__results-list"))
                    )

                    # The class waited for above marks the list *container*,
                    # so the cards are its direct <li> children. The previous
                    # selector required the class on the <li> itself, which is
                    # inconsistent with the wait condition.
                    # NOTE(review): confirm against the live page markup.
                    job_cards = driver.find_elements(
                        By.CSS_SELECTOR, ".jobs-search__results-list > li"
                    )

                    if not job_cards:
                        log.info(f"No more results for '{keyword}'")
                        break

                    # Parse every card before navigating away; the elements
                    # become stale once the description pages are loaded.
                    page_jobs = []
                    for card in job_cards:
                        try:
                            job = _linkedin_parse_card(card, keyword)
                        except Exception as e:
                            log.warning(f"Card parse error: {e}")
                            continue

                        if not job["title"]:  # keep only valid jobs
                            continue

                        # Ignore the query string (tracking parameters) when
                        # checking whether this job was already collected.
                        url_key = job["url"].split("?", 1)[0]
                        if url_key and url_key in seen_urls:
                            continue
                        if url_key:
                            seen_urls.add(url_key)
                        page_jobs.append(job)

                    if not page_jobs:
                        # Page produced nothing new: stop instead of paging
                        # forever over repeated or unparsable results.
                        log.info(f"No more results for '{keyword}'")
                        break

                    remaining = max_per_keyword - len(keyword_jobs)
                    keyword_jobs.extend(page_jobs[:remaining])

                    # Also fetch descriptions from the job detail pages.
                    keyword_jobs = _linkedin_get_descriptions(driver, keyword_jobs)

                    start += 25  # LinkedIn's page size
                    random_delay(2, 4)

                except TimeoutException:
                    log.warning(f"Timeout for keyword '{keyword}', moving on")
                    break
                except Exception as e:
                    log.error(f"LinkedIn error: {e}")
                    break

            all_jobs.extend(keyword_jobs)
            log.info(f"LinkedIn '{keyword}': {len(keyword_jobs)} jobs scraped")

            # Save per keyword
            fname = LINKEDIN_DIR / f"{keyword.replace(' ', '_')}.json"
            save_json(keyword_jobs, fname)

            random_delay(5, 10)  # longer pause between keywords

    finally:
        driver.quit()

    # Combined file with every job
    save_json(all_jobs, LINKEDIN_DIR / "all_linkedin_jobs.json")
    log.info(f"LinkedIn Total: {len(all_jobs)} jobs")
    return all_jobs


def _linkedin_get_descriptions(driver, jobs, max_desc=20):
    """Fetch descriptions from LinkedIn job detail pages, in place.

    Each job that has a ``url`` and has not been attempted yet (no
    ``description`` key) is opened in ``driver``. At most ``max_desc`` pages
    are loaded per call, counting failed attempts too, so a blocked session
    cannot trigger one page load per job. Every attempted job ends up with
    both ``description`` (possibly "") and ``requirements`` (possibly []),
    and is not retried on later calls.

    Args:
        driver: Selenium WebDriver used to load the detail pages.
        jobs: List of job dicts; mutated in place.
        max_desc: Maximum number of detail pages to load in this call.

    Returns:
        The same ``jobs`` list.
    """
    attempts = 0
    for job in jobs:
        if attempts >= max_desc:
            break
        if not job.get("url") or "description" in job:
            continue

        attempts += 1
        try:
            driver.get(job["url"])
            random_delay(2, 4)
            desc_el = driver.find_element(
                By.CSS_SELECTOR, "div.show-more-less-html__markup"
            )
            job["description"] = desc_el.text.strip()
        except Exception as e:
            # Best effort: keep the job, but record why it has no description.
            log.warning(f"LinkedIn desc error for {job['url']}: {e}")
            job["description"] = ""

        # Try to extract the requirements bullet list from the description.
        job["requirements"] = _extract_requirements(job["description"])
    return jobs


def _extract_requirements(text):
    """Description se requirements section nikalo"""
    requirements = []
    lines = text.split("\n")
    in_req = False
    
    req_keywords = ["requirement", "qualifications", "what you need", "skills", "you have"]
    
    for line in lines:
        line_lower = line.lower()
        if any(kw in line_lower for kw in req_keywords):
            in_req = True
        elif in_req and line.startswith("•") or line.startswith("-"):
            requirements.append(line.strip("•- ").strip())
        elif in_req and len(line) < 3:
            in_req = False
    
    return requirements


# ===================== 2. INDEED SCRAPER =====================

from bs4 import BeautifulSoup

# Search keywords queried on Indeed; scrape_indeed() runs one search per
# entry and writes one JSON file per keyword (spaces become underscores).
INDEED_JOBS = [
    "software engineer",
    "data scientist",
    "machine learning",
    "python developer",
    "backend developer",
    "frontend developer",
    "devops",
    "data analyst",
    "full stack developer",
    "artificial intelligence",
    "cloud engineer",
    "java developer",
    "react developer",
    "nodejs developer",
]

def _indeed_parse_card(card, keyword):
    """Build the job record (dict) for one BeautifulSoup Indeed result card."""
    from urllib.parse import urljoin

    def text_of(element):
        return element.get_text(strip=True) if element else ""

    # Newer markup uses data-testid attributes; fall back to the older classes.
    company_el = (
        card.find("span", {"data-testid": "company-name"})
        or card.find("span", class_="companyName")
    )
    loc_el = (
        card.find("div", {"data-testid": "text-location"})
        or card.find("div", class_="companyLocation")
    )
    link_el = card.find("a", class_="jcs-JobTitle")

    job = {
        "title": text_of(card.find("h2", class_="jobTitle")),
        "company": text_of(company_el),
        "location": text_of(loc_el),
        "salary": text_of(card.find("div", class_="metadata salary-snippet-container")),
        "summary": text_of(card.find("div", class_="job-snippet")),
    }

    if link_el:
        # urljoin handles both relative and already-absolute hrefs.
        job["url"] = urljoin("https://www.indeed.com", link_el.get("href", ""))
        job["job_id"] = link_el.get("data-jk", "")
    else:
        job["url"] = ""
        job["job_id"] = ""

    job["keyword"] = keyword
    job["source"] = "indeed"
    job["scraped_at"] = datetime.now().isoformat()
    return job


def scrape_indeed(max_per_keyword=200):
    """Scrape Indeed job listings for every keyword in ``INDEED_JOBS``.

    Result pages are fetched with a ``requests`` session. On HTTP 403 the
    keyword falls back to ``_indeed_selenium``. Paging stops once
    ``max_per_keyword`` jobs are collected, a page yields nothing new, a
    non-200 status is returned, or a request error occurs. Descriptions are
    then fetched for up to 30 jobs per keyword.

    Side effects:
        Writes ``<keyword>.json`` per keyword and ``all_indeed_jobs.json``
        into ``INDEED_DIR``.

    Args:
        max_per_keyword: Upper bound of jobs kept per keyword.

    Returns:
        List of job dicts across all keywords.
    """
    log.info("=== Indeed Scraper Start ===")
    all_jobs = []

    # The context manager closes pooled connections even if a keyword fails.
    with requests.Session() as session:
        session.headers.update(HEADERS)

        for keyword in INDEED_JOBS:
            log.info(f"Indeed: Searching '{keyword}'")
            keyword_jobs = []
            seen_ids = set()
            start = 0

            while len(keyword_jobs) < max_per_keyword:
                try:
                    resp = session.get(
                        "https://www.indeed.com/jobs",
                        params={"q": keyword, "start": start, "limit": 50},
                        timeout=15,
                    )
                except requests.RequestException as e:
                    log.error(f"Indeed request error: {e}")
                    break

                if resp.status_code == 403:
                    log.warning("Indeed blocked! Switching to Selenium...")
                    keyword_jobs.extend(
                        _indeed_selenium(keyword, max_per_keyword - len(keyword_jobs))
                    )
                    break

                if resp.status_code != 200:
                    log.warning(f"Indeed status: {resp.status_code}")
                    break

                soup = BeautifulSoup(resp.text, "html.parser")

                # Job cards, with an alternative selector for older markup.
                job_cards = soup.find_all("div", class_="job_seen_beacon")
                if not job_cards:
                    job_cards = soup.find_all("td", class_="resultContent")

                if not job_cards:
                    log.info(f"No more results for '{keyword}' at start={start}")
                    break

                page_jobs = []
                for card in job_cards:
                    try:
                        job = _indeed_parse_card(card, keyword)
                    except Exception as e:
                        log.warning(f"Indeed card error: {e}")
                        continue

                    if not job["title"]:
                        continue

                    # Skip jobs already collected on an earlier page.
                    dedup_key = job["job_id"] or job["url"]
                    if dedup_key and dedup_key in seen_ids:
                        continue
                    if dedup_key:
                        seen_ids.add(dedup_key)
                    page_jobs.append(job)

                if not page_jobs:
                    # Nothing new on this page (e.g. a repeated page): stop
                    # rather than looping until the limit with duplicates.
                    log.info(f"No more results for '{keyword}' at start={start}")
                    break

                remaining = max_per_keyword - len(keyword_jobs)
                keyword_jobs.extend(page_jobs[:remaining])

                start += 50
                random_delay(3, 6)

            # Fetch job descriptions (top 30 per keyword)
            keyword_jobs = _indeed_get_descriptions(session, keyword_jobs, max_desc=30)

            all_jobs.extend(keyword_jobs)
            log.info(f"Indeed '{keyword}': {len(keyword_jobs)} jobs")

            # Save per keyword
            fname = INDEED_DIR / f"{keyword.replace(' ', '_')}.json"
            save_json(keyword_jobs, fname)

            random_delay(5, 10)

    save_json(all_jobs, INDEED_DIR / "all_indeed_jobs.json")
    log.info(f"Indeed Total: {len(all_jobs)} jobs")
    return all_jobs


def _indeed_get_descriptions(session, jobs, max_desc=30):
    """Fetch full descriptions from Indeed job detail pages, in place.

    At most ``max_desc`` detail pages are requested per call. Every request,
    successful or not, counts toward that limit and is followed by a random
    delay, so failures cannot turn into an unthrottled burst of requests.
    A 403 response is treated as "blocked" and ends the loop early.

    Args:
        session: ``requests.Session`` used for the HTTP calls.
        jobs: List of job dicts; jobs without a ``url`` are skipped. Mutated
            in place: ``description`` and ``requirements`` are added when the
            description element is found.
        max_desc: Maximum number of detail pages to request.

    Returns:
        The same ``jobs`` list.
    """
    attempts = 0
    for job in jobs:
        if attempts >= max_desc:
            break
        if not job.get("url"):
            continue

        attempts += 1
        try:
            resp = session.get(job["url"], timeout=10)

            if resp.status_code == 403:
                log.warning("Indeed desc fetch blocked (403), skipping remaining descriptions")
                break

            if resp.status_code == 200:
                soup = BeautifulSoup(resp.text, "html.parser")

                desc_el = soup.find("div", id="jobDescriptionText")
                if not desc_el:
                    desc_el = soup.find("div", class_="jobsearch-jobDescriptionText")

                if desc_el:
                    job["description"] = desc_el.get_text(separator="\n", strip=True)
                    job["requirements"] = _extract_requirements(job["description"])
            else:
                log.warning(f"Indeed desc status: {resp.status_code}")

        except Exception as e:
            log.warning(f"Indeed desc error: {e}")

        # Throttle after every attempt, including failed ones.
        random_delay(1, 3)

    return jobs


def _indeed_card_value(card, selector, attribute=None):
    """Return text (or ``attribute``) of ``selector`` inside ``card``, or ""."""
    try:
        element = card.find_element(By.CSS_SELECTOR, selector)
    except NoSuchElementException:
        return ""
    if attribute is None:
        return (element.text or "").strip()
    return element.get_attribute(attribute) or ""


def _indeed_selenium(keyword, max_results=100):
    """Fallback: scrape the first Indeed result page for ``keyword`` via Selenium.

    Used when the plain HTTP request is rejected. Only the first result page
    is read. Records carry the same ``url`` / ``job_id`` keys as those built
    by ``scrape_indeed`` so later steps can treat both sources alike.

    Args:
        keyword: Search phrase.
        max_results: Maximum number of cards to read from the page.

    Returns:
        List of job dicts (possibly empty).
    """
    from urllib.parse import quote_plus

    log.info(f"Indeed Selenium fallback for '{keyword}'")
    driver = get_driver(headless=True)
    jobs = []

    try:
        url = f"https://www.indeed.com/jobs?q={quote_plus(keyword)}"
        driver.get(url)
        random_delay(3, 5)

        job_cards = driver.find_elements(By.CSS_SELECTOR, "div.job_seen_beacon")

        for card in job_cards[:max_results]:
            try:
                job = {
                    "title": _indeed_card_value(card, "h2.jobTitle"),
                    "company": _indeed_card_value(card, "[data-testid='company-name']"),
                    "location": _indeed_card_value(card, "[data-testid='text-location']"),
                    "url": _indeed_card_value(card, "a.jcs-JobTitle", "href"),
                    "job_id": _indeed_card_value(card, "a.jcs-JobTitle", "data-jk"),
                    "keyword": keyword,
                    "source": "indeed_selenium",
                    "scraped_at": datetime.now().isoformat(),
                }
            except Exception as e:
                # e.g. a stale element; skip this card but keep going.
                # ``Exception`` (not a bare except) lets Ctrl-C through.
                log.warning(f"Indeed Selenium card error: {e}")
                continue

            if job["title"]:
                jobs.append(job)
    finally:
        driver.quit()

    return jobs


# ===================== 3. KAGGLE DOWNLOADER =====================

import subprocess
import zipfile
import shutil

# Kaggle datasets to download. For each entry:
#   "id"          - "<owner>/<dataset-slug>" passed to `kaggle datasets download -d`
#   "name"        - local sub-folder under KAGGLE_DIR and value of the `_dataset` tag
#   "description" - human-readable note, used only in log output
KAGGLE_DATASETS = [
    {
        "id": "gauravduttakiit/resume-dataset",
        "name": "resume_dataset",
        "description": "2400+ categorized resumes"
    },
    {
        "id": "ravindrasinghrana/job-description-dataset",
        "name": "job_descriptions",
        "description": "Real job descriptions"
    },
    {
        "id": "arshkon/linkedin-job-postings",
        "name": "linkedin_postings",
        "description": "LinkedIn job postings 2023-2024"
    },
    {
        "id": "snehaanbhawal/resume-entities-for-ner",
        "name": "resume_ner",
        "description": "Resume entities for NER training"
    },
    {
        "id": "jillanisofttech/2023-it-professionals-resumes",
        "name": "it_resumes_2023",
        "description": "IT professional resumes"
    },
]


def setup_kaggle_credentials():
    """Make sure Kaggle API credentials are available.

    Credentials are accepted from, in order:
      1. an existing ``~/.kaggle/kaggle.json``;
      2. the ``KAGGLE_USERNAME`` and ``KAGGLE_KEY`` environment variables;
      3. interactive input, which is then saved to ``~/.kaggle/kaggle.json``.

    The API key is read with ``getpass`` so it is not echoed, and the file is
    created with owner-only permissions from the start.

    Returns:
        True if credentials are available, False if the user skipped the
        prompt or entered an empty key.
    """
    import getpass

    kaggle_dir = Path.home() / ".kaggle"
    kaggle_json = kaggle_dir / "kaggle.json"

    if kaggle_json.exists():
        log.info("Kaggle credentials found!")
        return True

    if os.environ.get("KAGGLE_USERNAME") and os.environ.get("KAGGLE_KEY"):
        log.info("Kaggle credentials found in environment variables")
        return True

    log.warning("Kaggle credentials nahi mile!")
    log.info("""
    Kaggle setup karne ke liye:
    1. kaggle.com pe account banao
    2. Account Settings > API > Create New Token
    3. kaggle.json download hoga
    4. Yahan copy karo: ~/.kaggle/kaggle.json
    5. chmod 600 ~/.kaggle/kaggle.json
    """)

    # Manual input option
    username = input("Kaggle username dalo (ya Enter skip ke liye): ").strip()
    if not username:
        return False

    key = getpass.getpass("Kaggle API key dalo: ").strip()
    if not key:
        # Saving a file without a key would only fail later during download.
        log.warning("Empty Kaggle API key entered; credentials not saved")
        return False

    kaggle_dir.mkdir(parents=True, exist_ok=True)
    creds = {"username": username, "key": key}

    # Create the file with mode 0600 directly so the secret is never
    # readable by other users, not even briefly before a chmod.
    fd = os.open(kaggle_json, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        json.dump(creds, f)
    os.chmod(kaggle_json, 0o600)

    log.info("Kaggle credentials save ho gaye!")
    return True


def download_kaggle_datasets():
    """Download every dataset in ``KAGGLE_DATASETS`` and convert it to JSON.

    Each dataset is downloaded and unzipped into ``KAGGLE_DIR/<name>`` with
    the ``kaggle`` command-line tool and then passed to ``_convert_to_json``.
    A failure for one dataset is logged and the remaining ones are still
    processed.

    Side effects:
        May install the ``kaggle`` package; writes ``all_kaggle_data.json``
        into ``KAGGLE_DIR``.

    Returns:
        Combined list of records from all datasets, or ``[]`` when no
        credentials are available.
    """
    import sys

    log.info("=== Kaggle Downloader Start ===")

    if not setup_kaggle_credentials():
        log.error("Kaggle credentials nahi hain, skip kar raha hoon")
        return []

    # Check that the kaggle library is installed
    try:
        import kaggle  # noqa: F401
    except ImportError:
        log.info("Kaggle installing...")
        # Use this interpreter's pip so the package lands in the environment
        # that is actually running (a bare "pip" may belong to another one).
        subprocess.run([sys.executable, "-m", "pip", "install", "kaggle"], check=True)
        import kaggle  # noqa: F401

    # Fail fast if pandas is missing, before spending time on downloads.
    import pandas as pd  # noqa: F401

    all_data = []

    for dataset in KAGGLE_DATASETS:
        log.info(f"Downloading: {dataset['name']} ({dataset['description']})")

        download_path = KAGGLE_DIR / dataset["name"]
        download_path.mkdir(parents=True, exist_ok=True)

        try:
            # Download and unzip
            subprocess.run([
                "kaggle", "datasets", "download",
                "-d", dataset["id"],
                "-p", str(download_path),
                "--unzip"
            ], check=True, capture_output=True)

            log.info(f"Downloaded: {dataset['name']}")

            # Convert the downloaded files to JSON records
            converted = _convert_to_json(download_path, dataset)
            all_data.extend(converted)

        except subprocess.CalledProcessError as e:
            # Decode defensively: the tool's output is not guaranteed UTF-8.
            stderr_text = (e.stderr or b"").decode(errors="replace")
            log.error(f"Download failed for {dataset['name']}: {stderr_text}")
            continue
        except Exception as e:
            log.error(f"Error processing {dataset['name']}: {e}")
            continue

    # Master JSON
    save_json(all_data, KAGGLE_DIR / "all_kaggle_data.json")
    log.info(f"Kaggle Total Records: {len(all_data)}")
    return all_data


def _convert_to_json(folder_path, dataset_info):
    """Convert the CSV/Excel/JSON files of one downloaded dataset to records.

    CSV and XLSX files are loaded with pandas, cleaned (all-empty rows
    dropped, column names lower-cased with underscores, missing values turned
    into ``None``), tagged with ``_source`` / ``_dataset`` / ``_file`` /
    ``_scraped_at`` and additionally saved as ``<stem>_converted.json`` in
    ``folder_path``. JSON files are loaded as-is and appended without tags.
    Files that cannot be read are logged and skipped.

    Args:
        folder_path: Directory (``str`` or ``Path``) searched recursively.
        dataset_info: Dataset entry; only ``dataset_info["name"]`` is used.

    Returns:
        List of all records found in the folder.
    """
    import pandas as pd

    generated_suffix = "_converted.json"
    records = []
    folder = Path(folder_path)

    # Collect input files; skip the *_converted.json files this function
    # writes itself, otherwise a re-run would ingest its own output.
    files = [
        path
        for pattern in ("*.csv", "*.xlsx", "*.json")
        for path in folder.rglob(pattern)
        if not path.name.endswith(generated_suffix)
    ]

    for file in files:
        try:
            # Compare case-insensitively: globbing can match "DATA.CSV" on
            # case-insensitive filesystems.
            suffix = file.suffix.lower()

            if suffix == ".json":
                with open(file, encoding="utf-8") as f:
                    data = json.load(f)
                if isinstance(data, list):
                    records.extend(data)
                else:
                    records.append(data)
                continue

            if suffix == ".csv":
                # read_csv's keyword is ``encoding_errors``; ``errors`` is
                # rejected with a TypeError.
                df = pd.read_csv(file, encoding="utf-8", encoding_errors="replace")
            elif suffix == ".xlsx":
                df = pd.read_excel(file)
            else:
                continue

            # Clean the DataFrame
            df = df.dropna(how="all")
            df.columns = [str(c).lower().replace(" ", "_") for c in df.columns]
            # NaN would be written as the bare token NaN, which is not valid
            # JSON; store missing values as None (-> null) instead.
            df = df.astype(object).where(df.notna(), None)

            file_records = df.to_dict(orient="records")

            # Add provenance metadata
            scraped_at = datetime.now().isoformat()
            for rec in file_records:
                rec["_source"] = "kaggle"
                rec["_dataset"] = dataset_info["name"]
                rec["_file"] = file.name
                rec["_scraped_at"] = scraped_at

            records.extend(file_records)

            # Save a per-file JSON copy
            output_file = folder / f"{file.stem}{generated_suffix}"
            save_json(file_records, output_file)

            log.info(f"Converted {file.name}: {len(file_records)} records")

        except Exception as e:
            log.warning(f"Could not convert {file}: {e}")

    return records


# ===================== 4. MAIN RUNNER =====================

def run_all_scrapers():
    """Run the Kaggle, Indeed and LinkedIn steps in order and report totals.

    A failure in one step is logged and does not stop the following steps.
    The per-source record counts are written to
    ``BASE_DIR / "scraping_summary.json"`` and returned.
    """

    print("""
    ╔══════════════════════════════════════════╗
    ║   Resume Screening AI - Data Scraper    ║
    ║   Sources: LinkedIn + Indeed + Kaggle   ║
    ╚══════════════════════════════════════════╝
    """)

    summary = {
        "started_at": datetime.now().isoformat(),
        "linkedin": 0,
        "indeed": 0,
        "kaggle": 0,
        "total": 0
    }

    separator = "=" * 50

    # (summary key, log heading, label for errors, deferred call).
    # Kaggle goes first because it is the most stable source.
    # Lambdas defer name lookup and execution until inside the try block.
    pipeline = (
        ("kaggle", "STEP 1: Kaggle Datasets", "Kaggle",
         lambda: download_kaggle_datasets()),
        ("indeed", "STEP 2: Indeed Jobs", "Indeed",
         lambda: scrape_indeed(max_per_keyword=200)),
        ("linkedin", "STEP 3: LinkedIn Jobs", "LinkedIn",
         lambda: scrape_linkedin(max_per_keyword=100)),
    )

    for source_key, heading, label, run_step in pipeline:
        log.info("\n" + separator)
        log.info(heading)
        log.info(separator)
        try:
            collected = run_step()
            summary[source_key] = len(collected)
        except Exception as e:
            log.error(f"{label} failed: {e}")

    # Summary
    summary["total"] = sum(summary[name] for name in ("linkedin", "indeed", "kaggle"))
    summary["completed_at"] = datetime.now().isoformat()

    save_json(summary, BASE_DIR / "scraping_summary.json")

    print(f"""
    ╔══════════════════════════════════════════╗
    ║              SCRAPING DONE!             ║
    ╠══════════════════════════════════════════╣
    ║  LinkedIn Jobs  : {summary['linkedin']:<6} records         ║
    ║  Indeed Jobs    : {summary['indeed']:<6} records         ║
    ║  Kaggle Data    : {summary['kaggle']:<6} records         ║
    ║  TOTAL          : {summary['total']:<6} records         ║
    ╚══════════════════════════════════════════╝
    
    Data saved in: ./data/
    """)

    return summary


# Run the full pipeline only when executed as a script (or notebook cell),
# not when this module is imported.
if __name__ == "__main__":
    run_all_scrapers()


2026-02-19 09:32:49,613 [INFO] 
2026-02-19 09:32:49,615 [INFO] STEP 1: Kaggle Datasets
2026-02-19 09:32:49,618 [INFO] === Kaggle Downloader Start ===
2026-02-19 09:32:49,623 [INFO] 
    Kaggle setup karne ke liye:
    1. kaggle.com pe account banao
    2. Account Settings > API > Create New Token
    3. kaggle.json download hoga
    4. Yahan copy karo: ~/.kaggle/kaggle.json
    5. chmod 600 ~/.kaggle/kaggle.json
    



    ╔══════════════════════════════════════════╗
    ║   Resume Screening AI - Data Scraper    ║
    ║   Sources: LinkedIn + Indeed + Kaggle   ║
    ╚══════════════════════════════════════════╝
    


2026-02-19 09:32:54,457 [ERROR] Kaggle credentials nahi hain, skip kar raha hoon
2026-02-19 09:32:54,458 [INFO] 
2026-02-19 09:32:54,459 [INFO] STEP 2: Indeed Jobs
2026-02-19 09:32:54,462 [INFO] === Indeed Scraper Start ===
2026-02-19 09:32:54,466 [INFO] Indeed: Searching 'software engineer'
2026-02-19 09:32:55,238 [INFO] Indeed Selenium fallback for 'software engineer'
2026-02-19 09:33:19,215 [INFO] Indeed 'software engineer': 15 jobs
2026-02-19 09:33:19,226 [INFO] Saved: data\indeed\software_engineer.json
2026-02-19 09:33:28,903 [INFO] Indeed: Searching 'data scientist'
2026-02-19 09:33:29,939 [INFO] Indeed Selenium fallback for 'data scientist'
2026-02-19 09:33:46,431 [INFO] Indeed 'data scientist': 15 jobs
2026-02-19 09:33:46,441 [INFO] Saved: data\indeed\data_scientist.json
2026-02-19 09:33:54,108 [INFO] Indeed: Searching 'machine learning'
2026-02-19 09:33:55,468 [INFO] Indeed Selenium fallback for 'machine learning'
2026-02-19 09:34:11,490 [INFO] Indeed 'machine learning': 15 jo


    ╔══════════════════════════════════════════╗
    ║              SCRAPING DONE!             ║
    ╠══════════════════════════════════════════╣
    ║  LinkedIn Jobs  : 0      records         ║
    ║  Indeed Jobs    : 210    records         ║
    ║  Kaggle Data    : 0      records         ║
    ║  TOTAL          : 210    records         ║
    ╚══════════════════════════════════════════╝

    Data saved in: ./data/
    


✓ Kaggle credentials save ho gaye!
  File: C:\Users\HP\.kaggle\kaggle.json
  Username: kalerii02
✓ Kaggle API working hai!
