In [1530]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import time

# Scrape job listings from the helloworld.rs search-result pages and save them
# to HelloWorld.csv, one row per job (title, URL, location, description).

# Starting page value and the step between consecutive requests
START_PAGE = 1
PAGE_INCREMENT = 30
MAX_PAGES = 20  # Number of requests to make (page=1, 31, 61, 91, 121, ...)

# Request settings are loop-invariant, so they are defined once up front.
HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:137.0) Gecko/20100101 Firefox/137.0"}
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks forever

# CSS class strings used to locate the relevant parts of the results page
RESULTS_CLASS = "max-w-5xl mx-auto flex flex-col gap-4 mb-8 __search-results"
JOB_LINK_CLASS = "__ga4_job_title hover:opacity-50 font-bold text-lg"
JOB_CARD_CLASS = "flex flex-col gap-4 flex-1 px-4 md:pl-4 mb-4 w-full"

# Column names (and order) of the resulting DataFrame / CSV file
COLUMNS = ["titles", "urls", "location", "descriptions"]

# One dict per scraped job, accumulated across all pages. Building the
# DataFrame once at the end avoids re-copying it on every page.
job_rows = []

for page_num in range(MAX_PAGES):
    # page_num == 0 yields START_PAGE, so no special case is needed.
    # NOTE(review): the `page` parameter appears to behave like an offset
    # (30 jobs per request) -- confirm 1, 31, 61, ... is the intended sequence.
    page_value = START_PAGE + page_num * PAGE_INCREMENT

    search_url = f"https://www.helloworld.rs/oglasi-za-posao?page={page_value}&disable_saved_search=1"
    print(f"Scraping from {search_url}")

    # Fetch the page; a network failure skips this page instead of aborting
    # the whole run, mirroring how non-200 responses are handled below.
    try:
        response = requests.get(search_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to fetch page {page_value}: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to fetch page {page_value} with status code: {response.status_code}")
        continue

    soup = bs(response.text, 'html.parser')

    # Check if jobs container exists
    get_info = soup.find("div", class_=RESULTS_CLASS)
    if not get_info:
        print(f"No job container found on page {page_value}")
        continue

    # Titles, links and job cards are collected separately and paired by
    # position. The card supplies both location and description.
    # NOTE(review): positional pairing assumes each job contributes exactly
    # one <h3>, one title link and one card -- verify against the page markup.
    get_job_titles = get_info.find_all('h3')
    get_job_urls = get_info.find_all('a', class_=JOB_LINK_CLASS)
    get_job_cards = get_info.find_all('div', class_=JOB_CARD_CLASS)

    min_length = min(len(get_job_titles), len(get_job_urls), len(get_job_cards))
    print(f"Found {min_length} jobs on page {page_value}")

    page_rows = []
    # zip() stops at the shortest list, i.e. after min_length jobs.
    for title_tag, link_tag, card in zip(get_job_titles, get_job_urls, get_job_cards):
        # A link without an href yields an empty URL rather than a bare domain.
        href = link_tag.get('href')
        url = "https://helloworld.rs" + href if href else ""

        # First <p> of the card is used as the location, second as the
        # description; a missing paragraph yields an empty string.
        paragraphs = card.find_all('p')
        location = paragraphs[0].text.strip() if paragraphs else ""
        des = paragraphs[1].text if len(paragraphs) > 1 else ""

        page_rows.append({
            "titles": title_tag.text.strip(),
            "urls": url,
            "location": location,
            "descriptions": des,
        })

    # Debugging: Print a sample to verify uniqueness
    if page_rows:
        print(f"Sample title from page {page_value}: {page_rows[0]['titles']}")

    job_rows.extend(page_rows)
    print(f"Total jobs accumulated so far: {len(job_rows)}")

    # Add a small delay to avoid overwhelming the server
    time.sleep(1)

# Build the final DataFrame from all collected rows
data_frame = pd.DataFrame(job_rows, columns=COLUMNS)

# Save all accumulated data to CSV (the status message says "Excel", but the
# file written is a CSV)
data_frame.to_csv('HelloWorld.csv', index=False, encoding='utf-8')
print('Excel done')
print(f"Total unique jobs scraped: {len(data_frame.drop_duplicates())}")


Scraping from https://www.helloworld.rs/oglasi-za-posao?page=1&disable_saved_search=1
Found 30 jobs on page 1
Sample title from page 1: Šef IT sektora Polovnih Automobila (Head of Software Development)
Total jobs accumulated so far: 30
Scraping from https://www.helloworld.rs/oglasi-za-posao?page=31&disable_saved_search=1
Found 30 jobs on page 31
Sample title from page 31: Microsoft 365 Analyst
Total jobs accumulated so far: 60
Scraping from https://www.helloworld.rs/oglasi-za-posao?page=61&disable_saved_search=1
Found 30 jobs on page 61
Sample title from page 61: SAP Consultant HCM
Total jobs accumulated so far: 90
Scraping from https://www.helloworld.rs/oglasi-za-posao?page=91&disable_saved_search=1
Found 30 jobs on page 91
Sample title from page 91: Senior Software Developer
Total jobs accumulated so far: 120
Scraping from https://www.helloworld.rs/oglasi-za-posao?page=121&disable_saved_search=1
Found 30 jobs on page 121
Sample title from page 121: Cloud Infrastructure Administrator
