In [3]:
# Generate a list of links to 51 OLX pages containing cars
def get_page_urls():
    price_lo = 0
    page_urls = []
    for price_lo in range(1, 59000, 1000):
        for page in range(1, 51):
            page_urls.append(f"https://olx.ba/pretraga?attr=&attr_encoded=1&category_id=18&page={page}&price_to={price_lo + 1000}&price_from={price_lo}")
        for page in range(1, 51):
             page_urls.append(f"https://olx.ba/pretraga?attr=&attr_encoded=1&category_id=18&page={page}&price_from=59000")
    return page_urls

page_urls = get_page_urls()
print(page_urls[:5])

['https://olx.ba/pretraga?attr=&attr_encoded=1&category_id=18&page=1&price_to=1001&price_from=1', 'https://olx.ba/pretraga?attr=&attr_encoded=1&category_id=18&page=2&price_to=1001&price_from=1', 'https://olx.ba/pretraga?attr=&attr_encoded=1&category_id=18&page=3&price_to=1001&price_from=1', 'https://olx.ba/pretraga?attr=&attr_encoded=1&category_id=18&page=4&price_to=1001&price_from=1', 'https://olx.ba/pretraga?attr=&attr_encoded=1&category_id=18&page=5&price_to=1001&price_from=1']


In [4]:
# The number of pages containing cars (40 of them?)
print(len(page_urls))

5900


### WEB SCRAPING

In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from tqdm import tqdm

# Boilerplate
firefox_options = webdriver.FirefoxOptions()
firefox_options.add_argument("--headless") 
driver = webdriver.Firefox(options=firefox_options)

# Define the batch number for processing links
batch_num = 2

# Define the file path to save urls to each car-article
article_urls_file_path = 'article_urls.txt'

# Iterate over each link and extract data
for page_url in tqdm(page_urls[100*(batch_num - 1):100*batch_num], total=len(page_urls[100*(batch_num - 1):100*batch_num])):
    page_url = page_url.strip()  # Remove any leading/trailing whitespace
    if not page_url:
        continue  # Skip empty lines

    try:
        # Open the URL
        driver.get(page_url)
        
        # Wait for the page to fully load
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        
        # Give some extra time for JavaScript to execute
        time.sleep(2)

        a_tags = driver.find_elements(By.XPATH, "//a[contains(@href, '/artikal/')]")

        # Extract the href attribute and store in a list
        href_urls = [a_tag.get_attribute('href') for a_tag in a_tags]
        with open(article_urls_file_path, 'a') as file:
            for temp in href_urls:
                file.write(temp + "\n")
        
    except Exception as e:
        print(f"Failed to retrieve {page_url}: {e}")

# Quit the WebDriver session
driver.quit()

print("Finished processing all links.")


100%|██████████| 100/100 [16:45<00:00, 10.05s/it]


Finished processing all links.


### CREATING A CSV FILE (EXPORTING CAR DATA)

In [2]:
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from tqdm import tqdm

# Boilerplate
firefox_options = webdriver.FirefoxOptions()
firefox_options.add_argument("--headless")
driver = webdriver.Firefox(options=firefox_options)

# Define the input file path (of car articles)
input_file_path = "article_urls.txt"

# Define the output CSV file path
output_file_path = 'car_data_new.csv'

# Open the input file and read all lines
with open(input_file_path, 'r') as file:
    urls = file.readlines()

# Define headers for the CSV file
headers = ["URL", "Price", "Transmission", "Year", "Motor Strength (KW)", "Mileage", "Engine Capacity", "Manufacturer", "Model", "Fuel Type"]

# Open the CSV file for writing
with open(output_file_path, 'a', newline='', encoding='utf-8') as csvfile:
    # Create a CSV writer object
    csvwriter = csv.writer(csvfile)
    
    # Write the headers to the CSV file
    csvwriter.writerow(headers)

    # Iterate over each link and extract data
    for url in tqdm(urls, total=len(urls)):
        url = url.strip()  # Remove any leading/trailing whitespace
        if not url:
            continue  # Skip empty lines

        try:
            # Open the URL
            driver.get(url)
            
            # Wait for the page to fully load
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            
            # Give some extra time for JavaScript to execute (adjust if necessary)
            time.sleep(2)

            # Extract the required information
            try:
                # Extract Price
                price_element = driver.find_element(By.CSS_SELECTOR, "span.price-heading.vat")
                price = price_element.text
            except:
                price = "N/A"
            
            try:
                # Extract Transmission
                transmission_element = driver.find_element(By.XPATH, "//td[text()='Transmisija']/following-sibling::td")
                transmission = transmission_element.text.strip()
            except:
                transmission = "N/A"
            
            try:
                # Extract Year
                year_element = driver.find_element(By.XPATH, "//td[text()='Godište']/following-sibling::td")
                year = year_element.text.strip()
            except:
                year = "N/A"
            
            try:
                # Extract Motor Strength
                motor_strength_element = driver.find_element(By.XPATH, "//td[text()='Snaga motora (KW)']/following-sibling::td")
                motor_strength = motor_strength_element.text.strip()
            except:
                motor_strength = "N/A"

            try:
                # Extract Mileage
                mileage_element = driver.find_element(By.XPATH, "//td[text()='Kilometraža']/following-sibling::td")
                mileage = mileage_element.text.strip()
            except:
                mileage = "N/A"

            try:
                # Extract Engine Capacity
                engine_capacity_element = driver.find_element(By.XPATH, "//td[text()='Kubikaža']/following-sibling::td")
                engine_capacity = engine_capacity_element.text.strip()
            except:
                engine_capacity = "N/A"

            try:
                # Extract Manufacturer
                manufacturer_element = driver.find_element(By.XPATH, "//td[text()='Proizvođač']/following-sibling::td/a")
                manufacturer = manufacturer_element.text.strip()
            except:
                manufacturer = "N/A"

            try:
                # Extract Model
                model_element = driver.find_element(By.XPATH, "//td[text()='Model']/following-sibling::td/a")
                model = model_element.text.strip()
            except:
                model = "N/A"

            try:
                # Extract Fuel Type
                fuel_type_element = driver.find_element(By.XPATH, "//td[text()='Gorivo']/following-sibling::td")
                fuel_type = fuel_type_element.text.strip()
            except:
                fuel_type = "N/A"

            if fuel_type == "N/A" or model == "N/A" or manufacturer == "N/A" or price == "N/A" or transmission == "N/A" or year == "N/A" or motor_strength == "N/A" or mileage == "N/A" or engine_capacity == "N/A":
                continue
            
            # Write the extracted information to the CSV file
            row = [url, price, transmission, year, motor_strength, mileage, engine_capacity, manufacturer, model, fuel_type]
            csvwriter.writerow(row)

        except Exception as e:
            print(f"Failed to retrieve {url}: {e}")

# Quit the WebDriver session
driver.quit()

print("Finished processing all links.")


 17%|█▋        | 463/2722 [58:59<251:45:23, 401.21s/it]

Failed to retrieve https://olx.ba/artikal/61737772: Message: Navigation timed out after 300000 ms
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
TimeoutError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:740:5
bail@chrome://remote/content/marionette/sync.sys.mjs:211:19



 19%|█▉        | 523/2722 [1:03:58<4:28:59,  7.34s/it] 


KeyboardInterrupt: 