# Install a browser (Firefox) and a compatible driver


In [None]:
!wget -O firefox.tar.xz "https://download.mozilla.org/?product=firefox-latest&os=linux64&lang=en-US"
!tar -xf firefox.tar.xz
!mv firefox /usr/local/firefox
!ln -s /usr/local/firefox/firefox /usr/bin/firefox

In [None]:
!wget -q "https://github.com/mozilla/geckodriver/releases/latest/download/geckodriver-v0.35.0-linux64.tar.gz"
!tar -xvf geckodriver-v0.35.0-linux64.tar.gz
!chmod +x geckodriver
!mv geckodriver /usr/bin/geckodriver

# Install Selenium WebDriver

In [None]:
!pip install selenium

In [None]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options

# Configure Firefox to run headless
firefox_options = Options()
# NOTE(review): "--no-sandbox" and "--disable-dev-shm-usage" look like
# Chromium-style switches; confirm that Firefox actually honors them.
for flag in (
    "--headless",  # required on Colab
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
  firefox_options.add_argument(flag)
firefox_options.binary_location = "/usr/bin/firefox"

# Point Selenium at the Geckodriver executable
service = Service(executable_path="/usr/bin/geckodriver")

# Start Firefox through Selenium
driver = webdriver.Firefox(options=firefox_options, service=service)

# Import necessary libraries

In [None]:
from bs4 import BeautifulSoup
import regex as re
import pandas as pd
from tqdm import tqdm

# Build the base URL from the user's search criteria

In [None]:
from urllib.parse import urlencode

# Site root; also prepended to the relative offer links found while scraping.
prefix = 'https://www.autoscout24.it'

# Query-string parameters of the listing search, kept as a dict so a single
# criterion can be edited without touching one long URL literal.
# Insertion order is preserved by urlencode, and the comma in `ustate` is
# percent-encoded as %2C, so the resulting URL is identical to the one that
# was previously hard-coded.
search_criteria = {
    'atype': 'C',
    'cy': 'I',
    'desc': '0',
    'doorfrom': '4',
    'doorto': '5',
    'eq': '5',
    'fregfrom': '2010',
    'gear': 'M',
    'kmto': '150000',
    'lat': '45.46362',
    'lon': '9.18812',
    'powertype': 'kw',
    'priceto': '6000',
    'search_id': '114j0wszvq6',
    'sort': 'standard',
    'source': 'detailsearch',
    'ustate': 'N,U',
    'zip': 'milano',
    'zipr': '30',
}

# Listing URL without the page number; "&page=N" is appended by the scraper.
base_url = f"{prefix}/lst?{urlencode(search_criteria)}"

In [2]:
def send_request(url, driver):
  """Open `url` with `driver` and return the loaded page as parsed HTML.

  Args:
    url: address of the page to load.
    driver: object providing `get(url)` and a `page_source` attribute
      (the notebook passes its Selenium Firefox driver).

  Returns:
    A BeautifulSoup document built from `driver.page_source` with the
    "html.parser" backend.
  """
  driver.get(url)
  return BeautifulSoup(driver.page_source, "html.parser")

# Start scraping offers

In [None]:
# Load the first results page; the printed URL makes it easy to open the same
# search in a regular browser.
url = f"{base_url}&page=1"
print(url)
soup = send_request(url=url, driver=driver)

https://www.autoscout24.it/lst?atype=C&cy=I&desc=0&doorfrom=4&doorto=5&eq=5&fregfrom=2010&gear=M&kmto=150000&lat=45.46362&lon=9.18812&powertype=kw&priceto=6000&search_id=114j0wszvq6&sort=standard&source=detailsearch&ustate=N%2CU&zip=milano&zipr=30&page=1


In [None]:
# Read the total number of result pages from the pagination indicator.
# The indicator's <span> text is split on "/" and the last piece is the count.

list_item = soup.find("li", class_ = 'pagination-item--disabled pagination-item--page-indicator')
if list_item is None:
  # No indicator in the parsed page. Presumably the results fit on a single
  # page (TODO confirm against the live site), so only the page that is
  # already loaded is scraped instead of failing with an AttributeError.
  num_pages = 1
else:
  num_pages = int(list_item.find('span').text.split("/")[-1].strip())

print(num_pages)

16


In [None]:
# Iterate over every results page and collect one dict per offer in `offers`.
#
# Page 1 is already loaded in `soup`; every later page is fetched at the start
# of its own iteration, so no request is made for a page past the last one.
# The browser is closed in `finally` so it is released even when scraping
# fails part-way through.

offers = []

try:
  for page in tqdm(range(1, num_pages + 1)):

    # page 1 was fetched before the loop; load the others on demand
    if page > 1:
      soup = send_request(base_url + f"&page={page}", driver)

    print(f"Start of page: {page}\n\n")

    # get all offers in the current page
    main = soup.find("main", class_ = "ListPage_main___0g2X")
    current_offers = main.find_all("article")
    for offer in current_offers:

      # get all relevant data from the article's data-* attributes
      make = offer['data-make']
      model = offer['data-model']
      price = offer['data-price']
      mileage = offer['data-mileage']
      first_registration = offer['data-first-registration']

      # data processing
      make_model = make + ' ' + model
      price = int(price)
      mileage = int(mileage)

      # the registration is split on "-" into month and year; any value
      # without both parts (presumably unregistered/new cars — verify on the
      # site) is stored as missing instead of raising IndexError
      month_year = first_registration.split("-")
      if len(month_year) >= 2:
        month = month_year[0]
        year = month_year[1]
      else:
        month = None
        year = None

      # the title anchor holds a site-relative href, hence the prefix
      link = prefix + offer.find('a', class_ = 'ListItem_title__ndA4s ListItem_title_new_design__QIU2b Link_link__Ajn7I')['href']

      offer_dict = {
          'Model': make_model,
          'Price': price,
          'Mileage': mileage,
          'Month': month,
          'Year': year,
          'Link': link
      }

      # save the offer
      offers.append(offer_dict)

      # print the name of the car related to the offer
      print(offer_dict['Model'])

    print(f"\n\nEnd of page:{page}\n----------------------------------- ")
finally:
  # always shut the browser down, even after an error
  driver.quit()

# Save results



In [None]:
# Turn the collected offers into a table (one row per offer dict) and write
# it to an Excel file without the index column.

df = pd.DataFrame(data=offers)
df.to_excel(excel_writer='used_cars.xlsx', index=False)

In [None]:
# Sort key -> ascending flag, in priority order: lowest mileage first, then
# most recent year and month, then lowest price.
sort_order = {
    'Mileage': True,
    'Year': False,
    'Month': False,
    'Price': True,
}

df_sorted = df.sort_values(
    by=list(sort_order.keys()),
    ascending=list(sort_order.values()),
)

# preview the first rows
df_sorted.head()

In [None]:
# Write the sorted table to its own Excel file, again without the index column.
df_sorted.to_excel(excel_writer='used_cars_sorted.xlsx', index=False)