In [2]:
import requests
import bs4
import re
from urllib.parse import quote_plus

# Ferrari models to scrape; each name is later lower-cased and hyphenated
# into a forum tag slug by scrape_multiple_threads.
ferrari_cars = [
    'Ferrari Testarossa',
    'Ferrari Daytona GTB',
    'Ferrari 550 Maranello',
    'Ferrari 355 GTB',
    'Ferrari 308 GTB',
    'Ferrari 330 GTC',
    'Ferrari 250 GTE',
    'Ferrari Dino 246',
    'Ferrari F40',
]

# Helper: normalise the raw text of a single forum post
def _clean_message(raw_text):
    """Return ``raw_text`` with scraping noise removed.

    Whitespace runs are collapsed to one space, the
    'Image Unavailable, Please Login' placeholder is dropped, and a leading
    quoted-reply preamble ('... said: ↑ ... Click to expand...') is stripped.
    """
    cleaned = re.sub(r'\s+', ' ', raw_text)
    cleaned = re.sub(r'Image Unavailable, Please Login', '', cleaned)
    cleaned = cleaned.replace('\xa0', '').strip()
    return re.sub(r'^.*?said: ↑.*?Click to expand\.\.\.', '', cleaned).strip()


# Scrape the threads (and their posts) listed under a single forum tag
def scrape_threads(tag_slug, car_model, pages=2, thread_pages=2, timeout=30):
    """Collect the threads listed under one forum tag together with their posts.

    Args:
        tag_slug: Tag identifier inserted into the ``/forum/tags/<slug>/`` URL.
        car_model: Label stored unchanged under ``'car_model'`` in every record.
        pages: Number of tag-listing pages to fetch.
        thread_pages: Maximum number of pages fetched per thread.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the scrape indefinitely.

    Returns:
        A list of dicts with the keys ``'tag'``, ``'title'``, ``'link'``,
        ``'car_model'``, ``'web'`` (always ``'forum'``) and ``'messages'``
        (list of cleaned post texts).

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
    """
    url = f'https://www.ferrarichat.com/forum/tags/{tag_slug}/'
    threads = []

    for i in range(pages):
        page_url = url if i == 0 else f"{url}page-{i+1}"
        resp = requests.get(page_url, timeout=timeout)
        soup = bs4.BeautifulSoup(resp.text, 'html.parser')

        for item in soup.find_all('li', {'class': 'searchResult'}):
            # Look the anchors up once instead of re-scanning per field.
            anchors = item.find_all('a')
            try:
                tag_anchor, title_anchor = anchors[1], anchors[2]
                # NOTE(review): the link is taken from the same anchor as
                # 'tag' (index 1), not from the title anchor — kept as in the
                # original; confirm this is the intended target.
                href = tag_anchor['href']
            except (IndexError, KeyError):
                # Entry without the expected anchors/href: skip it.
                continue

            threads.append({
                'tag': tag_anchor.text,
                'title': title_anchor.text,
                'link': 'https://www.ferrarichat.com/forum/' + href,
                'car_model': car_model,
                'web': 'forum'  # fixed source identifier
            })

    for thread in threads:
        messages = []
        link = thread['link']
        # Drop any trailing slash so follow-up pages never become '...//page-N'.
        base = link.rstrip('/')

        for j in range(thread_pages):
            page_url = link if j == 0 else f"{base}/page-{j+1}"
            resp = requests.get(page_url, timeout=timeout)
            soup = bs4.BeautifulSoup(resp.text, 'html.parser')

            for content in soup.find_all('div', {'class': 'messageContent'}):
                quote = content.blockquote
                # Posts without a <blockquote> contribute an empty string.
                messages.append(_clean_message(quote.text if quote else ""))

        thread['messages'] = messages

    return threads

# Run the single-tag scraper over a whole list of car models
def scrape_multiple_threads(car_list, pages=2):
    """Scrape every model in ``car_list`` and return all thread records.

    Each model name is lower-cased, its spaces are turned into hyphens and
    the result is URL-quoted to form the tag slug handed to
    ``scrape_threads``. A progress line is printed per model.

    Args:
        car_list: Iterable of model names.
        pages: Forwarded to ``scrape_threads`` as its ``pages`` argument.

    Returns:
        One flat list holding the records of all models, in input order.
    """
    collected = []
    for car in car_list:
        hyphenated = "-".join(car.lower().split(" "))
        slug = quote_plus(hyphenated)
        print(f"🔍 Scraping: {car} (tag: {slug})")
        collected += scrape_threads(slug, car, pages)
    return collected

# Example usage: scrape every model in ferrari_cars, two tag-listing pages each
all_ferrari_threads = scrape_multiple_threads(ferrari_cars, pages=2)


🔍 Scraping: Ferrari Testarossa (tag: ferrari-testarossa)
🔍 Scraping: Ferrari Daytona GTB (tag: ferrari-daytona-gtb)
🔍 Scraping: Ferrari 550 Maranello (tag: ferrari-550-maranello)
🔍 Scraping: Ferrari 355 GTB (tag: ferrari-355-gtb)
🔍 Scraping: Ferrari 308 GTB (tag: ferrari-308-gtb)
🔍 Scraping: Ferrari 330 GTC (tag: ferrari-330-gtc)
🔍 Scraping: Ferrari 250 GTE (tag: ferrari-250-gte)
🔍 Scraping: Ferrari Dino 246 (tag: ferrari-dino-246)
🔍 Scraping: Ferrari F40 (tag: ferrari-f40)


In [6]:
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Location of the Parquet snapshot of the scraped threads
file_path = 'notebooks/data/raw/ferrari_threads.parquet'

if not os.path.exists(file_path):
    # No snapshot on disk yet: build the frame from the in-memory results
    # (relies on `all_ferrari_threads` having been defined beforehand).
    df = pd.DataFrame(all_ferrari_threads)

    # Ensure the destination folder is there before writing
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    # Convert to an Arrow table and write it out as Parquet
    table = pa.Table.from_pandas(df)
    pq.write_table(table, file_path)
    print(f"Dati salvati in: {file_path}")
else:
    # A snapshot already exists: reuse it rather than overwriting it
    df = pd.read_parquet(file_path)
    print("File esistente caricato:")
    print(df.head())



Dati salvati in: notebooks/data/raw/ferrari_threads.parquet
