# Data Gathering - Forums

In [1]:
import random
import re
import time
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
import requests.auth as auth
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# 500e - SpeakEV Forum

In [2]:
# Function to retrieve thread links from the current page
def get_thread_links(driver):
    """Collect the result links shown on the page currently loaded in ``driver``.

    Args:
        driver: Selenium WebDriver; its ``page_source`` is parsed as HTML.

    Returns:
        list: the ``href`` value of the first ``<a>`` inside every
        ``<h3 class="contentRow-title">`` heading, in page order and exactly as
        written in the markup. Headings without a link, or whose link has no
        ``href``, are skipped.
    """
    soup = BeautifulSoup(driver.page_source, "html.parser")  # Parse the HTML of the page
    speakev_urls = []
    for heading in soup.find_all("h3", class_="contentRow-title"):
        anchor = heading.find("a")
        # Indexing a missing anchor / missing href would raise and abort the
        # whole crawl, so such headings are skipped instead.
        href = anchor.get("href") if anchor is not None else None
        if href:
            speakev_urls.append(href)
    return speakev_urls

In [3]:
# Function to scrape content and timestamp from a thread page
def scrape_thread_content(driver, url):
    """Load ``url`` in ``driver`` and extract the title, timestamp and text.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the page to load.

    Returns:
        tuple: ``(title, timestamp, content)`` where
        - ``title`` is the stripped text of the ``h1.MessageCard__thread-title``
          found inside a ``MessageCard__title-wrapper`` div or, failing that,
          inside a ``header-content-wrapper`` div; ``"N/A"`` if neither exists.
        - ``timestamp`` is the ``datetime`` attribute of the first
          ``<time class="u-dt">`` tag, or ``"N/A"`` if the tag or the attribute
          is missing.
        - ``content`` is the stripped text of the first ``div.bbWrapper``, or
          ``"Content not found"``.
    """
    driver.get(url)
    time.sleep(3)  # Wait for the page to load completely
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # The thread title lives under one of two wrapper layouts; try them in order.
    title = "N/A"
    for wrapper_class in ("MessageCard__title-wrapper", "header-content-wrapper"):
        wrapper = soup.find("div", class_=wrapper_class)
        if wrapper is None:
            continue
        title_tag = wrapper.find("h1", class_="MessageCard__thread-title")
        if title_tag is not None:
            title = title_tag.text.strip()
            break

    # Scrape timestamp; .get() keeps a <time> tag without a datetime attribute
    # from raising KeyError and aborting the crawl.
    timestamp_tag = soup.find("time", class_="u-dt")
    timestamp = timestamp_tag.get("datetime", "N/A") if timestamp_tag is not None else "N/A"

    # Scrape content
    content_div = soup.find("div", class_="bbWrapper")
    content = content_div.text.strip() if content_div is not None else "Content not found"

    return title, timestamp, content

In [4]:
# Initialise the WebDriver
driver = webdriver.Chrome()
# NOTE(review): the numeric segment in this search URL looks like a stored-search id
# and may stop resolving after a while — confirm before re-running.
url = "https://www.speakev.com/search/1090946/?q=fiat+500e+-abarth&c[searchProfileName]=control&o=relevance"
driver.get(url)

# Accept cookies.
# Wait (up to 10 seconds) until the button is clickable instead of looking it up
# immediately: find_element raises NoSuchElementException if the consent dialog has
# not been added to the page yet. This mirrors the PistonHeads cell further down.
cookie = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, '//*[@id="qc-cmp2-ui"]/div[2]/div/button[2]'))
)
cookie.click()

In [5]:
# Initialise a list to hold all the URLs
all_speakev_urls = []

# Number of result pages to walk (SpeakEV had 22 pages for this search).
# Kept in one place so the loop bound and the "last page" check cannot drift apart.
SPEAKEV_PAGE_COUNT = 22
# NOTE(review): this element id looks auto-generated and may differ between
# sessions — confirm it still points at the "next page" control before re-running.
SPEAKEV_NEXT_PAGE_XPATH = '//*[@id="js-XFUniqueId67"]'

for page_num in range(1, SPEAKEV_PAGE_COUNT + 1):
    # Get links from the current page
    speakev_urls = get_thread_links(driver)
    all_speakev_urls.extend(speakev_urls)

    # Nothing to click after the last page
    if page_num == SPEAKEV_PAGE_COUNT:
        break

    # find_elements returns an empty list instead of raising, so a missing
    # button ends pagination cleanly and keeps the URLs gathered so far.
    next_page_buttons = driver.find_elements(By.XPATH, SPEAKEV_NEXT_PAGE_XPATH)
    if not next_page_buttons:
        print(f"No next-page button found on page {page_num}; stopping pagination early.")
        break
    next_page_buttons[0].click()

    # Wait for the new page to load
    time.sleep(5)  # Adjust as needed

In [6]:
# Scrape content and timestamps from each thread
speakev_posts = []
base_url = "https://www.speakev.com"

try:
    for url in all_speakev_urls:
        # urljoin resolves root-relative, relative and already-absolute hrefs;
        # plain concatenation is only correct for hrefs that start with "/".
        full_url = urljoin(base_url, url)
        title, timestamp, content = scrape_thread_content(driver, full_url)
        speakev_posts.append({'URL': full_url, 'Timestamp': timestamp, 'Title': title, 'Content': content})
        print(f"Scraped content from: {full_url}")
finally:
    # Close the browser even if a page fails, so no Chrome process is left
    # behind; rows collected before the failure remain in speakev_posts.
    driver.quit()

Scraped content from: https://www.speakev.com/forums/EV-Classifieds/?listing_type=sell
Scraped content from: https://www.speakev.com/threads/fiat-500e-icon-charging-cable.178951/post-3470630
Scraped content from: https://www.speakev.com/threads/what-charging-cables-come-with-the-fiat-500e-icon.166541/post-3177426
Scraped content from: https://www.speakev.com/threads/fiat-500e-service-in-seattle-eastside.173077/post-3331356
Scraped content from: https://www.speakev.com/threads/thoughts-on-the-fiat-500e-designio-by-kahn.179877/post-3492623
Scraped content from: https://www.speakev.com/threads/fiat-500e-charging-issues.180301/post-3501970
Scraped content from: https://www.speakev.com/threads/first-car-fiat-500e-icon-2022.179501/post-3483811
Scraped content from: https://www.speakev.com/threads/fiat-500e-easywallbox-charger-installation.161038/post-3056369
Scraped content from: https://www.speakev.com/threads/fiat-500e-for-sale-on-uk-plates.164024/post-3121840
Scraped content from: https:/

In [7]:
# Build a DataFrame from the collected SpeakEV rows and write it out as CSV
# (the row index is not written to the file)
speakev_500 = pd.DataFrame(data=speakev_posts)
speakev_500.to_csv(path_or_buf="500e - SpeakEV_20240724.csv", index=False)

# 500e - PistonHeads Forum

In [19]:
# Function to retrieve thread links from the current page
def get_thread_links(driver):
    """Return the PistonHeads result URLs found on the page loaded in ``driver``.

    Every ``<a class="gs-title">`` is inspected; its ``data-ctorig`` attribute is
    kept when it is non-empty and contains ``pistonheads.com``. The first four
    kept entries are then dropped (treated as sponsored posts) and the rest are
    returned in page order.

    NOTE: this definition reuses the name of the SpeakEV helper defined earlier
    in the notebook and replaces it once this cell has run.
    """
    parsed_page = BeautifulSoup(driver.page_source, "html.parser")  # Parse the HTML of the page

    # Original target of each result anchor (None when the attribute is absent)
    original_targets = (
        anchor.get('data-ctorig')
        for anchor in parsed_page.find_all("a", class_="gs-title")
    )
    on_site_urls = [
        target for target in original_targets
        if target and 'pistonheads.com' in target
    ]

    # Skip the first 4 links (sponsored posts)
    return on_site_urls[4:]

In [20]:
# Function to scrape all posts from a thread page
def scrape_thread_content(driver, url):
    """Load ``url`` in ``driver`` and return one record per post on that page.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the thread page to load.

    Returns:
        list[dict]: one dict per ``div.topic-reply`` block, with the keys
        ``'Date'`` (stripped text of ``span.timestamp`` or ``"N/A"``),
        ``'URL'`` (the ``url`` argument), ``'Title'`` (stripped text of the
        first ``div.title`` on the page or ``"N/A"``) and ``'Comment'``
        (stripped text of the post body or ``"Content not found"``).
        Only the page that was loaded is read.

    NOTE: this definition reuses the name of the SpeakEV helper defined earlier
    in the notebook and replaces it once this cell has run.
    """
    driver.get(url)
    time.sleep(3)  # Wait for the page to load completely
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Scrape forum title
    title_tag = soup.find("div", class_="title")
    title = title_tag.text.strip() if title_tag is not None else "N/A"

    posts = []

    # Find all message blocks
    for message_block in soup.find_all("div", class_="topic-reply"):
        # Scrape timestamp
        timestamp_tag = message_block.find("span", class_="timestamp")
        timestamp = timestamp_tag.text.strip() if timestamp_tag is not None else "N/A"

        # Scrape content. A CSS selector matches a div carrying both classes in
        # any order (and alongside other classes); class_="phml msg-body" only
        # matches when the class attribute is exactly that string.
        content_div = message_block.select_one("div.phml.msg-body")
        content = content_div.text.strip() if content_div is not None else "Content not found"

        posts.append({'Date': timestamp, 'URL': url, 'Title': title, 'Comment': content})

    return posts

In [22]:
# Start a Chrome session and open the PistonHeads search page for the query
url = "https://www.pistonheads.com/search#gsc.tab=0&gsc.q=fiat%20500e%20-abarth&gsc.sort=&gsc.page=1"
driver = webdriver.Chrome()
driver.get(url)

# Block for up to 5 seconds until the cookie acceptance button can be clicked,
# then click it
consent_button_locator = (By.XPATH, '//*[@id="qc-cmp2-ui"]/div[2]/div/button[3]')
consent_wait = WebDriverWait(driver, 5)
cookie_button = consent_wait.until(EC.element_to_be_clickable(consent_button_locator))
cookie_button.click()

In [23]:
# Initialise a list to hold all the unique URLs.
# A list plus a "seen" set keeps first-seen order, so the crawl order (and the
# row order of the saved CSV) is the same on every run; iterating a set of
# strings gives an order that can change between Python processes.
all_pistonheads_urls = []
seen_pistonheads_urls = set()

# Number of result pages to walk (PistonHeads had 9 pages for this search).
# Kept in one place so the loop bound and the "last page" check cannot drift apart.
PISTONHEADS_PAGE_COUNT = 9

for page_num in range(1, PISTONHEADS_PAGE_COUNT + 1):
    # Get links from the current page, keeping only ones not seen before
    pistonheads_urls = get_thread_links(driver)
    for thread_url in pistonheads_urls:
        if thread_url not in seen_pistonheads_urls:
            seen_pistonheads_urls.add(thread_url)
            all_pistonheads_urls.append(thread_url)

    # Nothing to click after the last page
    if page_num == PISTONHEADS_PAGE_COUNT:
        break

    # Click on the page number button for the following page. find_elements
    # returns an empty list instead of raising, so a missing button ends
    # pagination cleanly and keeps the URLs gathered so far.
    next_page_buttons = driver.find_elements(
        By.XPATH, f'//div[@class="gsc-cursor-page" and @aria-label="Page {page_num + 1}"]'
    )
    if not next_page_buttons:
        print(f"No button for page {page_num + 1} found; stopping pagination early.")
        break
    next_page_buttons[0].click()

    # Wait for the new page to load
    time.sleep(5)  # Adjust as needed

In [24]:
# Scrape content from each unique URL
pistonhead_posts = []

try:
    for url in all_pistonheads_urls:
        posts = scrape_thread_content(driver, url)
        pistonhead_posts.extend(posts)
        print(f"Scraped content from: {url}")

        # Introduce a delay to avoid hitting rate limits
        time.sleep(random.uniform(5, 10))
finally:
    # Close the browser even if a page fails, so no Chrome process is left
    # behind; rows collected before the failure remain in pistonhead_posts.
    driver.quit()

Scraped content from: https://www.pistonheads.com/gassing/topic.asp?h=0&f=23&t=2031582&i=20
Scraped content from: https://www.pistonheads.com/gassing/topic.asp?h=0&f=247&t=2033933
Scraped content from: https://www.pistonheads.com/gassing/topic.asp?h=0&f=247&t=1964461
Scraped content from: https://www.pistonheads.com/gassing/topic.asp?h=0&f=108&t=1062784
Scraped content from: https://www.pistonheads.com/gassing/topic.asp?h=0&f=47&t=2088752
Scraped content from: https://www.pistonheads.com/gassing/topic.asp?h=0&f=23&t=2010896&i=20
Scraped content from: https://www.pistonheads.com/gassing/topic.asp?h=0&f=23&t=2065676&i=20
Scraped content from: https://www.pistonheads.com/gassing/topic.asp?h=0&f=247&t=2043934
Scraped content from: https://www.pistonheads.com/gassing/topic.asp?h=0&f=255&t=2017918
Scraped content from: https://www.pistonheads.com/gassing/topic.asp?h=0&f=255&t=2038275&i=20
Scraped content from: https://www.pistonheads.com/gassing/topic.asp?h=0&f=247&t=1857926
Scraped content 

In [25]:
# Build a DataFrame from the collected PistonHeads rows and write it out as CSV
# (the row index is not written to the file)
pistonheads_500 = pd.DataFrame(data=pistonhead_posts)
pistonheads_500.to_csv(path_or_buf="500e - PistonHeads_20240724.csv", index=False)