This cell installs the packages needed to run Selenium in Google Colab and starts a Colab-compatible Chrome driver.

In [None]:
%pip install -q google-colab-selenium
import google_colab_selenium as gs
driver = gs.Chrome()

!pip install selenium

This cell uses Selenium to collect the top topics in the Research category of the Hugging Face forum. For each topic it records the title, link, owner, reply count, view count, and activity date, then saves the results to `topics.csv`, `topics.json`, and `topics.txt`.

In [None]:
import csv
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

# Scrape the topic list at `url`, then export it as CSV, JSON and TXT.
#
# Flow: start headless Chrome -> open the page -> scroll to trigger lazy
# loading -> read one record per topic -> close the browser -> write files.

# Placeholder stored when a field cannot be located for a topic.
UNKNOWN = "Unknown"

# Column order shared by the CSV header and every record written below.
FIELDNAMES = ["Topic", "Link", "Owner", "Reply Count", "View Count", "Activity Date"]

# Locators. The XPaths start with "." so they are evaluated relative to a
# single topic row instead of the whole document.
TOPIC_LINK_CSS = ".title.raw-link.raw-topic-link"
ROW_XPATH = "./ancestor::tr[1]"
OWNER_XPATH = ".//td[@class='posters topic-list-data']/a[img[contains(@title, 'Original Poster')]]"
REPLIES_XPATH = ".//td[contains(@class, 'num posts-map posts')]//span[@class='number']"
VIEWS_XPATH = ".//td[contains(@class, 'num views')]//span[@class='number']"
ACTIVITY_XPATH = ".//td[contains(@class, 'num topic-list-data')]"


def _first_match(row, xpath, attribute=None):
    """Return one value from the first element matching *xpath* under *row*.

    Args:
        row: The element to search inside, or None when no row was found.
        xpath: XPath evaluated relative to *row*.
        attribute: Attribute name to read; when omitted the element's
            visible text is returned instead.

    Returns:
        The attribute value or text of the first match, or ``UNKNOWN`` when
        *row* is None or nothing matches.
    """
    if row is None:
        return UNKNOWN
    # find_elements returns an empty list rather than raising when absent.
    matches = row.find_elements(By.XPATH, xpath)
    if not matches:
        return UNKNOWN
    if attribute is None:
        return matches[0].text
    return matches[0].get_attribute(attribute)


chrome_options = Options()
for argument in (
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
    "--window-size=1920,1000",
    "--disable-infobars",
    "--disable-popup-blocking",
    "--ignore-certificate-errors",
    "--incognito",
):
    chrome_options.add_argument(argument)

url = "https://discuss.huggingface.co/c/research/7/l/top"

# Scroll settings: pause after each scroll, and cap the number of scrolls.
SCROLL_PAUSE_TIME = 2
MAX_SCROLLS = 5

data = []
driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get(url)

    # Scroll the page to load more topics; stop early once the page height
    # no longer grows, i.e. nothing new was appended.
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(MAX_SCROLLS):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_TIME)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Extract topic data row by row. Reading every field from the topic's
    # own row keeps the columns aligned even if one row lacks a cell;
    # pairing independent page-wide lists by index would shift every later
    # topic in that case.
    # NOTE(review): assumes each topic link sits inside its own <tr>, as the
    # td-based XPaths imply - confirm against the live markup.
    for topic in driver.find_elements(By.CSS_SELECTOR, TOPIC_LINK_CSS):
        enclosing_rows = topic.find_elements(By.XPATH, ROW_XPATH)
        row = enclosing_rows[0] if enclosing_rows else None
        data.append({
            "Topic": topic.text,
            "Link": topic.get_attribute("href"),
            "Owner": _first_match(row, OWNER_XPATH, "data-user-card"),
            "Reply Count": _first_match(row, REPLIES_XPATH),
            "View Count": _first_match(row, VIEWS_XPATH),
            "Activity Date": _first_match(row, ACTIVITY_XPATH, "title"),
        })
finally:
    # Always release the browser, even if loading or extraction failed;
    # otherwise the headless Chrome process is left running.
    driver.quit()

# Save to CSV file
with open("topics.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=FIELDNAMES)
    writer.writeheader()
    writer.writerows(data)

# Save to JSON file
with open("topics.json", "w", encoding="utf-8") as file:
    json.dump(data, file, ensure_ascii=False, indent=4)

# Save to TXT file (numbered, human-readable listing)
with open("topics.txt", "w", encoding="utf-8") as file:
    for i, item in enumerate(data, start=1):
        file.write(f"{i}. Topic: {item['Topic']}\n")
        file.write(f"   Link: {item['Link']}\n")
        file.write(f"   Owner: {item['Owner']}\n")
        file.write(f"   Reply Count: {item['Reply Count']}\n")
        file.write(f"   View Count: {item['View Count']}\n")
        file.write(f"   Activity Date: {item['Activity Date']}\n")
        file.write("-" * 50 + "\n")

print("Data has been saved to 'topics.csv', 'topics.json', and 'topics.txt'.")
