## Import of required Modules
Make sure, that the imported moduls are installed.

In [1]:
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.remote.webelement import WebElement

#### set up Selenium and Chrome Driver 
We use selenium with Chrome and tested the scraper with the chromedriver. You need the latest version of the driver from https://chromedriver.chromium.org/. Alternatively, change to the driver to a driver of your preference.
We set up the scraper to run in the background, if you wish to run it in regular window mode, remove the line  "chrome_options.add_argument("--headless")".

In [2]:
# chromedriver setup

serv = Service(r'driver/chromedriver') #path from 'which chromedriver'

# test driver
# for headless chrome mode
chrome_options = Options()

# remove this line if you do not wish to run in background 
# chrome_options.add_argument("--headless") 

### Data classes

In [3]:
from dataclasses import dataclass, field
from typing import Literal, List

@dataclass
class Article:
    source: str
    headline: str
    link: str

    rating: Literal["left", "lean left", "center", "lean right", "right"]
    summary: str
    image_link: str

@dataclass
class Story:
    date: str

    headline: str
    headline_link: str

    topic: str
    topic_link: str

    summary: str = ""
    tags: List[str] = field(default_factory=lambda: [])

    left: Article | None = None
    center: Article | None = None
    right: Article | None = None

### Retrieval functions

In [4]:
def get_brief_stories_on_page(driver: webdriver.Chrome) -> List[Story]:
    """Get stories from headline_roundup

    Args:
        driver (WebDriver): Webdriver with loaded overview page

    Returns:
        List[Story]: List of stories (without more detailed information (news articles...))
    """
    stories: list[Story] = []

    story_rows = driver.find_elements(By.CSS_SELECTOR, ".views-table tbody tr")

    # loop trough all news stories for the given date (range)
    for row in story_rows:
        cols = row.find_elements(By.CSS_SELECTOR, "td")
        headline = cols[0].find_element(By.CSS_SELECTOR, "a")
        topic = cols[1].find_element(By.CSS_SELECTOR, "a")
        date = cols[2].find_element(By.CSS_SELECTOR, "span").text

        stories.append(Story(
            date,
            headline.text,
            headline.get_attribute("href"),
            topic.text,
            topic.get_attribute("href")
        ))
    return stories
    

def get_article_information(section: WebElement) -> Article:
    title = section.find_element(By.CSS_SELECTOR, ".news-title")
    source = section.find_element(By.CSS_SELECTOR, ".news-source span")
    raw_rating = section.find_element(By.CSS_SELECTOR, ".source-area img").get_attribute("title")

    ratings = {
        "AllSides Media Bias Rating: Left": "left", 
        "AllSides Media Bias Rating: Lean Left": "lean left", 
        "AllSides Media Bias Rating: Center": "center", 
        "AllSides Media Bias Rating: Lean Right": "lean right", 
        "AllSides Media Bias Rating: Right": "right",
        "AllSides Media Bias Rating: Mixed": "mixed",
        "": "right", # TODO check if always correct?
    }
    if not raw_rating:  
        print("No rating title: ", source.text)
        # CHECK: it seems that this is always happening for the Right rating
        # Manual assignment below
        raw_rating = "AllSides Media Bias Rating: Right"
    if "Mixed" in raw_rating:
        print(source.text)

    try:
        image = section.find_element(By.CSS_SELECTOR, ".headline-roundup-image img")
        image_link = image.get_attribute("src")
    except:
        image_link = ""
    body = section.find_elements(By.CSS_SELECTOR, ".news-body .body-contents")
    summary = ""
    for paragraph in body:
        summary += f"{paragraph.text}\n"
    summary = summary.strip()

    return Article(
        source.text,
        title.text,
        title.get_attribute("href"),
        ratings[raw_rating],
        summary,
        image_link
    )


def get_story_details(driver: webdriver.Chrome, stories: List[Story]):
    """Adds information from story page to each story in stories

    Args:
        driver (WebDriver): _description_
        stories (List[Story]): List of stories without detailed information
    """
    for story in stories:
        driver.get(story.headline_link)
        wait = WebDriverWait(driver, 7)
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.view-content')))

        tags = driver.find_elements(By.CSS_SELECTOR, ".page-tags a")
        story.tags = [tag.text for tag in tags]

        paragraphs = driver.find_elements(By.CSS_SELECTOR, ".story-id-page-description p")
        for paragraph in paragraphs:
            story.summary += f"{paragraph.text}\n"
        story.summary = story.summary.strip()

        article_sections = driver.find_elements(By.CSS_SELECTOR, ".featured-coverage .news-item")

        for section in article_sections:
            if "left" in section.get_attribute("class"):
                story.left = get_article_information(section)
            if "center" in section.get_attribute("class"):
                story.center = get_article_information(section)
            if "right" in section.get_attribute("class"):
                story.right = get_article_information(section)


### Retrieve Article Links for Crawl

In [5]:
driver = webdriver.Chrome()

url = 'https://www.allsides.com/headline-roundups'
wait = WebDriverWait(driver, 10)

### Get brief story information and links

In [6]:
stories = get_brief_stories_on_page(driver)
NUM_PAGES = 1 # TODO adjust for usecase
for i in range(1, NUM_PAGES + 1):
    driver.get(f"{url}?page={i}")
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.views-field a')))
    stories.extend(get_brief_stories_on_page(driver))

# display(stories)

### Fill in story information and articles

In [7]:
get_story_details(driver, stories)
# display(stories)


No rating title:  Fox News Digital
AllSides Media Bias Rating: Left
AllSides Media Bias Rating: Center
AllSides Media Bias Rating: Lean Left

No rating title:  The Daily Caller

No rating title:  Townhall
AllSides Media Bias Rating: Left

No rating title:  Daily Mail
AllSides Media Bias Rating: Lean Left
AllSides Media Bias Rating: Center
AllSides Media Bias Rating: Left
AllSides Media Bias Rating: Center
AllSides Media Bias Rating: Lean Right
AllSides Media Bias Rating: Lean Left
AllSides Media Bias Rating: Center
AllSides Media Bias Rating: Lean Right
AllSides Media Bias Rating: Lean Right
AllSides Media Bias Rating: Center
AllSides Media Bias Rating: Lean Left
AllSides Media Bias Rating: Lean Right
AllSides Media Bias Rating: Center
AllSides Media Bias Rating: Left
AllSides Media Bias Rating: Lean Left
AllSides Media Bias Rating: Center

No rating title:  Fox News Digital
AllSides Media Bias Rating: Center
AllSides Media Bias Rating: Lean Right
AllSides Media Bias Rating: Lean Left

### Saving crawl

In [8]:
import pickle

with open('allsides_crawl.pkl', 'wb') as f:
    pickle.dump(stories, f)

In [9]:
stories

[Story(date='2025-04-07', headline='Supreme Court Pauses Deadline to Return Kilmar Abrego Garcia to US', headline_link='https://www.allsides.com/story/politics-ahead-key-deadline-debate-continues-over-kilmar-abrego-garcias-deportation', topic='Politics', topic_link='https://www.allsides.com/topics/politics', summary='The Supreme Court on Monday paused a deadline requiring the Trump administration to return recently-deported Kilmar Abrego Garcia to the US from El Salvador.\nThe Details: On Friday, U.S. Federal District Judge Paula Xinis said that the Salvadoran national had been illegally deported to El Salvador and that he must be returned to the U.S. by midnight Monday. Xinis found that there was little to no evidence that Abrego Garcia was once an MS-13 gang member. Attorney General Pam Bondi said that ICE members have testified that Abrego Garcia, a former Maryland resident, is a member of the criminal gang.\nKey Quotes: “As defendants acknowledge, they had no legal authority to arr

In [19]:
import json

with open('allsides_crawl.jsonl', 'w') as f:
    for story in stories: 
        entry = {
            "date": story.date, 
            "headline": story.headline, 
            "headline_link": story.headline_link, 
            "topic": story.topic, 
            "topic_link": story.topic_link, 
            "summary": story.summary,
            "article_left": {},
            "article_center": {},
            "article_right": {},
        }

        if story.left:
            article = story.left
            entry["article_left"] = {
                "headline": article.headline, 
                "link": article.link, 
                "rating": str(article.rating), 
                "summary": article.summary, 
                "image_link": article.image_link, 
            }

        if story.center:
            article = story.center
            entry["article_center"] = {
                "headline": article.headline, 
                "link": article.link, 
                "rating": str(article.rating), 
                "summary": article.summary, 
                "image_link": article.image_link, 
            }
        if story.right:
            article = story.right
            entry["article_right"] = {
                "headline": article.headline, 
                "link": article.link, 
                "rating": str(article.rating), 
                "summary": article.summary, 
                "image_link": article.image_link, 
            }

        json.dump(entry, f)
        f.write('\n')
        



NameError: name 'outfile' is not defined