## Settings
Set the variables below according to your use case.

In [1]:
# How many pages from https://www.allsides.com/headline-roundups to crawl?
# Both bounds are inclusive: the crawl loop requests `?page=PAGE_START - 1`
# up to and including `?page=PAGE_END - 1`.
PAGE_START = 1
PAGE_END = 12

## Import of required Modules
Make sure that the imported modules are installed.

In [2]:
import os
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.remote.webelement import WebElement

#### set up Selenium and Chrome Driver 
We use Selenium with Chrome and tested the scraper with chromedriver. You need the latest version of the driver from https://chromedriver.chromium.org/. Alternatively, switch to a driver of your preference.
The scraper can run in the background (headless): to enable this, uncomment the line `chrome_options.add_argument("--headless")`; leave it commented out to run in a regular browser window.

In [3]:
# chromedriver setup

# Service object pointing at the chromedriver binary (path relative to the working directory)
serv = Service(r'driver/chromedriver') #path from 'which chromedriver'

# Chrome options object; start-up flags such as headless mode are added to it below
chrome_options = Options()

# Uncomment the next line to run Chrome in the background (headless mode)
# chrome_options.add_argument("--headless") 

### Data classes

In [4]:
from dataclasses import dataclass, field
from typing import Literal, List

@dataclass
class Article:
    """A single news article as listed on a story page.

    All fields are plain strings. `rating` is one of the five bias labels,
    or "unknown" when the bias could not be determined from the rating image.
    """
    source: str       # name of the publishing outlet
    headline: str     # article title text
    link: str         # URL the title points to
    rating_img: str   # URL of the bias-rating image ("" if none was found)
    # "unknown" is included because the scraper falls back to it when the
    # rating image name matches none of the known bias markers.
    rating: Literal["left", "lean left", "center", "lean right", "right", "unknown"]
    summary: str      # article teaser text
    image_link: str   # URL of the article image ("" if none was found)
    news_type: str    # label such as the kind of piece ("" if not set)

    def to_json(self):
        """Return the article as a JSON-serialisable dict keyed by field name."""
        return {
            "source": self.source,
            "headline": self.headline,
            "link": self.link,
            "rating_img": self.rating_img,
            "rating": self.rating,
            "summary": self.summary,
            "image_link": self.image_link,
            "news_type": self.news_type,
        }

@dataclass
class Story:
    """One headline roundup: overview data plus the articles attached to it.

    Only date, headline and topic (text and link) are required; everything
    else starts out empty and can be filled in afterwards.
    """
    date: str

    headline: str
    headline_link: str

    topic: str
    topic_link: str
    tags: List[str] = field(default_factory=list)

    summary: str = ""

    # featured article per stance (None until one has been set)
    left: Article | None = None
    center: Article | None = None
    right: Article | None = None

    # additional articles per stance
    more_left: List[Article] = field(default_factory=list)
    more_center: List[Article] = field(default_factory=list)
    more_right: List[Article] = field(default_factory=list)

    def to_json(self):
        """Return the story as a JSON-serialisable dict.

        A missing featured article is emitted as an empty string; the
        `more_*` lists are emitted as lists of article dicts.
        """
        def featured(article):
            # "" (not None) marks an absent featured article in the output
            return article.to_json() if article else ""

        def extras(articles):
            return [entry.to_json() for entry in articles]

        return {
            "date": self.date,
            "headline": self.headline,
            "headline_link": self.headline_link,
            "topic": self.topic,
            "topic_link": self.topic_link,
            "tags": self.tags,
            "summary": self.summary,
            "left": featured(self.left),
            "center": featured(self.center),
            "right": featured(self.right),
            "more_left": extras(self.more_left),
            "more_center": extras(self.more_center),
            "more_right": extras(self.more_right),
        }
        

### Retrieval functions

In [5]:
def get_brief_stories_on_page(driver: webdriver.Chrome) -> List[Story]:
    """Collect the stories listed in the table of the currently loaded page.

    Args:
        driver (WebDriver): Webdriver with loaded overview page

    Returns:
        List[Story]: One story per table row, holding only date, headline
            and topic (text and link); no articles or other details yet
    """
    def row_to_story(row):
        # cells are read by position: headline link, topic link, date
        cells = row.find_elements(By.CSS_SELECTOR, "td")
        headline_anchor = cells[0].find_element(By.CSS_SELECTOR, "a")
        topic_anchor = cells[1].find_element(By.CSS_SELECTOR, "a")
        date_text = cells[2].find_element(By.CSS_SELECTOR, "span").text
        return Story(
            date=date_text,
            headline=headline_anchor.text,
            headline_link=headline_anchor.get_attribute("href"),
            topic=topic_anchor.text,
            topic_link=topic_anchor.get_attribute("href"),
        )

    # every body row of the overview table is one news story
    table_rows = driver.find_elements(By.CSS_SELECTOR, ".views-table tbody tr")
    return [row_to_story(row) for row in table_rows]
    

def get_article_information(section: WebElement) -> Article:
    """Build an Article from one news-item element.

    The title and source elements are required (a missing one raises the
    driver's lookup error). The rating image, the article image and the
    body text are optional and fall back to empty strings.

    Args:
        section (WebElement): Element containing the article markup

    Returns:
        Article: Article with `news_type` left empty; `rating` is "unknown"
            when the rating image name matches no known bias marker
    """
    title = section.find_element(By.CSS_SELECTOR, ".news-title")
    source = section.find_element(By.CSS_SELECTOR, ".news-source span")

    # Optional element: find_elements returns [] when nothing matches, so no
    # catch-all `except` is needed (a bare except would also swallow
    # KeyboardInterrupt while a long crawl is running).
    rating_images = section.find_elements(By.CSS_SELECTOR, ".source-area img")
    # get_attribute may return None; normalise to "" so basename() cannot fail
    rating_img_link = (rating_images[0].get_attribute("src") or "") if rating_images else ""

    # Marker in the image file name -> rating label, checked in this order
    rating_by_marker = {
        "bias-left": "left",
        "bias-leaning-left": "lean left",
        "bias-center": "center",
        "bias-leaning-right": "lean right",
        "bias-right": "right",
    }
    rating_file = os.path.basename(rating_img_link)
    rating = next(
        (label for marker, label in rating_by_marker.items() if marker in rating_file),
        "unknown",
    )
    if rating == "unknown":
        print(f"Rating in <{rating_img_link}> unknown")

    # Optional article image, handled the same way as the rating image
    images = section.find_elements(By.CSS_SELECTOR, ".headline-roundup-image img")
    image_link = (images[0].get_attribute("src") or "") if images else ""

    # Join all body fragments with newlines and trim surrounding whitespace
    body = section.find_elements(By.CSS_SELECTOR, ".news-body .body-contents")
    summary = "\n".join(paragraph.text for paragraph in body).strip()

    return Article(
        source=source.text,
        headline=title.text,
        link=title.get_attribute("href"),
        rating_img=rating_img_link,
        rating=rating,
        summary=summary,
        image_link=image_link,
        news_type=""
    )


def get_story_details(driver: webdriver.Chrome, stories: List[Story]):
    """Adds information from story page to each story in stories

    For every story the page at `story.headline_link` is loaded and the
    story is updated in place with its tags, summary, the featured
    left/center/right articles and the additional articles per stance.
    Calling the function again on the same stories overwrites these fields
    instead of appending to them.

    Args:
        driver (WebDriver): Webdriver used to load each story page
        stories (List[Story]): List of stories without detailed information
    """
    total = len(stories)
    wait = WebDriverWait(driver, 5)
    for i, story in enumerate(stories):
        driver.get(story.headline_link)
        print(f"{i}/{total}: {story.headline_link}")
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.view-content')))

        tags = driver.find_elements(By.CSS_SELECTOR, ".page-tags a")
        story.tags = [tag.text for tag in tags]

        paragraphs = driver.find_elements(By.CSS_SELECTOR, ".story-id-page-description p")
        # Assign instead of appending: re-running the crawl on the same
        # stories must not duplicate the summary text.
        story.summary = "\n".join(paragraph.text for paragraph in paragraphs).strip()

        article_sections = driver.find_elements(By.CSS_SELECTOR, ".featured-coverage .news-item")

        for section in article_sections:
            # get_attribute may return None when the attribute is absent
            css_classes = section.get_attribute("class") or ""
            if "left" in css_classes:
                story.left = get_article_information(section)
            if "center" in css_classes:
                story.center = get_article_information(section)
            if "right" in css_classes:
                story.right = get_article_information(section)

        more_news = {"left": [], "center": [], "right": []}
        for stance, articles in more_news.items():
            news_articles = driver.find_elements(By.CSS_SELECTOR, f".news-trio .news-item.{stance}")
            for entry in news_articles:
                article = get_article_information(entry)
                # in this listing the link is taken from the entry's first anchor
                article.link = entry.find_element(By.TAG_NAME, "a").get_attribute('href')
                # the type label is treated as optional so that one entry
                # without it does not abort the whole crawl
                type_labels = entry.find_elements(By.CSS_SELECTOR, ".news-type-label")
                article.news_type = type_labels[0].text if type_labels else ""
                articles.append(article)

        story.more_left = more_news["left"]
        story.more_center = more_news["center"]
        story.more_right = more_news["right"]


### Retrieve Article Links for Crawl

In [6]:
# Start Chrome with the options configured in the setup cell, so that enabling
# "--headless" there actually takes effect.
# NOTE(review): `serv` (the explicit chromedriver path) is still not passed
# here; add `service=serv` if that binary should be used - confirm the path
# exists first.
driver = webdriver.Chrome(options=chrome_options)

# Overview page listing the headline roundups
url = 'https://www.allsides.com/headline-roundups'
# Shared wait (up to 10 seconds) used while loading the overview pages
wait = WebDriverWait(driver, 10)

### Get brief story information and links

In [7]:
# Start from an empty list. No overview page has been loaded at this point, and
# scraping whatever page the driver currently shows would add duplicate rows
# when this cell is re-run.
stories: List[Story] = []
# PAGE_START/PAGE_END are inclusive; the requested `page` value is one lower.
for page_index in range(PAGE_START - 1, PAGE_END):
    driver.get(f"{url}?page={page_index}")
    # wait until the story links of the table are ready before reading the rows
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.views-field a')))
    stories.extend(get_brief_stories_on_page(driver))

### Fill in story information and articles

In [None]:
# Visit every story page and fill in tags, summary and articles in place
get_story_details(driver, stories)

0/600: https://www.allsides.com/story/politics-whats-trumps-big-beautiful-bill
1/600: https://www.allsides.com/story/foreign-policy-media-responses-trump-s-jet-qatar


In [None]:
# Show the crawled stories in the notebook output
display(stories)

### Saving crawled data as pkl

In [None]:
from datetime import datetime
import pickle

# Define the path to the output folder
output_folder = "crawls"

# Create the output folder if it is missing; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test
os.makedirs(output_folder, exist_ok=True)

# Get the current date and time
current_datetime = datetime.now()

# Format the date and time as a string (used as file name prefix)
formatted_datetime = current_datetime.strftime("%Y-%m-%d-%H%M")

# Save crawl as pkl file
# NOTE: the pickle stores the Story/Article objects themselves, so loading it
# needs these class definitions; only unpickle files from a trusted source.
with open(os.path.join(output_folder, f'{formatted_datetime}_allsides_crawl.pkl'), 'wb') as f:
    pickle.dump(stories, f)



### Saving crawled data as jsonl

In [None]:
import json

# Write the crawl as JSON Lines: one story object per line, empty entries skipped
jsonl_path = os.path.join(output_folder, f'{formatted_datetime}_allsides_crawl.jsonl')
with open(jsonl_path, 'w') as f:
    for story in stories:
        if story:
            f.write(json.dumps(story.to_json()))
            f.write('\n')