## Import of required Modules
Make sure that the imported modules are installed.

In [None]:
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.remote.webelement import WebElement

#### set up Selenium and Chrome Driver 
We use Selenium with Chrome and tested the scraper with chromedriver. You need the latest version of the driver from https://chromedriver.chromium.org/. Alternatively, switch to a driver of your preference.
The scraper can run in the background (headless mode). The line `chrome_options.add_argument("--headless")` is currently commented out, so Chrome opens a regular window; uncomment that line if you wish to run it in the background.

In [None]:
# chromedriver setup
# Service wraps the path to the local chromedriver executable.
# NOTE: `serv` and `chrome_options` only take effect if they are passed to
# the webdriver.Chrome(...) constructor when the browser is started.
serv = Service(r'driver/chromedriver') #path from 'which chromedriver'

# Browser options for the Chrome session; nothing is added to them by default.
chrome_options = Options()

# Uncomment the next line to run Chrome in the background (headless mode).
# chrome_options.add_argument("--headless") 

### Data classes

In [None]:
from dataclasses import dataclass, field
from typing import Literal, List

@dataclass
class Article:
    """One news article featured on an AllSides story page.

    Attributes:
        source: Name of the publishing outlet.
        headline: Title of the article.
        link: URL the article title points to.
        rating: AllSides media bias rating of the outlet.
        summary: Teaser text shown for the article.
        image_link: URL of the teaser image, or "" when there is none.
    """

    source: str
    headline: str
    link: str

    # "mixed" is part of the allowed values because the scraper maps the
    # "AllSides Media Bias Rating: Mixed" badge to it.
    rating: Literal["left", "lean left", "center", "lean right", "right", "mixed"]
    summary: str
    image_link: str

@dataclass
class Story:
    """A headline-roundup story together with its featured coverage.

    The first five fields are required; the remaining ones have defaults
    (empty summary, empty tag list, no articles).
    """

    # Required fields.
    date: str
    headline: str
    headline_link: str
    topic: str
    topic_link: str

    # Optional details.
    summary: str = ""
    tags: List[str] = field(default_factory=list)  # fresh list per instance

    # Featured article per side of the spectrum; None while not set.
    left: Article | None = None
    center: Article | None = None
    right: Article | None = None


### Retrieval functions

In [None]:
def get_brief_stories_on_page(driver: webdriver.Chrome) -> List[Story]:
    """Read the story table of the page currently loaded in ``driver``.

    Each row of ``.views-table`` is turned into a ``Story`` holding only the
    date, headline (text + link) and topic (text + link); summary, tags and
    articles keep their defaults.

    Args:
        driver (WebDriver): Webdriver with a loaded overview page

    Returns:
        List[Story]: One story per table row, in page order
    """

    def build_story(table_row) -> Story:
        # Columns are read by position: 0 = headline, 1 = topic, 2 = date.
        cells = table_row.find_elements(By.CSS_SELECTOR, "td")
        headline_anchor = cells[0].find_element(By.CSS_SELECTOR, "a")
        topic_anchor = cells[1].find_element(By.CSS_SELECTOR, "a")
        published = cells[2].find_element(By.CSS_SELECTOR, "span").text
        return Story(
            date=published,
            headline=headline_anchor.text,
            headline_link=headline_anchor.get_attribute("href"),
            topic=topic_anchor.text,
            topic_link=topic_anchor.get_attribute("href"),
        )

    table_rows = driver.find_elements(By.CSS_SELECTOR, ".views-table tbody tr")
    return [build_story(table_row) for table_row in table_rows]
    

def get_article_information(section: WebElement) -> Article:
    """Extract one featured article from a news-item section of a story page.

    Args:
        section (WebElement): Element containing the article teaser
            (title, source, bias badge, optional image, body text)

    Returns:
        Article: Source, headline, link, normalised bias rating, summary and
            image link ("" when the section has no teaser image)

    Raises:
        KeyError: If the bias badge has a title that is not in ``ratings``
    """
    title = section.find_element(By.CSS_SELECTOR, ".news-title")
    source = section.find_element(By.CSS_SELECTOR, ".news-source span")
    # get_attribute returns None when the attribute is absent; normalise to ""
    # so the membership test and the dictionary lookup below cannot fail on None.
    raw_rating = (
        section.find_element(By.CSS_SELECTOR, ".source-area img").get_attribute("title")
        or ""
    )

    ratings = {
        "AllSides Media Bias Rating: Left": "left",
        "AllSides Media Bias Rating: Lean Left": "lean left",
        "AllSides Media Bias Rating: Center": "center",
        "AllSides Media Bias Rating: Lean Right": "lean right",
        "AllSides Media Bias Rating: Right": "right",
        "AllSides Media Bias Rating: Mixed": "mixed",
        "": "right", # TODO check if always correct?
    }
    # Diagnostics for the two cases that need manual review.
    if not raw_rating:
        print("No rating title: ", source.text)
    if "Mixed" in raw_rating:
        print(source.text)

    # The teaser image is optional. find_elements yields an empty list when it
    # is missing, so no exception has to be swallowed for the normal case.
    images = section.find_elements(By.CSS_SELECTOR, ".headline-roundup-image img")
    image_link = (images[0].get_attribute("src") or "") if images else ""

    body = section.find_elements(By.CSS_SELECTOR, ".news-body .body-contents")
    summary = "\n".join(paragraph.text for paragraph in body).strip()

    return Article(
        source.text,
        title.text,
        title.get_attribute("href"),
        ratings[raw_rating],
        summary,
        image_link
    )


def get_story_details(driver: webdriver.Chrome, stories: List[Story]):
    """Adds information from story page to each story in stories

    For every story the page behind ``headline_link`` is loaded and the
    story's ``tags``, ``summary`` and ``left``/``center``/``right`` articles
    are filled in place. Tags and summary are replaced (not appended), so
    calling the function again on the same list does not duplicate text.

    Args:
        driver (WebDriver): Webdriver used to load the story pages
        stories (List[Story]): List of stories without detailed information
    """
    # One waiter is enough; it is bound to the driver, not to a page.
    wait = WebDriverWait(driver, 7)

    for story in stories:
        driver.get(story.headline_link)
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.view-content')))

        tags = driver.find_elements(By.CSS_SELECTOR, ".page-tags a")
        story.tags = [tag.text for tag in tags]

        paragraphs = driver.find_elements(By.CSS_SELECTOR, ".story-id-page-description p")
        # Rebuild the summary from scratch instead of appending to the old value.
        story.summary = "\n".join(paragraph.text for paragraph in paragraphs).strip()

        article_sections = driver.find_elements(By.CSS_SELECTOR, ".featured-coverage .news-item")

        for section in article_sections:
            # Fetch the class attribute once; it is None when the attribute is absent.
            css_classes = section.get_attribute("class") or ""
            if "left" in css_classes:
                story.left = get_article_information(section)
            if "center" in css_classes:
                story.center = get_article_information(section)
            if "right" in css_classes:
                story.right = get_article_information(section)


### Retrieve Article Links for Crawl

In [None]:
# Start Chrome with the options object configured in the setup cell, so that
# the headless switch defined there actually applies to this session.
# NOTE: add `service=serv` to use the local chromedriver binary configured in
# the setup cell instead of Selenium's automatic driver lookup.
driver = webdriver.Chrome(options=chrome_options)

# Overview page that lists all headline roundups (paginated via ?page=N).
url = 'https://www.allsides.com/headline-roundups'
# Shared waiter for the overview pages (10 second timeout).
wait = WebDriverWait(driver, 10)

### Get brief story information and links

In [None]:
NUM_PAGES = 3 # TODO adjust for usecase

# Load the first overview page explicitly. Without this the browser is still
# on whatever page it showed last (a blank page right after start-up), so the
# most recent stories would be missing from the crawl.
driver.get(url)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.views-field a')))
stories = get_brief_stories_on_page(driver)

# Follow-up pages 1..NUM_PAGES of the paginated overview.
for i in range(1, NUM_PAGES + 1):
    driver.get(f"{url}?page={i}")
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.views-field a')))
    stories.extend(get_brief_stories_on_page(driver))

# display(stories)

### Fill in story information and articles

In [None]:
# Visit each story page and fill in tags, summary and articles in place.
get_story_details(driver=driver, stories=stories)
# display(stories)

### Saving crawl

In [None]:
import pickle

# Serialise the crawled stories to a pickle file in the working directory.
# NOTE: loading the file later requires the Story and Article classes to be
# defined under the same module path as when the file was written.
with open('allsides_crawl.pkl', 'wb') as crawl_file:
    pickle.dump(stories, crawl_file)