# AllSides sources & bias crawler

Fetch the list of news sources rated by AllSides and save it together with each source's bias label (left, right, or anywhere in between).

A CSV file will be created with the following columns:

- Source
- Label
- Agree
- Disagree
- Publisher URL
- Publisher site

In [1]:
!ipython -m pip install aiohttp bs4 requests



In [2]:
import asyncio
import csv
import logging
import re
import urllib.parse as urlparse

import aiohttp

import bs4
import requests

In [3]:
# Listing URL template; `{}` receives the page number (crawling starts at
# page 0). It also serves as the base against which relative source links are
# resolved in `get_soup`.
url_tpl = "https://www.allsides.com/media-bias/media-bias-ratings?field_featured_bias_rating_value=All&field_news_source_type_tid%5B1%5D=1&field_news_source_type_tid%5B2%5D=2&field_news_source_type_tid%5B3%5D=3&field_news_bias_nid_1%5B1%5D=1&field_news_bias_nid_1%5B2%5D=2&field_news_bias_nid_1%5B3%5D=3&title=&customFilter=1&page={}"
# Parser name passed to BeautifulSoup; requires the `html5lib` package.
html_parser = "html5lib"
# Column names written as the first row of the CSV dump.
csv_header = [
    "source",
    "label",
    "agree",
    "disagree",
    "publisher",
    "site",
]
# Output CSV location and the text encoding used when writing it.
dump_path = "media-bias.csv"
encoding = "utf-8"
# When True, sources for which no publisher URL could be found are left out of
# the CSV; when False they are written with empty publisher/site columns.
skip_blocked_sites = True

verbose = True  # set to True to see debugging messages
level = logging.DEBUG if verbose else logging.INFO
# basicConfig() is a no-op while the root logger still has handlers, so drop
# any left over from a previous run of this cell before configuring again.
logging.root.handlers.clear()
logging.basicConfig(
    format="%(levelname)s - %(name)s - %(asctime)s - %(message)s",
    level=level
)

# Maps a publisher host (netloc) to the host that should be stored instead;
# applied by `_adapt_site`.
# NOTE(review): presumably these are the canonical/redirect targets of the
# listed hosts — confirm before extending the table.
site_adapter = {
    "www.huffingtonpost.com": "www.huffpost.com",
    "www.cnn.com": "edition.cnn.com",
    "online.wsj.com": "wsj.com",
    "www.nationalreview.com": "nationalreview.com",
}


async def get_soup(session, url):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        session: Open ``aiohttp.ClientSession`` used to issue the GET request.
        url: Absolute URL, or a URL relative to the AllSides listing URL
            (``url_tpl``), against which it is resolved.

    Returns:
        ``bs4.BeautifulSoup`` built from the response body using
        ``html_parser``.

    The HTTP status is not checked, so an error page is parsed like any other
    page; callers are expected to cope with missing elements.
    """
    abs_url = urlparse.urljoin(url_tpl, url)
    # Using the response as a context manager guarantees the connection is
    # released even if reading or decoding the body raises.
    async with session.get(abs_url) as resp:
        text = await resp.text()
    return bs4.BeautifulSoup(text, html_parser)


def _adapt_site(url, netloc):
    """Swap a publisher's host for its configured replacement, if any.

    Args:
        url: Full publisher URL.
        netloc: Network-location part of ``url`` (as produced by ``urlparse``).

    Returns:
        A ``(url, netloc)`` tuple. Both are rewritten when ``netloc`` is a key
        of ``site_adapter``; otherwise they are returned unchanged.
    """
    site = site_adapter.get(netloc)
    if site:
        # Only the first occurrence is the host itself; later occurrences
        # (e.g. inside the path or a query parameter) must stay untouched.
        url = url.replace(netloc, site, 1)
        netloc = site
    return url, netloc


async def get_publisher_url(session, src_url, source_name):
    """Look up the publisher's website on a source's AllSides page.

    Args:
        session: Open ``aiohttp.ClientSession`` used for the request.
        src_url: URL (absolute or relative to the listing) of the source page.
        source_name: Human-readable source name, used for logging only.

    Returns:
        A ``(url, netloc)`` tuple as produced by ``_adapt_site``, or ``None``
        when the page has no ``dynamic-grid`` block, that block holds no link
        with an ``href``, or the link carries no network location.
    """
    logging.debug("Getting publisher's URL for %r.", source_name)
    soup = await get_soup(session, src_url)
    div = soup.find("div", class_="dynamic-grid")
    if not div:
        return None

    # The grid may exist without an anchor, and an anchor without an href;
    # treat both like a missing publisher instead of raising AttributeError.
    link = div.find("a")
    href = link.get("href") if link else None
    if not href:
        return None

    url = href.strip()
    parsed = urlparse.urlparse(url)
    if not parsed.netloc:
        # Relative or otherwise host-less link: nothing usable as a site.
        return None

    return _adapt_site(url, parsed.netloc)


def _parse_source_row(row):
    """Extract ``(src_url, [source_name, label, agree, disagree])`` from one table row."""
    src_a = row.find("td", class_="source-title").find("a")
    label_alt = row.find("td", class_="views-field-field-bias-image").find("img").get("alt")
    feedback = row.find("td", class_="community-feedback")
    agree = int(feedback.find("span", class_="agree").text)
    disagree = int(feedback.find("span", class_="disagree").text)
    # Keep only the text after the last colon of the image's alt attribute.
    label = label_alt.split(":")[-1].strip()
    return src_a.get("href"), [src_a.text, label, agree, disagree]


async def save_pages(bias_writer, csvfile):
    """Crawl every listing page and write one CSV row per rated source.

    Pages are requested in increasing order starting at 0 until a page has no
    table, no rows, or a first row containing "no record". For each page the
    publisher lookups of all rows run concurrently.

    Args:
        bias_writer: ``csv.writer``-like object; receives rows of
            ``[source, label, agree, disagree, publisher_url, site]``.
        csvfile: The underlying file object, flushed after every page so that
            partial results survive an interrupted crawl.
    """
    async with aiohttp.ClientSession() as session:
        page = 0  # custom page if you want
        while True:
            logging.info("Crawling page %d...", page)
            soup = await get_soup(session, url_tpl.format(page))

            # Tolerate a missing table, a table without <tbody>, and an empty
            # body alike: all of them mean there is nothing left to crawl.
            table = soup.find("table")
            tbody = table.find("tbody") if table else None
            rows = tbody.find_all("tr") if tbody else []
            if not rows or "no record" in rows[0].text.lower():
                logging.info("Reached empty table -> end of results/pages.")
                break

            extras = []
            pub_coros = []
            for row in rows:
                src_url, extra = _parse_source_row(row)
                extras.append(extra)
                pub_coros.append(get_publisher_url(session, src_url, extra[0]))

            # gather() preserves argument order, so results line up with extras.
            publisher_details_list = await asyncio.gather(*pub_coros)
            for extra, publisher_details in zip(extras, publisher_details_list):
                if not publisher_details:
                    if skip_blocked_sites:
                        continue
                    publisher_details = ("", "")
                bias_writer.writerow(extra + list(publisher_details))

            page += 1
            csvfile.flush()


async def main():
    """Create the CSV dump, emit the header row, then crawl all pages into it."""
    with open(dump_path, mode="w", encoding=encoding, newline="") as out_file:
        writer = csv.writer(out_file)
        writer.writerow(csv_header)
        await save_pages(writer, out_file)
        
        
# Top-level `await` relies on IPython/Jupyter running the cell inside an event
# loop; in a plain Python script this would be `asyncio.run(main())`.
await main()

INFO - root - 2020-04-20 12:57:39,979 - Crawling page 0...
DEBUG - root - 2020-04-20 12:57:42,810 - Getting publisher's URL for 'AARP'.
DEBUG - root - 2020-04-20 12:57:42,811 - Getting publisher's URL for 'ABC News'.
DEBUG - root - 2020-04-20 12:57:42,816 - Getting publisher's URL for 'Above The Law'.
DEBUG - root - 2020-04-20 12:57:42,819 - Getting publisher's URL for 'Abridge News'.
DEBUG - root - 2020-04-20 12:57:42,822 - Getting publisher's URL for 'Accuracy in Media '.
DEBUG - root - 2020-04-20 12:57:42,824 - Getting publisher's URL for 'ACLU'.
DEBUG - root - 2020-04-20 12:57:42,865 - Getting publisher's URL for 'AJ+'.
DEBUG - root - 2020-04-20 12:57:42,870 - Getting publisher's URL for 'Al Cardenas'.
DEBUG - root - 2020-04-20 12:57:42,871 - Getting publisher's URL for 'Al Jazeera'.
DEBUG - root - 2020-04-20 12:57:42,874 - Getting publisher's URL for 'AllSides'.
DEBUG - root - 2020-04-20 12:57:42,875 - Getting publisher's URL for 'AllSides Community'.
DEBUG - root - 2020-04-20 12:

INFO - root - 2020-04-20 12:57:58,224 - Crawling page 3...
DEBUG - root - 2020-04-20 12:58:00,511 - Getting publisher's URL for 'Chicago Sun-Times'.
DEBUG - root - 2020-04-20 12:58:00,513 - Getting publisher's URL for 'Chicago Tribune'.
DEBUG - root - 2020-04-20 12:58:00,514 - Getting publisher's URL for 'Children’s Defense Fund'.
DEBUG - root - 2020-04-20 12:58:00,517 - Getting publisher's URL for 'Chip Bok (cartoonist)'.
DEBUG - root - 2020-04-20 12:58:00,519 - Getting publisher's URL for 'Chris Britt (cartoonist)'.
DEBUG - root - 2020-04-20 12:58:00,523 - Getting publisher's URL for 'Chris Ruddy'.
DEBUG - root - 2020-04-20 12:58:00,528 - Getting publisher's URL for 'Christian Science Monitor'.
DEBUG - root - 2020-04-20 12:58:00,534 - Getting publisher's URL for 'Christiane Amanpour'.
DEBUG - root - 2020-04-20 12:58:00,537 - Getting publisher's URL for 'Christianity Today'.
DEBUG - root - 2020-04-20 12:58:00,541 - Getting publisher's URL for 'Christopher Buskirk'.
DEBUG - root - 2020

DEBUG - root - 2020-04-20 12:58:20,004 - Getting publisher's URL for 'Federation of American Scientists'.
DEBUG - root - 2020-04-20 12:58:20,008 - Getting publisher's URL for 'Financial Times'.
DEBUG - root - 2020-04-20 12:58:20,012 - Getting publisher's URL for 'Fiscal Times'.
DEBUG - root - 2020-04-20 12:58:20,013 - Getting publisher's URL for 'FiveThirtyEight'.
DEBUG - root - 2020-04-20 12:58:20,016 - Getting publisher's URL for 'Food Democracy Now'.
DEBUG - root - 2020-04-20 12:58:20,019 - Getting publisher's URL for 'Forbes'.
DEBUG - root - 2020-04-20 12:58:20,022 - Getting publisher's URL for 'Foreign Affairs'.
DEBUG - root - 2020-04-20 12:58:20,024 - Getting publisher's URL for 'Foreign Policy'.
DEBUG - root - 2020-04-20 12:58:20,025 - Getting publisher's URL for 'Fox News Latino'.
DEBUG - root - 2020-04-20 12:58:20,027 - Getting publisher's URL for 'Fox News Opinion'.
DEBUG - root - 2020-04-20 12:58:20,029 - Getting publisher's URL for 'Fox Online News'.
DEBUG - root - 2020-04-

DEBUG - root - 2020-04-20 12:58:35,833 - Getting publisher's URL for 'John K. Herr'.
DEBUG - root - 2020-04-20 12:58:35,835 - Getting publisher's URL for 'John Pudner'.
DEBUG - root - 2020-04-20 12:58:35,837 - Getting publisher's URL for 'John Stossel'.
DEBUG - root - 2020-04-20 12:58:35,840 - Getting publisher's URL for 'Jon Terbush'.
DEBUG - root - 2020-04-20 12:58:35,842 - Getting publisher's URL for 'Jonah Goldberg'.
DEBUG - root - 2020-04-20 12:58:35,845 - Getting publisher's URL for 'Jonathan Chait'.
DEBUG - root - 2020-04-20 12:58:35,848 - Getting publisher's URL for 'Jonathan Haidt'.
DEBUG - root - 2020-04-20 12:58:35,850 - Getting publisher's URL for 'Jonathan Miller'.
DEBUG - root - 2020-04-20 12:58:35,852 - Getting publisher's URL for "Journalist's Resource".
DEBUG - root - 2020-04-20 12:58:35,854 - Getting publisher's URL for 'Juan Williams'.
DEBUG - root - 2020-04-20 12:58:35,856 - Getting publisher's URL for 'Jubilee Media'.
DEBUG - root - 2020-04-20 12:58:35,858 - Gettin

DEBUG - root - 2020-04-20 12:58:51,205 - Getting publisher's URL for 'NBC Today Show'.
DEBUG - root - 2020-04-20 12:58:51,208 - Getting publisher's URL for 'Neal K. Katyal'.
DEBUG - root - 2020-04-20 12:58:51,210 - Getting publisher's URL for 'Neil J. Young'.
DEBUG - root - 2020-04-20 12:58:51,213 - Getting publisher's URL for 'New Economy Working Group'.
DEBUG - root - 2020-04-20 12:58:51,218 - Getting publisher's URL for 'New Hampshire Union Leader'.
DEBUG - root - 2020-04-20 12:58:51,219 - Getting publisher's URL for 'New Republic'.
DEBUG - root - 2020-04-20 12:58:51,222 - Getting publisher's URL for 'New York Daily News'.
DEBUG - root - 2020-04-20 12:58:51,224 - Getting publisher's URL for 'New York Magazine'.
DEBUG - root - 2020-04-20 12:58:51,226 - Getting publisher's URL for 'New York Post'.
DEBUG - root - 2020-04-20 12:58:51,229 - Getting publisher's URL for 'New York Times - News'.
DEBUG - root - 2020-04-20 12:58:51,234 - Getting publisher's URL for 'New York Times - Opinion'.

DEBUG - root - 2020-04-20 12:59:07,860 - Getting publisher's URL for 'Rob Rogers (cartoonist)'.
DEBUG - root - 2020-04-20 12:59:07,862 - Getting publisher's URL for 'Robert Ariail (cartoonist)'.
DEBUG - root - 2020-04-20 12:59:07,867 - Getting publisher's URL for 'Robert Samuelson'.
DEBUG - root - 2020-04-20 12:59:07,870 - Getting publisher's URL for 'Rod Blagojevich'.
DEBUG - root - 2020-04-20 12:59:07,873 - Getting publisher's URL for 'Roll Call'.
DEBUG - root - 2020-04-20 12:59:07,876 - Getting publisher's URL for 'RollingStone.com'.
DEBUG - root - 2020-04-20 12:59:07,880 - Getting publisher's URL for 'Ross Douthat'.
DEBUG - root - 2020-04-20 12:59:07,882 - Getting publisher's URL for 'Ruth Marcus'.
DEBUG - root - 2020-04-20 12:59:07,884 - Getting publisher's URL for 'Ryan Cooper'.
DEBUG - root - 2020-04-20 12:59:07,885 - Getting publisher's URL for 'Ryan Cooper'.
DEBUG - root - 2020-04-20 12:59:07,887 - Getting publisher's URL for 'S.E. Cupp'.
DEBUG - root - 2020-04-20 12:59:07,889

DEBUG - root - 2020-04-20 12:59:23,433 - Getting publisher's URL for 'The Observer (New York)'.
DEBUG - root - 2020-04-20 12:59:23,435 - Getting publisher's URL for 'The Oracle'.
DEBUG - root - 2020-04-20 12:59:23,439 - Getting publisher's URL for 'The Philadelphia Inquirer'.
DEBUG - root - 2020-04-20 12:59:23,441 - Getting publisher's URL for 'The Plebeian'.
DEBUG - root - 2020-04-20 12:59:23,444 - Getting publisher's URL for 'The Post Millennial'.
DEBUG - root - 2020-04-20 12:59:23,446 - Getting publisher's URL for 'The Red and Black'.
DEBUG - root - 2020-04-20 12:59:23,448 - Getting publisher's URL for 'The Reliable Bias'.
DEBUG - root - 2020-04-20 12:59:23,450 - Getting publisher's URL for 'The Republican'.
DEBUG - root - 2020-04-20 12:59:23,452 - Getting publisher's URL for 'The Resurgent'.
DEBUG - root - 2020-04-20 12:59:23,456 - Getting publisher's URL for 'The Root'.
DEBUG - root - 2020-04-20 12:59:23,458 - Getting publisher's URL for 'The Sacramento Bee'.
DEBUG - root - 2020-0

Some publishers are blocked (AllSides lists no website for them), so they are skipped and the CSV file contains fewer rows than the number of sources crawled.

Now let's find a good way of associating a single side (bias label) with a website when several rated sources share the same site.

In [4]:
# Group the crawled rows by publisher site, so that sites shared by several
# AllSides sources can be listed together with their (possibly different)
# labels.
side_dict = {}

# Read with the same encoding the dump was written with; the platform default
# is not necessarily UTF-8 and would break on non-ASCII source names.
with open(dump_path, newline="", encoding=encoding) as stream:
    reader = csv.reader(stream)
    print(next(reader))  # header row

    for row in reader:
        # Columns: source, label, agree, disagree, publisher, site.
        side_dict.setdefault(row[5], []).append((row[0], row[1], row[2]))

# Show only the ambiguous sites, i.e. those with more than one rated source.
for site, sides in side_dict.items():
    if len(sides) > 1:
        print(site, sides)

['source', 'label', 'agree', 'disagree', 'publisher', 'site']
edition.cnn.com [('CNN (Web News)', 'Lean Left', '31638'), ('CNN - Editorial', 'Left', '13083')]
www.thedailybeast.com [('Daily Beast', 'Left', '10254'), ('Newsweek', 'Lean Left', '2307')]
www.foxnews.com [('Fox News Opinion', 'Right', '10949'), ('Fox Online News', 'Lean Right', '26531')]
www.courier-journal.com [('Louisville Courier-Journal', 'Lean Left', '207'), ('The Courier-Journal', 'Lean Left', '145')]
www.nytimes.com [('New York Times - News', 'Lean Left', '18539'), ('New York Times - Opinion', 'Left', '5012')]
www.newsmax.com [('Newsmax - News', 'Lean Right', '7095'), ('Newsmax - Opinion', 'Right', '74')]
www.npr.org [('NPR Editorial ', 'Lean Left', '3029'), ('NPR Online News', 'Center', '21728')]
theweek.com [('The Week - News', 'Center', '3256'), ('The Week - Opinion', 'Lean Left', '49')]
wsj.com [('Wall Street Journal - Editorial', 'Lean Right', '6216'), ('Wall Street Journal - News', 'Center', '14288')]
