# Deadspin scraper

By [Ben Welsh](https://palewi.re/who-is-ben-welsh/)

This notebook scrapes metadata tracking pageviews of posts on Deadspin. Its results were used by Kim Janssen to write the Los Angeles Times story ["I checked the math of the media bosses who told Deadspin to ‘stick to sports.’ It doesn’t add up."](https://www.latimes.com/entertainment-arts/business/story/2019-11-01/deadspin-stick-to-sports-bad-math)

Import Python tools

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

Walk back from the Deadspin homepage through its archives, downloading every link.

In [None]:
# Master list that collects every href found during the crawl
link_list = list()

In [None]:
def get_links(url):
    """Crawl Deadspin's archive pages, adding every link found to ``link_list``.

    Starting at ``url``, each page is downloaded and the ``href`` of every
    anchor that has one is appended to the module-level ``link_list``. The
    last anchor on the page is treated as the "next page" link and followed.

    The crawl stops when a page has no anchors, when its last anchor has no
    ``href``, when the next address cannot be parsed or is not on
    deadspin.com, or when the next page has already been requested.

    Args:
        url: Address of the first page to request.
    """
    # Imported locally so this cell does not rely on an import made elsewhere
    from urllib.parse import urljoin, urlparse

    # Loop instead of recursing: one call frame per archive page would
    # eventually hit Python's recursion limit.
    seen = set()
    while url not in seen:
        seen.add(url)

        # Get the page's HTML
        print(f"Requesting {url}")
        r = requests.get(url)
        html = r.content

        # Scrape out all the links. Naming the parser keeps the result from
        # depending on which optional parsers happen to be installed.
        soup = BeautifulSoup(html, "html.parser")
        links = soup.find_all("a")

        # Add them to our master list
        print(f"Logging {len(links)} links")
        link_list.extend([l['href'] for l in links if l.get("href", None)])

        # Grab the next page link, stopping if there isn't one
        if not links:
            break
        next_href = links[-1].get("href")
        if not next_href:
            break

        # Resolve it against the homepage so relative, root-relative and
        # absolute hrefs all produce a valid address
        try:
            url = urljoin("https://www.deadspin.com/", next_href)
            host = urlparse(url).hostname or ""
        except ValueError:
            break

        # Never follow the trail off the site
        if host != "deadspin.com" and not host.endswith(".deadspin.com"):
            break

In [None]:
# Kick off the crawl at the homepage
get_links(url="https://www.deadspin.com/")

Deduplicate the link list

In [None]:
# Unpacking into a set literal drops repeated hrefs
link_set = {*link_list}

Filter down to only deadspin.com links

In [None]:
# Keep only the hrefs that mention deadspin.com
deadspin_links = list(filter(lambda href: 'deadspin.com' in href, link_set))

Cut out any tags and links that don't lead to posts.

In [None]:
# Drop every href that contains a /c/ or /tag/ path segment
not_tags = [
    href
    for href in deadspin_links
    if not ('/c/' in href or '/tag/' in href)
]

Write it out as a CSV

In [None]:
# Name the column up front so the frame has a "url" column even when
# not_tags is empty
df = pd.DataFrame(not_tags, columns=["url"])

In [None]:
# Label the single column of links
df.columns = pd.Index(["url"])

In [None]:
# to_csv will not create a missing folder, so make sure data/ exists first
import os
os.makedirs('data', exist_ok=True)
df.to_csv('data/links.csv', index=False)

Walk through the link list and scrape every URL

In [None]:
# Scraped post records, keyed by URL
cache = dict()

In [None]:
def parse_story(url):
    """Scrape one post and store its metadata in the module-level ``cache``.

    The page at ``url`` is downloaded and parsed for its headline, two
    visitor figures and publication date. On success the record is saved as
    ``cache[url]``. A URL already in ``cache`` is skipped. A failed request,
    or a page that lacks the expected markup, is reported with ``print`` and
    skipped.

    Args:
        url: Address of the post to scrape.

    Returns:
        None. Results are collected in ``cache``.
    """
    # Skip it if already scraped
    if url in cache:
        print(f"Already got {url}")
        return

    # Grab the page. This is a best-effort scrape, so any request error just
    # skips the URL.
    print(f"Scraping {url}")
    try:
        r = requests.get(url)
    except Exception:
        print(f"Failed to requesst {url}")
        return

    # Pull out the HTML, naming the parser so the result does not depend on
    # which optional parsers are installed
    html = r.content
    soup = BeautifulSoup(html, "html.parser")

    # Scrape out the data we want
    try:
        # Both visitor figures are read from this one div: the first word of
        # its title attribute and the text of its second span
        counter = soup.find("div", {"class": "sc-15g8630-0"})
        d = {
            'url': url,
            'headline': soup.find_all("h1")[0].a.text,
            'new_vistors': counter['title'].split()[0],
            'visitors': counter.find_all("span")[1].text,
            'pubdate': soup.find("a", {"class": "js_meta-time"}).text,
        }
    except Exception:
        # Missing markup surfaces as AttributeError, IndexError, KeyError or
        # TypeError. Catching Exception rather than everything lets
        # KeyboardInterrupt stop a long run.
        print(f"Failed to scrape {url}")
        return

    # Add it to the cache
    cache[url] = d

In [None]:
# Scrape every post; parse_story records its findings in cache
result = list(map(parse_story, not_tags))

Write out the scraped post data

In [None]:
# One row per scraped post
scrape_df = pd.DataFrame(list(cache.values()))

In [None]:
# Preview the first five rows
scrape_df.head(n=5)

In [None]:
# to_csv will not create a missing folder, so make sure data/ exists first
import os
os.makedirs('data', exist_ok=True)
scrape_df.to_csv('data/posts.csv', index=False)