In [1]:
import requests
from bs4 import BeautifulSoup

Step 1:  Get HTML from url 

In [2]:
url = 'https://www.reddit.com'

# Establish connection to web page
response = requests.get(url)

response.status_code

429

In [3]:
HTML = response.text
HTML

u'\n<!doctype html>\n<html>\n  <head>\n    <title>Too Many Requests</title>\n    <style>\n      body {\n          font: small verdana, arial, helvetica, sans-serif;\n          width: 600px;\n          margin: 0 auto;\n      }\n\n      h1 {\n          height: 40px;\n          background: transparent url(//www.redditstatic.com/reddit.com.header.png) no-repeat scroll top right;\n      }\n    </style>\n  </head>\n  <body>\n    <h1>whoa there, pardner!</h1>\n    \n\n\n<p>we\'re sorry, but you appear to be a bot and we\'ve seen too many requests\nfrom you lately. we enforce a hard speed limit on requests that appear to come\nfrom bots to prevent abuse.</p>\n\n<p>if you are not a bot but are spoofing one via your browser\'s user agent\nstring: please change your user agent string to avoid seeing this message\nagain.</p>\n\n<p>please wait 8 second(s) and try again.</p>\n\n    <p>as a reminder to developers, we recommend that clients make no\n    more than <a href="http://github.com/reddit/redd

Step 2: Parse HTML with BeautifulSoup

In [4]:
soup = BeautifulSoup(HTML, 'lxml')
# the first argument is the variable and the second is the library (tool kit) used to parse it. 

In [5]:
print soup.prettify()

<!DOCTYPE html>
<html>
 <head>
  <title>
   Too Many Requests
  </title>
  <style>
   body {
          font: small verdana, arial, helvetica, sans-serif;
          width: 600px;
          margin: 0 auto;
      }

      h1 {
          height: 40px;
          background: transparent url(//www.redditstatic.com/reddit.com.header.png) no-repeat scroll top right;
      }
  </style>
 </head>
 <body>
  <h1>
   whoa there, pardner!
  </h1>
  <p>
   we're sorry, but you appear to be a bot and we've seen too many requests
from you lately. we enforce a hard speed limit on requests that appear to come
from bots to prevent abuse.
  </p>
  <p>
   if you are not a bot but are spoofing one via your browser's user agent
string: please change your user agent string to avoid seeing this message
again.
  </p>
  <p>
   please wait 8 second(s) and try again.
  </p>
  <p>
   as a reminder to developers, we recommend that clients make no
    more than
   <a href="http://github.com/reddit/reddit/wiki/API">
    

In [6]:
soup.html.title

<title>Too Many Requests</title>

In [7]:
soup.html.title.text

u'Too Many Requests'

In [8]:
soup.findAll('a')

[<a href="http://github.com/reddit/reddit/wiki/API">one\n    request every two seconds</a>]

For each item that we want to scrape, we want to find:

1. The title of the thread                               tag=<a>  attribute = 'data-event-action'=title
2. The subreddit that the thread corresponds to          tag =<a>  attribute: class='subreddit hover may-blank'
3. The length of time it has been up on Reddit           tag = <time>         attribute: class='live-timestamp'
4. The number of comments on the thread                  tag = <a>  attribute: data-event-action = 'comments'

In [9]:
results = soup.findAll('a', {'data-event-action': 'title'})

In [10]:
results

[]

In [11]:
def get_title(html):
    results = soup.findAll('a', {'data-event-action': 'title'})
    titles = []
    for item in results:
        try:
            titles.append(item.text)
        except:
            titles.append('ERROR')
    return titles    
        
        
        
get_title(soup)        

[]

In [12]:
def get_subreddit(html):
    results = soup.findAll('a', {'class': 'subreddit hover may-blank'})
    subreddits = []
    for item in results:
        try:
            subreddits.append(item.text)
        except:
            subreddits.append('ERROR')
    return subreddits    

get_subreddit(soup)

[]

In [13]:
def get_times(html):
    results = soup.findAll('time', {'class': 'live-timestamp'})
    times = []
    for item in results:
        try:
            times.append(item.text)
        except:
            times.append('ERROR')
    return times    

get_times(soup)


[]

In [14]:
def get_comments(html):
    results = soup.findAll('a', {'data-event-action': 'comments'})
    comments = []
    for item in results:
        try:
            comments.append(item.text)
        except:
            comments.append('ERROR')
    return comments    

get_comments(soup)

[]

In [15]:
from selenium import webdriver

Step 1: Use Selenium to get the HTML from multiple web pages

In [16]:
# visit our Reddit page
driver = webdriver.Chrome(executable_path="../../chromedriver")
driver.get("http://www.reddit.com")
# always good to check we've got the page we think we do
print driver.title

reddit: the front page of the internet


In [17]:
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
driver.close()

In [18]:
from time import sleep

In [19]:
print soup.prettify()

<!DOCTYPE html>
<html class=" js cssanimations csstransforms" lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <title>
   reddit: the front page of the internet
  </title>
  <meta content=" reddit, reddit.com, vote, comment, submit " name="keywords"/>
  <meta content="reddit: the front page of the internet" name="description"/>
  <meta content="always" name="referrer"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <link href="/static/opensearch.xml" rel="search" type="application/opensearchdescription+xml"/>
  <link href="https://www.reddit.com/" rel="canonical"/>
  <meta content="width=1024" name="viewport"/>
  <link href="//out.reddit.com" rel="dns-prefetch"/>
  <link href="//out.reddit.com" rel="preconnect"/>
  <link href="//www.redditstatic.com/desktop2x/img/favicon/apple-icon-57x57.png" rel="apple-touch-icon" sizes="57x57"/>
  <link href="//www.redditstatic.com/desktop2x/img/favicon/apple-icon-60x60.png" rel="apple-touch-icon" siz

In [24]:
import re
def getLastID(my_soup):
    return my_soup.find(id=re.compile('thing'))['id'][6:]

In [25]:
# Starting with starter code
url_template = "http://www.reddit.com/?count={}&after={}"
# max_results = 100 # Set this to a high-value (5000) to generate more results. 
# # Crawling more results, will also take much longer. First test your code on a small number of results and then expand.

# results = []

# for start in range(0, max_results, 25):
#     # Grab the results from the request (as above)
#     # Append to the full set of results
#     pass

In [27]:
# https://www.reddit.com/?count=75&after=t3_7s649h
# you might be able to make a for loop for these urls:
url = 'http://www.reddit.com'
for i in range (25, 100, 25):
    driver = webdriver.Chrome(executable_path="../../chromedriver")
    driver.get(url)
    print url
    sleep(1)
    html = driver.page_source
    driver.close()
    soup = BeautifulSoup(html, 'lxml')
    new_id  = getLastID(soup)
    print new_id
    url = url_template.format(i, new_id)
    

http://www.reddit.com
t3_7rd2ya


IndexError: tuple index out of range