# Python 101 @ SzISz VI.

---

## Previously on Python 101: Exceptions

In [1]:
# ZeroDivisionError: raised when the right-hand operand of a division is zero.
try:
    print 3/0
except ZeroDivisionError:
    print 'No one can divide with 0.'

No one can divide with 0.


In [2]:
# TypeError (not ValueError): int / str is an unsupported operand combination,
# so the TypeError handler below is the one that runs.
try:
    print 3/'2'
except TypeError:
    print 'Can\'t divide a number with a character!'

Can't divide a number with a character!


In [3]:
# ValueError: the argument has an acceptable type (str), but its value
# cannot be converted to an integer.
try:
    print int('string')
except ValueError:
    print 'This string is not a number!'

This string is not a number!


In [4]:
# NameError: the name `spam` is used without ever having been defined.
try:
    print spam
except NameError:
    print 'There is no such thing as \'spam\'!'

There is no such thing as 'spam'!


In [5]:
# IndexError: valid indexes run from 0 to len(mylist) - 1, so len(mylist)
# itself points one past the end of the list.
try:
    mylist = [1, 2, 3]
    print mylist[len(mylist)]
except IndexError:
    print 'Index is larger then the length of the list!'

Index is larger then the length of the list!


In [6]:
# KeyError: 'c' is not one of the keys stored in mydict.
try:
    mydict = {'a': 1, 'b': 2}
    print mydict['c']
except KeyError:
    print 'Key not exists!'

Key not exists!


In [7]:
# IOError: opening a file for reading fails when the path does not exist.
# open() raises before readlines() is reached (see the output below).
try:
    not_existing_filename = 'a_file_that_is_not_exists.txt'
    myfile = open(not_existing_filename, 'r')
    myfile.readlines()
except IOError:
    print 'The specified file does not exist!'

The specified file does not exist!


In [8]:
# try-except-else-finally
#   - except:  runs when the try body raises (a bare except catches everything)
#   - else:    runs only when the try body raised nothing
#   - finally: runs in both cases
# The trailing commas keep the successive prints on one output line.
try:
    print 'Hello', # swap 'Hello' for 3/0 to take the except branch instead
except:
    print 'Print failed',
else:
    print 'World',
finally:
    print '!'

Hello World !


---

## Today on Python 101: Web scraping

### 1. Obtain a webpage

In [10]:
# import a 3rd party library called requests
import requests

In [14]:
# Request a page that is expected to exist and look at the HTTP status code.
existing_url = 'http://146.110.53.143:8000/test.html'
response = requests.get(existing_url)
print response.status_code # hopefully 200 -> successful download

200


In [15]:
# Request a page that is not there. requests.get() does not raise for a 404
# (the code below simply prints it), so the status code must be checked.
not_existing_url = 'http://146.110.53.143:8000/test1.html'
response = requests.get(not_existing_url)
print response.status_code # unfortunately 404 -> the page does not exist
# Other status codes you may run into:
# - 301 (moved permanently)
# - 303 (see other, i.e. a redirect)
# - 400 (bad request)
# - 401 (unauthorized)

404


In [16]:
# `response` was last assigned from the missing page, so fetch the existing
# page again and print the raw body of the reply.
response = requests.get(existing_url)
print response.content

<!DOCTYPE html>
<html>
    <head>
        <title>Test page for web crawling</title>
    </head>
    <body>
        <section id="main_content">
            <h1>Title of the page</h1>
            <h2>Subtitle of the page</h2>
            <div id="first">
                <p class="important">Important text 1</p>
                <p>Not important text 1</p>
                <p class="important">Important text 2</p>
                <p>Not important text 2</p>
                <p class="important">Important text 3</p>
                <p>Not important text 3</p>
                <p class="important">Important text 4</p>
                <p>Not important text 4</p>
                <p class="important">Important text 5</p>
                <p>Not important text 5</p>
                <p class="important">Important text 6</p>
                <p>Not important text 6</p>
                <p class="important">Important text 7</p>
                <p>Not important text 7</p>
            </div>
            <h

In [17]:
from IPython.display import HTML
# Render page if successfully downloaded.
# `result` ends up as either an IPython HTML display object wrapping the
# downloaded markup, or a plain fallback string.
if response.status_code == 200:
    result = HTML(response.content)
else:
    result = 'Nah, let\'s have a beer instead!'

In [18]:
# A bare expression on the last line of a cell is shown as the cell's output.
result

### 2. Process HTML

#### Story time: The skeleton of an HTML document

<b>HTML</b> is a markup language; its basic building blocks are the <code>&lt;tag></code>s.<br>
(Almost) every <code>&lt;tag></code> has two parts:

    - Opening <tag>
    - Closing </tag>

Important html <code>&lt;tag></code>s:

    - <html></html>
    - <head></head>
    - <body></body>
    - <div></div>
    - <p></p>
    - <span></span>
    - <section></section>
    - <a href=""></a>
    - <img src="">
    - <br>
    - <table>
        <thead>
            <tr>
                <th></th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td></td>
                ...
            </tr>
        </tbody>
     </table>
    - <ul></ul> / <ol></ol> + <li></li>
    
Tags can have different attributes:

    - <a>: href
    - <img>: src
    - id
    - class
    - anything that is not a html keyword
    

#### Let's parse it!

Required module: `BeautifulSoup4`

In [9]:
# test if it is working
from bs4 import BeautifulSoup

In [20]:
# create a soup: parse the downloaded markup into a searchable tree
# NOTE(review): no parser is named here, so bs4 chooses one itself; the choice
# may depend on what is installed - consider passing one explicitly.
soup = BeautifulSoup(response.content)

In [21]:
# print the parsed document re-indented, one tag / text node per line
print soup.prettify()

<!DOCTYPE html>
<html>
 <head>
  <title>
   Test page for web crawling
  </title>
 </head>
 <body>
  <section id="main_content">
   <h1>
    Title of the page
   </h1>
   <h2>
    Subtitle of the page
   </h2>
   <div id="first">
    <p class="important">
     Important text 1
    </p>
    <p>
     Not important text 1
    </p>
    <p class="important">
     Important text 2
    </p>
    <p>
     Not important text 2
    </p>
    <p class="important">
     Important text 3
    </p>
    <p>
     Not important text 3
    </p>
    <p class="important">
     Important text 4
    </p>
    <p>
     Not important text 4
    </p>
    <p class="important">
     Important text 5
    </p>
    <p>
     Not important text 5
    </p>
    <p class="important">
     Important text 6
    </p>
    <p>
     Not important text 6
    </p>
    <p class="important">
     Important text 7
    </p>
    <p>
     Not important text 7
    </p>
   </div>
   <h3>
    Sub-subtitle of the page
   </h3>
   <div id="li

In [22]:
# get the <title> tag of the document (the tag itself, markup included)
print soup.title

<title>Test page for web crawling</title>


In [23]:
# get only the text inside the <title> tag, without the markup
print soup.title.getText()

Test page for web crawling


In [24]:
# get the text-only version of the page: every tag is dropped, while the
# whitespace between the tags is kept (hence the blank lines in the output)
print soup.getText()



Test page for web crawling



Title of the page
Subtitle of the page

Important text 1
Not important text 1
Important text 2
Not important text 2
Important text 3
Not important text 3
Important text 4
Not important text 4
Important text 5
Not important text 5
Important text 6
Not important text 6
Important text 7
Not important text 7

Sub-subtitle of the page

Important link 1
Not important link 1
Important link 2
Not important link 2
Important link 3
Not important link 3
Important link 4
Not important link 4
Important link 5
Not important link 5
Important link 6
Not important link 6
Important link 7
Not important link 7
Important link 8
Not important link 8

Sub-sub-subtitle of the page




















Fake true elements
It's not important.
Fake important link






In [25]:
# get all the links: every <a> tag of the document, in document order
soup.findAll('a')

[<a href="important_part.something.com">Important link 1</a>,
 <a href="unimportant_link.something.com">Not important link 1</a>,
 <a href="important_part.something_else.com">Important link 2</a>,
 <a href="unimportant_link.something_else.com">Not important link 2</a>,
 <a href="important_part.not_something.com">Important link 3</a>,
 <a href="unimportant_link.not_something.com">Not important link 3</a>,
 <a href="important_part.but_it_is.something.com">Important link 4</a>,
 <a href="unimportant_link.but_it_is.something.com">Not important link 4</a>,
 <a href="important_part.but_it_isnt.something.com">Important link 5</a>,
 <a href="unimportant_link.but_it_isnt.something.com">Not important link 5</a>,
 <a href="important_part.whatif.something.com">Important link 6</a>,
 <a href="unimportant_link.whatif.something.com">Not important link 6</a>,
 <a href="important_part.notthatkindof.something.com">Important link 7</a>,
 <a href="unimportant_link.notthatkindof.something.com">Not importan

In [26]:
# get the actual urls: read the href attribute of each <a> tag
# (.get() returns None instead of raising when the attribute is absent)
for url in soup.findAll('a'):
    print url.get('href')

important_part.something.com
unimportant_link.something.com
important_part.something_else.com
unimportant_link.something_else.com
important_part.not_something.com
unimportant_link.not_something.com
important_part.but_it_is.something.com
unimportant_link.but_it_is.something.com
important_part.but_it_isnt.something.com
unimportant_link.but_it_isnt.something.com
important_part.whatif.something.com
unimportant_link.whatif.something.com
important_part.notthatkindof.something.com
unimportant_link.notthatkindof.something.com
important_part.else.com
unimportant_link.else.com
important_part.something.com


In [27]:
# store the important links
important_urls = []
for url in soup.findAll('a'):
    if 'important_part' in url.get('href'):
        important_urls.append(url.get('href'))
print important_urls

['important_part.something.com', 'important_part.something_else.com', 'important_part.not_something.com', 'important_part.but_it_is.something.com', 'important_part.but_it_isnt.something.com', 'important_part.whatif.something.com', 'important_part.notthatkindof.something.com', 'important_part.else.com', 'important_part.something.com']


In [28]:
# select every paragraph which has "important" class, anywhere in the document
soup.findAll('p', {'class': 'important'})

[<p class="important">Important text 1</p>,
 <p class="important">Important text 2</p>,
 <p class="important">Important text 3</p>,
 <p class="important">Important text 4</p>,
 <p class="important">Important text 5</p>,
 <p class="important">Important text 6</p>,
 <p class="important">Important text 7</p>,
 <p class="important">It's not important.</p>]

In [31]:
# Whooops, the last match does not belong to the others! Let's investigate!
important_paragraphs = soup.findAll('p', {'class': 'important'})
# print each paragraph's text next to the id of the tag that directly
# contains it, to see which part of the page every match comes from
for p in important_paragraphs:
    print p.getText(), p.parent.get('id')

Important text 1 first
Important text 2 first
Important text 3 first
Important text 4 first
Important text 5 first
Important text 6 first
Important text 7 first
It's not important. not_main_section


In [36]:
# We can see that the "fake" result comes from a different section;
# look that section up by its id and print it
print soup.find(id='not_main_section')

<section id="not_main_section" style="display: none;">
<h3>Fake true elements</h3>
<p class="important">It's not important.</p>
<a href="important_part.something.com">Fake important link</a>
<img class="nice" my_attribute="10" src="http://lorempixel.com/400/200/"/><br/>
</section>


In [37]:
# We have a hidden fake section! Let's modify our search:
# first narrow down to the main_content section, then search only inside it
soup.find(id='main_content').findAll('p', {'class': 'important'})

[<p class="important">Important text 1</p>,
 <p class="important">Important text 2</p>,
 <p class="important">Important text 3</p>,
 <p class="important">Important text 4</p>,
 <p class="important">Important text 5</p>,
 <p class="important">Important text 6</p>,
 <p class="important">Important text 7</p>]

In [38]:
# Let's have the "nice" pictures from the div with random_images_1 class!
# Note (see the output): class="not nice" images match as well, because
# 'nice' is one of their classes too.
soup.find(id='main_content').find('div', {'class': 'random_images_1'}).findAll('img', {'class': 'nice'})

[<img class="nice" my_attribute="10" src="http://lorempixel.com/400/200/"/>,
 <img class="not nice" my_attribute="20" src="http://lorempixel.com/400/200/"/>,
 <img class="nice" my_attribute="11" src="http://lorempixel.com/400/200/"/>,
 <img class="not nice" my_attribute="21" src="http://lorempixel.com/400/200/"/>,
 <img class="nice" my_attribute="12" src="http://lorempixel.com/400/200/"/>,
 <img class="not nice" my_attribute="22" src="http://lorempixel.com/400/200/"/>]

In [39]:
# Whoops again. Filter out the result we don't like.
imgs = soup.find(id='main_content').find('div', {'class': 'random_images_1'}).findAll('img', {'class': 'nice'})
nice_imgs = []
for img in imgs:
    if 'not' not in img['class']:
        nice_imgs.append(img['src'])
print nice_imgs

['http://lorempixel.com/400/200/', 'http://lorempixel.com/400/200/', 'http://lorempixel.com/400/200/']


### 3. It's your turn

Save every important link to a file

In [25]:
BASE_URI = '../data/'
filename = 'important_urls.txt'
links = soup.findAll('a')
urls = []
# get the actual urls
for url in links:
    if 'Fake' not in url.getText() and 'Not' not in url.getText():
         urls.append(url.get('href'))

print urls
with open(BASE_URI + filename, 'w') as targetfile:
    for url in urls:
        targetfile.write(url)

Let's get a random img/gif url from 9gag!

In [22]:
# get the page from http://9gag.com/random
# find the images, and get the src attribute's value
# animated img's class: badge-item-animated-img
# not animated img's class: badge-item-img
# hint: multiple class condition with a list {'class': ['val1', 'val2', ...]}
#
# This solution looks up the still image by its class and reads its src; for
# the animated version it looks up the wrapping div and reads its data-image
# attribute. Careful: `response` and `soup` are rebound here, so from now on
# they refer to the 9gag page and no longer to the test page.
URI = "http://9gag.com/random"
response = requests.get(URI)
if response.status_code == 200:
    soup = BeautifulSoup(response.content)
    img = soup.find('img', {'class': 'badge-item-img'})
    gif = soup.find('div', {'class': 'badge-animated-container-animated'})
    if img is not None:
        print img.get('src')
    if gif is not None:
        print gif.get('data-image')

http://img-9gag-fun.9cache.com/photo/aQ4rAQq_460s.jpg
http://img-9gag-fun.9cache.com/photo/aQ4rAQq_460sa.gif


Put the previous code into a function with two arguments: number of img urls, and output filename

In [24]:
def i_want_fun(output, times=5):
    """Crawl random 9gag pages and save the image urls found on them.

    Args:
        output: path of the text file to write; it is opened in 'w' mode, so
            any previous content is replaced. One url is written per line.
        times: number of random pages to request (default: 5).

    A page that does not answer with HTTP 200 is skipped. A matching tag that
    lacks the expected attribute (``src`` / ``data-image``) is skipped as
    well, instead of raising a TypeError on ``None + '\\n'``.
    """
    uri = "http://9gag.com/random"
    with open(output, 'w') as fptr:
        # range() works on both Python 2 and Python 3.
        for _ in range(times):
            response = requests.get(uri)
            if response.status_code != 200:
                continue
            soup = BeautifulSoup(response.content)
            img = soup.find('img', {'class': 'badge-item-img'})
            gif = soup.find('div', {'class': 'badge-animated-container-animated'})
            # Still image first, then the animated one: same order as before.
            found = [
                img.get('src') if img is not None else None,
                gif.get('data-image') if gif is not None else None,
            ]
            for url in found:
                if url is not None:
                    fptr.write(url + '\n')

In [26]:
# crawl the default number of pages (times=5) and save the urls into fun.txt
i_want_fun(BASE_URI+'fun.txt')

Create a class from the previous function. 
The class should store all of the img urls.
The class should have the following methods:
 - `crawl`, which crawls one random 9gag page
 - `crawl_multiple`, which crawls a number (given as argument) of 9gag pages
 - `show_urls`, which prints out the crawled urls
 - `export`, which saves the urls into a file (filename is given as argument)
 - `reset`, which empties the urls

In [29]:
class IWantFun(object):
    """Collects image urls from random 9gag pages.

    The crawled urls are kept in ``self.urls`` (a list of strings) until
    ``reset()`` is called.
    """

    # Address that serves a random post on every request.
    URI = "http://9gag.com/random"

    def __init__(self):
        self.urls = []

    def _remember(self, url):
        """Store ``url`` unless the attribute it was read from was missing."""
        # A None entry would later break '\n'.join(self.urls).
        if url is not None:
            self.urls.append(url)

    def crawl(self):
        """Crawl one random page and store the image url(s) found on it.

        Nothing is stored when the page does not answer with HTTP 200.
        """
        response = requests.get(self.URI)
        if response.status_code != 200:
            return
        soup = BeautifulSoup(response.content)
        img = soup.find('img', {'class': 'badge-item-img'})
        gif = soup.find('div', {'class': 'badge-animated-container-animated'})
        if img is not None:
            self._remember(img.get('src'))
        if gif is not None:
            self._remember(gif.get('data-image'))

    def crawl_multiple(self, times=5):
        """Call ``crawl()`` ``times`` times (default: 5)."""
        # range() works on both Python 2 and Python 3.
        for _ in range(times):
            self.crawl()

    def show_urls(self):
        """Print the stored urls, one per line, between two separator lines."""
        # Parenthesised single-argument prints give the same output on
        # Python 2 and are also valid on Python 3.
        print('URLS:')
        print('-'*30)
        print('\n'.join(self.urls))
        print('-'*30)

    def export(self, output):
        """Write the stored urls to the file ``output``, separated by newlines.

        The file is opened in 'w' mode, so previous content is replaced.
        """
        with open(output, 'w') as fptr:
            fptr.write('\n'.join(self.urls))

    def reset(self):
        """Forget every stored url."""
        self.urls = []

In [30]:
# Walk through the whole interface: crawl one page, then five more, save the
# collected urls to a file, and finally empty the collection.
nine = IWantFun()
nine.crawl()
nine.show_urls()
nine.crawl_multiple(5)
nine.show_urls()
nine.export(BASE_URI + 'fun.txt')
nine.reset()
# after reset() nothing is left to show
nine.show_urls()

URLS:
------------------------------
http://img-9gag-fun.9cache.com/photo/ao0Le7g_700b_v1.jpg
------------------------------
URLS:
------------------------------
http://img-9gag-fun.9cache.com/photo/ao0Le7g_700b_v1.jpg
http://img-9gag-fun.9cache.com/photo/aZx2Dn9_700b.jpg
http://img-9gag-fun.9cache.com/photo/aLPbQM6_700b_v1.jpg
http://img-9gag-fun.9cache.com/photo/aArZw5g_700b.jpg
http://img-9gag-fun.9cache.com/photo/aWW9mvq_700b.jpg
------------------------------
URLS:
------------------------------

------------------------------
