# Python 101 @ SzISz VII.

---

## Today: Web Scraping II.

### Act I: scrape the Hungarian tenders

In [None]:
# import the necessary libraries
import requests
from bs4 import BeautifulSoup

Let's get some nasty data!

In [None]:
# shared settings used by the cells below
BASE_URI = "../data/"  # prefix for the output files written later
BASE_URL = "http://kozbeszerzes.ceu.hu"  # prepended to every requested path
kozgep_suburl = "/entity/t/10950676.xml"  # path of the entity page to download

In [None]:
# download raw data and show the HTTP status code of the response
kozgep_response = requests.get(BASE_URL + kozgep_suburl)
# parenthesised single-argument print behaves the same on Python 2 and 3
print(kozgep_response.status_code)

In [None]:
# parse the downloaded body and print what we got, nicely indented
soup = BeautifulSoup(kozgep_response.content)
# parenthesised single-argument print behaves the same on Python 2 and 3
print(soup.prettify())

In [None]:
# iterate through the tenders and print the absolute URL of each one
# beautifulsoup can parse xmls too
for tender in soup.find('all_tenders_won').find_all('tender'):
    # each <tender> carries a site-relative path in its `url` attribute
    print(BASE_URL + tender['url'])

In [None]:
# download and parse the first tender listed under <all_tenders_won>
tender_response = requests.get(
    BASE_URL + soup.find('all_tenders_won').find('tender')['url']
)
if tender_response.status_code != 200:
    print('Tender download failed!')
else:
    # only parse the body when the download succeeded
    tender_soup = BeautifulSoup(tender_response.content)
    print(tender_soup.prettify())

In [None]:
# create a function, and get the needed information out of the xml
def get_tenders(base_url, sub_url):
    """Collect year, estimated value and subject of every won tender.

    Downloads ``base_url + sub_url``, follows the ``url`` attribute of each
    ``<tender>`` found under ``<all_tenders_won>`` and reads the ``year``,
    ``estimated_value`` and ``subject`` attributes of the ``<tender>``
    element on each downloaded page.

    Args:
        base_url: prefix prepended to every requested path.
        sub_url: path of the page that lists the won tenders.

    Returns:
        A list of rows, each a list of three strings. The first row is the
        header ``['Year', 'Value', 'Desc']``. If the listing page cannot be
        downloaded or has no ``<all_tenders_won>`` element, only the header
        row is returned, so the result can always be iterated and saved.
        Tenders whose page fails to download are skipped.
    """
    won_tenders = [['Year', 'Value', 'Desc']]  # init with headers

    response = requests.get(base_url + sub_url)
    if response.status_code != 200:
        print('Download failed!')
        return won_tenders

    won_section = BeautifulSoup(response.content).find('all_tenders_won')
    if won_section is None:
        # nothing listed: find() returned no matching element
        return won_tenders

    for tender in won_section.find_all('tender'):
        tender_response = requests.get(base_url + tender['url'])
        if tender_response.status_code != 200:
            print('Tender download failed!')
            continue

        # look the element up once instead of once per attribute
        details = BeautifulSoup(tender_response.content).find('tender')
        if details is None:
            # page without a <tender> element: nothing to record
            continue

        # double any embedded quote so the wrapped field stays one field
        subject = details.get('subject', u'').replace('"', '""')
        won_tenders.append([
            details.get('year', u''),
            details.get('estimated_value', u''),
            '"' + subject + '"',  # we use " to make sure that the data is wrapped
        ])
    return won_tenders

In [None]:
# save function: one ';'-separated line per row
# since we have Hungarian text, the characters are encoded in UTF-8
# and unfortunately the csv module does not support that
import codecs
def save_results(filename, tenders):
    """Write every row of *tenders* to *filename*, one line per row.

    The fields of a row are joined with ';' and each line is terminated by
    a newline character. The file is opened for writing as UTF-8.
    """
    field_separator = u';'
    line_end = u'\n'
    with codecs.open(filename, mode='w', encoding='utf-8') as csv_file:
        for row in tenders:
            csv_file.write(field_separator.join(row) + line_end)

In [None]:
# main function: scrape the tenders, then store them
def main():
    """Fetch the tender rows and save them as 'kozgep.csv' under BASE_URI."""
    target_file = BASE_URI + 'kozgep.csv'
    rows = get_tenders(BASE_URL, kozgep_suburl)
    save_results(target_file, rows)

In [None]:
# execute: download the tenders and write them to BASE_URI + 'kozgep.csv'
main()

---

### Intermission: Creating a standalone script

In [None]:
# Intermission
# show a YouTube video in the cell output; autoplay=1 is passed on to the player
from IPython.display import YouTubeVideo
YouTubeVideo("O0wOD9TWynM", autoplay=1)

Create a new text file with a `.py` extension; you can choose the filename.
Start it with:  
    `# encoding: utf-8`  
then copy-paste:
    - the imports, 
    - the global variables 
    - the three functions
and insert the following two lines at the end of the file:  
`if __name__ == '__main__':`  
`    main()`  
Save it, and now you can execute this script by invoking:  
    `python your_specified_filename.py`

In [None]:
# You can even import your newly created script:
import myscript # use your filename

In [None]:
# get its contents: list the names available on the imported module
dir(myscript)

In [None]:
# print its variables (parenthesised print works on Python 2 and 3)
print(myscript.BASE_URL)

In [None]:
# use its functions: scrape again, this time through the imported module
tenders = myscript.get_tenders(myscript.BASE_URL, myscript.kozgep_suburl)

In [None]:
# save the rows with the module's save function; BASE_URI is the notebook's own variable
myscript.save_results(BASE_URI + 'kozgep1.csv', tenders)

---

### Act II: Disguise yourself!

Let's pretend to be a browser instead of a script

In [None]:
# browser User-Agent strings; get_header() picks one of them at random
USER_AGENTS = [
    # Chrome
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    # Firefox
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
    # Opera
    'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
    # Safari
    'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
    # Internet Explorer, probably a good idea to leave this one out...
    'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0',
]

In [None]:
# wrapper that turns a pool of user-agent strings into request headers
import random
def get_header(agents):
    """Return a headers dict whose 'User-agent' is a random item of *agents*."""
    chosen_agent = random.choice(agents)
    headers = {}
    headers['User-agent'] = chosen_agent
    return headers

Get the main articles from index.hu

In [None]:
# request the index.hu front page, sending a randomly chosen browser User-agent
url = 'http://index.hu'
index_response = requests.get(url, headers=get_header(USER_AGENTS))

In [None]:
# study the skeleton of the site: parse the page and print it indented
soup = BeautifulSoup(index_response.content)
# parenthesised single-argument print behaves the same on Python 2 and 3
print(soup.prettify())

In [None]:
# get the "front page"
# NOTE(review): the whole class string is passed as one value, so this lookup
# presumably stops matching as soon as the site changes any of these classes;
# find() would then give None and the following cells would fail — confirm
index_main = soup.find('section', {'class': 'blokk hajtas-felett dupla-vezeto-blokk cimlap-blokk-index blokk_uj-blokk saved'})

In [None]:
# print the basics about the main articles
for article in index_main.find_all('article'):
    # look every element up once and reuse it
    heading = article.find('h1', {'class': 'cikkcim'})
    link = heading.find('a') if heading is not None else None
    if link is None:
        # no linked title in this <article>: nothing to report, skip it
        continue
    print('')
    # article image if exists
    image = article.find('img')
    if image is not None:
        print('[ %s ]' % (image.get('src'),))
    # title
    print(heading.get_text())
    # link
    print('< %s >' % (link.get('href'),))
    # promo text if exists
    promo = article.find('p', {'class': 'ajanlo'})
    if promo is not None:
        print(promo.get_text())
    print('-' * 79)

Let's get the article texts, and the list of images for each "main" article!

In [None]:
# collect [title, text, image urls] for every main article that can be parsed
articles = []
for article in index_main.find_all('article'):
    heading = article.find('h1', {'class': 'cikkcim'})
    link = heading.find('a') if heading is not None else None
    if link is None or not link.get('href'):
        # no linked title: there is no article page to download
        continue
    article_response = requests.get(
        link.get('href'),
        headers=get_header(USER_AGENTS)
    )
    # separate name so the front-page `soup` is not overwritten
    article_soup = BeautifulSoup(article_response.content)
    article_container = article_soup.find('div', {'class': 'cikk-torzs-container'})
    if article_container is None:
        # page without the expected body container: skip it
        continue
    title = heading.get_text()
    text = u'\n'.join(p.get_text() for p in article_container.find_all('p'))
    # the loop variable is `img`, not `url`: on Python 2 a list-comprehension
    # variable leaks into the enclosing scope and would overwrite the `url` string
    images = [img.get('src') for img in article_container.find_all('img')]
    articles.append([title, text, images])

In [None]:
# print every collected article: title, text, then one image url per line
for title, text, images in articles:
    print('')
    # one formatted string per line prints the same on Python 2 and 3
    print('Title: %s' % title)
    print('Text: %s' % text)
    print('Images:')
    for img in images:
        print(img)
    print('-' * 79)

### Final Act: Your turn!

Write a script called `youtube.py`, in which you create a class called `RelatedTube`.
It has an attribute: `base_url` (YouTube's base URL).
It has three methods: `__init__` (described as Init below), `get`, and `set`.

Init:
    - Arguments: (`self` and) `youtube_video_id`
    - Output: -
    - Workflow: set the `self.video` to `youtube_video_id`
Get:
    - Arguments: `self`
    - Output: the links to the related videos
    - Workflow: 
        * get the `self.video` page
        * parse it for the related links
        * return them in a list
Set:
    - Arguments: (`self` and) `youtube_video_id`
    - Output: -
    - Workflow: set the `self.video` to `youtube_video_id`
Don't forget to hide your a**!!!

In [None]:
# test the script
import youtube

In [None]:
# create a RelatedTube for one video id
related = youtube.RelatedTube('zkxqRthhwIs')

In [None]:
# print every item returned by get(), one per line
for video in related.get():
    # parenthesised single-argument print behaves the same on Python 2 and 3
    print(video)