# Default arguments

In [8]:
def default_arg(x, exponent=2):
    """Raise ``x`` to the power ``exponent``.

    ``exponent`` is optional and defaults to 2, so a single-argument call
    squares ``x``. It may be passed positionally or by keyword.
    """
    result = x ** exponent
    return result

In [3]:
default_arg(2)

4

In [4]:
default_arg(2, 3)

8

In [9]:
default_arg(2, exponent = 5)

32

In [12]:
def return_two():
    """Return the two values 1 and 2 packed together as a tuple."""
    pair = (1, 2)
    return pair

In [11]:
return_two()

(1, 2)

In [13]:
foo = 12
bar = 42

In [14]:
foo, bar = bar, foo

In [15]:
foo, bar

(42, 12)

In [16]:
one, two, three = range(1, 4)

In [17]:
one, two, three

(1, 2, 3)

# Dictionaries

In [19]:
my_dict = {1000: 'a', 1024: 'b'}
my_dict

{1000: 'a', 1024: 'b'}

In [20]:
my_dict[1000]

'a'

In [21]:
my_dict.keys()

[1000, 1024]

In [22]:
my_dict.values()

['a', 'b']

In [23]:
my_dict[500] = []

In [24]:
my_dict

{500: [], 1000: 'a', 1024: 'b'}

In [25]:
hash(12)

12

In [26]:
hash(2.3)

2523358617

In [27]:
hash('hello')

840651671246116861

In [28]:
hash((1, 'foo'))

6617818165100668875

In [29]:
hash([1, 'foo'])

TypeError: unhashable type: 'list'

In [30]:
hash({'foo': 12})

TypeError: unhashable type: 'dict'

In [35]:
fns = {'sum': sum,
       'len': len}

In [36]:
fns['sum']([1,2,3])

6

In [38]:
fns['len']([1,2,3])

3

In [40]:
my_dict.pop(500)

[]

In [42]:
my_dict

{1000: 'a', 1024: 'b'}

In [43]:
my_dict['foo']

KeyError: 'foo'

In [45]:
my_dict.get('foo', "Not Here!")

'Not Here!'

In [46]:
my_dict

{1000: 'a', 1024: 'b'}

In [47]:
my_dict.setdefault(5, 5*5)

25

In [76]:
my_dict

{5: 25, 1000: 'a', 1024: 'b'}

In [78]:
my_dict.setdefault(5, 5*5 - 1)

25

In [49]:
5 in my_dict

True

# Dict comprehensions

In [50]:
[i*2 for i in [1,2,3]]

[2, 4, 6]

In [51]:
{i: i**2 for i in [2,3,4]}

{2: 4, 3: 9, 4: 16}

# Special dictionaries

In [52]:
from collections import defaultdict, Counter

In [53]:
int

int

In [54]:
int()

0

In [55]:
float()

0.0

In [56]:
count_dict = defaultdict(int)

In [57]:
count_dict['apple']

0

In [58]:
count_dict['orange'] += 1

In [60]:
count_dict

defaultdict(int, {'apple': 0, 'orange': 1})

In [61]:
counter = Counter([1,2,2,2,3,3,5])

In [62]:
counter

Counter({1: 1, 2: 3, 3: 2, 5: 1})

In [63]:
counter.most_common(2)

[(2, 3), (3, 2)]

# Gotchas

In [64]:
def add_one(some_list):
    """Append the integer 1 to ``some_list`` in place.

    The object passed by the caller is itself modified; nothing is returned.
    """
    some_list.append(1)
    return None

In [65]:
my_list = []

In [66]:
add_one(my_list)

In [67]:
my_list

[1]

In [69]:
def default_list(li = []):
    """Append 1 to ``li`` and return it.

    Gotcha demonstration: the default ``[]`` is created once, when the
    function is defined, so calls that omit ``li`` all append to the same
    list and the result keeps growing from call to call.
    """
    target = li
    target.append(1)
    return target

In [70]:
default_list()

[1]

In [71]:
default_list()

[1, 1]

In [72]:
def better_list(li = None):
    """Append 1 to ``li`` and return it, creating a fresh list when omitted.

    ``None`` is used as the default sentinel so that every call without an
    argument gets its own new list instead of sharing one mutable default.

    Parameters:
        li: list to append to, or None to start from a new empty list.

    Returns:
        The list that was appended to.
    """
    # Identity test: `== None` would invoke a user-defined __eq__, which can
    # wrongly report equality and cause the caller's object to be discarded.
    if li is None:
        li = []
    li.append(1)
    return li

In [73]:
better_list(), better_list(), better_list()

([1], [1], [1])

In [74]:
my_dict[[1,2]]

TypeError: unhashable type: 'list'

In [75]:
{sum: 5}

{<function sum>: 5}

In [81]:
def my_sum(*args):
    """Return the sum of all positional arguments (0 when none are given)."""
    total = sum(args)
    return total

In [82]:
my_sum(1)

1

In [83]:
my_sum(1,2,3,4,5,6)

21

In [84]:
range(10)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [85]:
my_sum(*range(10))

45

In [86]:
def print_many(*args):
    """Print every positional argument on its own line, in the order given.

    The parenthesised single-argument ``print(...)`` form is used because it
    produces the same output as a Python 2 print statement while also being
    valid Python 3.
    """
    for item in args:
        print(item)

In [87]:
print_many(1,4,'foo')

1
4
foo


In [89]:
def apply_to_many(fn, *args):
    """Call ``fn`` once with the extra positional arguments packed in a tuple.

    ``fn`` receives the tuple itself (``fn(args)``), not the unpacked values,
    so it suits callables that accept a single iterable, such as ``sum``.
    """
    packed = args
    return fn(packed)

In [91]:
apply_to_many(sum, 1, 2, 3)

6

In [92]:
def many_named_args(**args):
    """Print the dictionary holding all keyword arguments passed by the caller.

    ``**args`` collects the keyword arguments into a dict mapping each
    argument name to its value. The parenthesised single-argument ``print``
    form works the same on Python 2 and Python 3.
    """
    print(args)

In [94]:
many_named_args(arg1=42, arg2=9, name="James")

{'arg1': 42, 'arg2': 9, 'name': 'James'}


In [95]:
def wrapped(fn, *args, **kwargs):
    """Forward the call to ``fn`` only when ``fn`` is ``sum`` or ``len``.

    All positional and keyword arguments are passed through unchanged. For
    any other callable nothing is invoked and None is returned.
    """
    is_allowed = fn == sum or fn == len
    if not is_allowed:
        return None
    return fn(*args, **kwargs)

In [96]:
wrapped(sum, [1,2,3])

6

In [97]:
wrapped(int, 3, key=5)

# Decorators

In [100]:
def cached(fn):
    """Return a wrapper around ``fn`` that remembers results per argument tuple.

    The wrapper accepts positional arguments only. The tuple of arguments is
    used as a dictionary key, so every argument must be hashable. Each wrapper
    returned by ``cached`` owns its own private cache.

    Parameters:
        fn: callable to wrap; invoked as ``fn(*args)`` on a cache miss.

    Returns:
        A function with the same positional call signature as ``fn``.
    """
    result_cache = {}

    def inner(*args):
        # Print the cache on every call so the demo makes hits and misses
        # visible. The parenthesised form runs on both Python 2 and Python 3.
        print(result_cache)
        if args not in result_cache:
            result_cache[args] = fn(*args)
        return result_cache[args]

    return inner

In [101]:
my_cached_sum = cached(my_sum)

In [102]:
my_cached_sum(1,2,3)

{}


6

In [103]:
my_cached_sum(1,2,3)

{(1, 2, 3): 6}


6

In [104]:
@cached
def my_cached_sum(*args):
    """Sum the positional arguments; ``cached`` wraps the function at definition."""
    total = sum(args)
    return total

In [105]:
def my_cached_sum(*args):
    """Sum the positional arguments."""
    total = sum(args)
    return total

# Manual spelling of the ``@cached`` decorator syntax: rebind the name to the wrapper.
my_cached_sum = cached(my_cached_sum)

# More on reading .csv

In [106]:
import csv

In [107]:
# Load every row of the CSV as a dict keyed by the column names taken from the
# header row. DictReader is iterable, so list() gathers all rows in one step.
with open('data/trends.csv') as trends:
    reader = csv.DictReader(trends)
    data = list(reader)

In [108]:
data[:5]

[{'Week': '2011-12-04',
  'big data': '14',
  'data science': '9',
  'machine learning': '18'},
 {'Week': '2011-12-11',
  'big data': '13',
  'data science': '9',
  'machine learning': '17'},
 {'Week': '2011-12-18',
  'big data': '13',
  'data science': '6',
  'machine learning': '15'},
 {'Week': '2011-12-25',
  'big data': '11',
  'data science': '5',
  'machine learning': '12'},
 {'Week': '2012-01-01',
  'big data': '15',
  'data science': '8',
  'machine learning': '13'}]

# JSON

In [109]:
import json

In [110]:
json.loads("""
{"foo": 12,
 "bar": [1,2,5]}""")

{u'bar': [1, 2, 5], u'foo': 12}

In [111]:
data = json.loads("""
{"foo": 12,
 "bar": [1,2,5]}""")

In [113]:
data, type(data)

({u'bar': [1, 2, 5], u'foo': 12}, dict)

In [116]:
# Pretty-print the parsed data as JSON with 2-space indentation. The
# parenthesised single-argument print form runs on both Python 2 and Python 3.
print(json.dumps(data, indent=2))

{
  "foo": 12, 
  "bar": [
    1, 
    2, 
    5
  ]
}


# Objects

In [117]:
class Person(object):
    """Empty class; attributes can be attached to individual instances later."""

In [118]:
kirk = Person()

In [119]:
kirk

<__main__.Person at 0x10b57a350>

In [120]:
kirk.firstname = "James"

In [122]:
kirk.middlename = "Tiberius"

In [123]:
kirk.lastname = "Kirk"

In [124]:
kirk.firstname

'James'

In [125]:
spock = Person()

In [126]:
spock.lastname

AttributeError: 'Person' object has no attribute 'lastname'

In [127]:
class BetterPerson(object):
    """Person with a class-level ``is_better`` flag shared as a default."""

    # Class attribute: visible on every instance until an instance assigns
    # its own value, which then shadows this one for that instance only.
    is_better = True

In [128]:
guy = BetterPerson()

In [129]:
guy.is_better

True

In [130]:
guy.is_better = False

In [131]:
guy.is_better

False

In [132]:
other_guy = BetterPerson()

In [133]:
other_guy.is_better

True

In [134]:
class EvenBetterPerson(object):
    """Person whose first and last name must be supplied at construction."""

    def __init__(self, firstname, lastname):
        """Store both names as instance attributes of the same name."""
        self.firstname, self.lastname = firstname, lastname

In [135]:
kirk_v2 = EvenBetterPerson("James T.", "Kirk")

In [136]:
kirk_v2.firstname, kirk_v2.lastname

('James T.', 'Kirk')

In [137]:
EvenBetterPerson()

TypeError: __init__() takes exactly 3 arguments (1 given)

In [142]:
class EvenBetterPersonWithPrint(object):
    """Person with a first and last name that can print itself."""

    def __init__(self, firstname, lastname):
        """Store both names as instance attributes of the same name."""
        self.firstname = firstname
        self.lastname = lastname

    def print_me(self):
        """Print the person as ``Person: <lastname>, <firstname>``."""
        # The same text can be built with str.format:
        #   "Person: {me.lastname}, {me.firstname}".format(me=self)
        # The parenthesised single-argument print form runs on both
        # Python 2 and Python 3.
        print("Person: " + self.lastname + ", " + self.firstname)

In [143]:
p = EvenBetterPersonWithPrint("Stephen", "Hawking")
p.print_me()

Person: Hawking, Stephen


In [144]:
p.firstname

'Stephen'

In [145]:
p.lastname

'Hawking'

In [146]:
p_prime = p

In [147]:
p_prime.lastname, p_prime.firstname

('Hawking', 'Stephen')

In [148]:
del p_prime

In [149]:
p_prime

NameError: name 'p_prime' is not defined

# Inheritance

In [154]:
class PersonWithFullName(EvenBetterPersonWithPrint):
    """Subclass of ``EvenBetterPersonWithPrint`` that adds a full-name accessor."""

    def get_full_name(self):
        """Return the first and last name joined by a single space."""
        first, last = self.firstname, self.lastname
        return first + " " + last

In [155]:
p = PersonWithFullName("Donald", "Trump")

In [156]:
p.print_me()

Person: Trump, Donald


In [157]:
p.get_full_name()

'Donald Trump'

In [158]:
type(p)

__main__.PersonWithFullName

In [159]:
isinstance(p, EvenBetterPersonWithPrint)

True

# Web scraping

- Be nice.
- Follow the rules.
- Read the terms and conditions.
- Read the robots.txt.

In [162]:
# Read the whole page inside a context manager so the file handle is closed
# as soon as the content has been read, instead of being left open.
with open('example.html') as html_file:
    html = html_file.read()

In [163]:
# Show the raw page source. The parenthesised single-argument print form runs
# on both Python 2 and Python 3.
print(html)

<html>
    <head>
        <title>Test page</title>
    </head>
    <body>
        <h1 id="title">This is an example page</h1>
        <h2 id="list-header">This is a list of temperatures</h2>
        <ul>
            <li>2017-01-20: 5&deg;</li>
            <li>2017-01-21: 7&deg;</li>
            <li>2017-01-22: 4&deg;</li>
            <li>2017-01-23: 2&deg;</li>
        </ul>
        <h2>Links</h2>
        This is a <a href="example2.html">link</a> to a second page.
        <h2>Image</h2>
        This is a histogram (whose axes should be labeled!).
        <img src="histogram.png" width="50%" alt="Histogram of something.">
    </body>
</html>


In [164]:
from bs4 import BeautifulSoup

In [165]:
soup = BeautifulSoup(html, 'lxml')

In [166]:
soup.body

<body>\n<h1 id="title">This is an example page</h1>\n<h2 id="list-header">This is a list of temperatures</h2>\n<ul>\n<li>2017-01-20: 5\xb0</li>\n<li>2017-01-21: 7\xb0</li>\n<li>2017-01-22: 4\xb0</li>\n<li>2017-01-23: 2\xb0</li>\n</ul>\n<h2>Links</h2>\n        This is a <a href="example2.html">link</a> to a second page.\n        <h2>Image</h2>\n        This is a histogram (whose axes should be labeled!).\n        <img alt="Histogram of something." src="histogram.png" width="50%"/>\n</body>

In [167]:
soup.head

<head>\n<title>Test page</title>\n</head>

In [168]:
soup.li

<li>2017-01-20: 5\xb0</li>

In [169]:
soup('li')

[<li>2017-01-20: 5\xb0</li>,
 <li>2017-01-21: 7\xb0</li>,
 <li>2017-01-22: 4\xb0</li>,
 <li>2017-01-23: 2\xb0</li>]

In [170]:
soup.ul

<ul>\n<li>2017-01-20: 5\xb0</li>\n<li>2017-01-21: 7\xb0</li>\n<li>2017-01-22: 4\xb0</li>\n<li>2017-01-23: 2\xb0</li>\n</ul>

In [171]:
soup.ul.li

<li>2017-01-20: 5\xb0</li>

In [172]:
soup.li.text

u'2017-01-20: 5\xb0'

In [173]:
soup.img['src']

'histogram.png'

In [174]:
soup.img.name

'img'

In [175]:
soup.img.parent.name

'body'

In [176]:
soup.ul

<ul>\n<li>2017-01-20: 5\xb0</li>\n<li>2017-01-21: 7\xb0</li>\n<li>2017-01-22: 4\xb0</li>\n<li>2017-01-23: 2\xb0</li>\n</ul>

In [178]:
list(soup.ul.children)

[u'\n',
 <li>2017-01-20: 5\xb0</li>,
 u'\n',
 <li>2017-01-21: 7\xb0</li>,
 u'\n',
 <li>2017-01-22: 4\xb0</li>,
 u'\n',
 <li>2017-01-23: 2\xb0</li>,
 u'\n']

In [179]:
soup('h2')

[<h2 id="list-header">This is a list of temperatures</h2>,
 <h2>Links</h2>,
 <h2>Image</h2>]

In [180]:
soup('h2', {'id': 'list-header'})

[<h2 id="list-header">This is a list of temperatures</h2>]

In [182]:
soup('a')

[<a href="example2.html">link</a>]

# Scrapy

You will find the scraping examples at https://github.com/dhesse/stk_inf_scraping.

In [183]:
from urlparse import urljoin

In [186]:
urljoin('http://localhost:8888/files/example.html', 'example2.html')

'http://localhost:8888/files/example2.html'