# Testing

## Why testing

- Prove that your code works.
- Make modification easy.
- Document expected behaviour.
- Easy to do.
- Methodology.

In [None]:
from datetime import datetime
def ymd(date_string):
    """Parse a space-separated 'YYYY MM DD' string into a datetime.

    Parsing is delegated to ``datetime.strptime``, which raises
    ``ValueError`` when the string does not match the format.

    Example:

    >>> ymd('2016 01 30')
    datetime.datetime(2016, 1, 30, 0, 0)
    """
    return datetime.strptime(date_string, '%Y %m %d')

In [None]:
ymd('2016 01 30')

In [None]:
import unittest

In [None]:
class YMDTest(unittest.TestCase):
    """Checks ``ymd`` against a date whose parsed value is known."""

    def test_with_known_values(self):
        # '2016 01 30' must parse to midnight on 30 January 2016.
        expected = datetime(2016, 1, 30, 0, 0)
        self.assertEqual(ymd('2016 01 30'), expected)

# Running

    python -m unittest YMDTest

In [None]:
# Run one test method by hand and collect the outcome in a TestResult.
result = unittest.TestResult()
# A TestCase instance is bound to a single test method, chosen by name.
test = YMDTest('test_with_known_values')
test.run(result)
# Last expression of the cell, so the notebook displays the TestResult.
result

In [None]:
def run_test(cls, method_name):
    """Run a single test method and return the collected result.

    ``cls`` is called with ``method_name`` (the way a
    ``unittest.TestCase`` subclass is instantiated) and the resulting
    test is run against a fresh ``unittest.TestResult``, which is
    returned.
    """
    outcome = unittest.TestResult()
    cls(method_name).run(outcome)
    return outcome

In [None]:
run_test(YMDTest, 'test_with_known_values')

In [None]:
class YMDError(YMDTest):
    """Error-path tests for ``ymd``.

    Subclasses YMDTest, so the inherited known-value test belongs to
    this class as well.
    """

    def test_throws_on_invalid_string(self):
        # An empty string cannot match the expected date format.
        with self.assertRaises(ValueError):
            ymd('')
        with self.assertRaises(ValueError):
            # day out of range
            ymd('2016 02 30')

In [None]:
run_test(YMDError, 'test_throws_on_invalid_string')

# What you can test

- Pipelines
- Data prep & transformation
- Data ingestion
- ...

# What you can't test

- Prediction accuracy (within limits)
    - Disaster prevention is possible
- 'Statistical stuff'

In [None]:
class MyTests(unittest.TestCase):
    """Tour of common TestCase assertions; every one of them passes."""

    def test_stuff(self):
        # Floats: equal once the difference is rounded to 7 decimal places
        # (the default).
        self.assertAlmostEqual(0.00000000001, 0)
        # assertEqual compares with ==, so it works for numbers, lists, dicts.
        self.assertEqual(1, 1)
        self.assertEqual([1,2], [1,2])
        self.assertEqual({1: 1}, {1: 1})

        # Prefer the specific assertion over assertTrue: on failure
        # assertTrue can only report "False is not true", while the
        # specific assertion reports the values involved.
        self.assertTrue(1 == 1) # avoid
        self.assertTrue(1 in range(2)) # avoid
        self.assertIn(1, range(2)) # better

        self.assertTrue(1 < 2) # avoid
        self.assertLess(1, 2) # better

In [None]:
run_test(MyTests, 'test_stuff')

In [None]:
# List every assert* helper that unittest.TestCase provides.
# print(...) with a single argument behaves the same on Python 2 and 3.
for i in dir(unittest.TestCase):
    if i.startswith('assert'):
        print(i)

In [None]:
import doctest

In [None]:
# Run the examples in ymd's docstring; the third argument (verbose=True)
# reports every example, not only the failing ones.
doctest.run_docstring_examples(ymd, globals(), True)

# Twitter

In [None]:
import requests
from requests_oauthlib import OAuth1

In [None]:
from donthackme import CONSUMER_KEY, CONSUMER_SECRET, TOKEN, TOKEN_SECRET

In [None]:
# api url
url = 'https://api.twitter.com/1.1/account/verify_credentials.json'

In [None]:
auth = OAuth1(CONSUMER_KEY, CONSUMER_SECRET, TOKEN, TOKEN_SECRET)

In [None]:
verify_response = requests.get(url, auth=auth)

In [None]:
verify_response

In [None]:
verify_response.headers

In [None]:
verify_response.json().keys()

In [None]:
verify_response.json()['name']

In [None]:
verify_response.json()['screen_name']

# Search

In [None]:
search_url = 'https://api.twitter.com/1.1/search/tweets.json'

In [None]:
params = {'q': 'data science',
          'result_type': 'recent'} # popular also possible

In [None]:
search_response = requests.get(search_url, params=params, auth=auth)

In [None]:
search_response

In [None]:
search_response.json()

In [None]:
re_json = search_response.json()

In [None]:
# Take the first tweet of the search result; the [{}] default keeps the
# [0] lookup from failing when the response has no 'statuses' key.
first_status = re_json.get('statuses', [{}])[0]
# print(...) with a single argument works on both Python 2 and Python 3.
print(first_status['text'])

In [None]:
sorted(first_status.keys())

In [None]:
first_status['user'].keys()

In [None]:
# okay: re_json['statuses']
re_json.get('statuses', [{}])[0] #advanced

## Streaming API

In [None]:
from itertools import islice

In [None]:
# Open a connection to the streaming endpoint, tracking '#data'.
r = requests.post('https://stream.twitter.com/1.1/statuses/filter.json',
                 params = {'track': '#data'},
                 auth=auth,
                 stream=True) # important: read the body incrementally instead of downloading it all up front

In [None]:
tweets = r.iter_lines()

In [None]:
import json

In [None]:
# Show the first 20 characters of the text of the next 20 stream lines.
for tweet in islice(tweets, 20):
    # iter_lines() yields empty lines for keep-alive newlines, never None,
    # so test truthiness; json.loads would fail on an empty line.
    if tweet:
        print(json.loads(tweet)['text'][:20])
    else:
        print('Timeout.')

In [None]:
r.close()

In [None]:
# student question: What about the u in u'string'?
# The u prefix marks a unicode (text) literal. print(...) with a single
# argument works on both Python 2 and Python 3.
print(u'Hi, Håvard!')

In [None]:
u'Hi, Håvard!' # good

In [None]:
'Hi, Håvard!' # less good!

In [None]:
type(u'')

In [None]:
type('')

# Storing things

## Files (e.g. .csv)

- Don't scale (could use HDFS)
- What about JSON?

## Databases

- SQL
- NoSQL

# MongoDB

- Windows: Download 'Community edition'
- Mac
    brew update
    brew install mongodb
    brew services start mongodb
- Linux
    sudo apt install mongodb
    
Plus:

    pip install pymongo

MongoDB stores BSON.

In [None]:
from pymongo import MongoClient

In [None]:
MongoClient('localhost', 27017)

In [None]:
MongoClient()

In [None]:
# stkinf - Database
# music - Collection
c = MongoClient().stkinf.music

In [None]:
c

In [None]:
# Example document: a nested sub-document ('name'), two datetimes and a
# list of sub-documents ('albums').
jackson = {
    'name': {'first': 'Michael', 'last': 'Jackson', 'middle': 'Joseph'},
    'born': datetime(1958, 8, 29),
    'died': datetime(2009, 6, 25),
    'albums': [
        {'name': 'Thriller', 'released': 1982},
        {'name': 'Bad', 'released': 1987},
    ],
}

In [None]:
result = c.insert_one(jackson)

In [None]:
result

In [None]:
result.acknowledged

In [None]:
result.inserted_id

In [None]:
c.find_one(result.inserted_id)

In [None]:
c.find_one() # any object

In [None]:
list(c.find())

In [None]:
# Iterate over every document in the collection.
for i in c.find():
    print(i)  # single-argument print(...) works on Python 2 and 3

In [None]:
# Same loop, but the cursor is capped at 10 documents.
for i in c.find().limit(10):
    print(i)  # single-argument print(...) works on Python 2 and 3

In [None]:
c.find_one({'born': datetime(1958, 8, 29)})

In [None]:
c.find_one({'name.middle': 'Joseph'})

In [None]:
c.find_one({'albums.released': 1982})

In [None]:
c.find_one({'albums.released': {'$gt': 1980}})

In [None]:
# AND: every key in the filter document has to match.
c.find_one({'name.first': 'Michael',
             'name.last': 'Bowie'})

In [None]:
# OR: $or matches when at least one of the listed conditions matches.
c.find_one({'$or': [{'name.first': 'Michael'},
                     {'name.last': 'Bowie'}]})

In [None]:
# David Bowie was born on 8 January 1947.
bowie = {'name': {'first': 'David',
                  'last': 'Bowie',
                  'middle': 'Robert'},
         'born': datetime(1947, 1, 8)}

In [None]:
c.insert_one(bowie)

In [None]:
list(c.find({}, {'name.first': 1,
                 'born': 1}))

In [None]:
list(c.find({}, {'name.first': 1,
                 'born': 1,
                 '_id': 0}))

In [None]:
list(c.find({}, {'_id': 0}))

In [None]:
# Both bounds must live in ONE sub-document: a dict literal that repeats
# the 'born' key keeps only the last value, which would drop the $lt bound.
c.find_one({'born': {'$lt': datetime(2000, 1, 1),  # AND
                     '$gt': datetime(1900, 1, 1)}})

# Modifying things

In [None]:
# $set assigns the field; update_one changes only the first matching document.
result = c.update_one({'name.last': 'Bowie'},
                      {'$set': {'albums': []}})

In [None]:
result.acknowledged

In [None]:
c.find_one({'name.last': 'Bowie'})

In [None]:
# $push appends one element to the 'albums' array.
c.update_one({'name.last': 'Bowie'},
             {'$push': {'albums': {'name': "Let's Dance",
                                   'released': 1983}}})

In [None]:
c.find_one({'name.last': 'Bowie'})

In [None]:
# $inc adds to a numeric field; 'albums.0.released' addresses the
# 'released' field of the first element of the 'albums' array.
c.update_one({'name.last': 'Bowie'},
             {'$inc': {'albums.0.released': 1}})

In [None]:
c.find_one({'name.last': 'Bowie'})

In [None]:
# $inc with a negative amount decrements, undoing the earlier +1.
r = c.update_one({'name.last': 'Bowie'},
                 {'$inc': {'albums.0.released': -1}})

## Aggregation

We'll talk more about this next session.

In [None]:
# Group documents by their 'born' value; {'$sum': 1} counts the documents
# in each group.
agg = c.aggregate([{'$group': {'_id': '$born',
                               'people_count': {'$sum': 1}}}])

In [None]:
list(agg)