# Twitter

In [None]:
import requests
from requests_oauthlib import OAuth1

In [None]:
from donthackme import CONSUMER_KEY, CONSUMER_SECRET, TOKEN, TOKEN_SECRET

In [None]:
# api url
url = 'https://api.twitter.com/1.1/account/verify_credentials.json'

In [None]:
auth = OAuth1(CONSUMER_KEY, CONSUMER_SECRET, TOKEN, TOKEN_SECRET)

In [None]:
verify_response = requests.get(url, auth=auth)

In [None]:
verify_response

In [None]:
verify_response.headers

In [None]:
verify_response.json().keys()

In [None]:
verify_response.json()['name']

In [None]:
verify_response.json()['screen_name']

# Search

In [None]:
search_url = 'https://api.twitter.com/1.1/search/tweets.json'

In [None]:
# Query-string arguments for the search call: the search text and the
# result ordering ('recent'; 'popular' is also possible).
params = dict(q='data science', result_type='recent')

In [None]:
search_response = requests.get(search_url, params=params, auth=auth)

In [None]:
search_response

In [None]:
search_response.json()

In [None]:
re_json = search_response.json()

In [None]:
# Pick the first tweet of the search result. `or [{}]` also covers the case
# where 'statuses' is present but empty (a search without hits); the plain
# .get() default only helps when the key is missing, and [][0] would raise
# an IndexError.
first_status = (re_json.get('statuses') or [{}])[0]
# .get() avoids a KeyError when the empty placeholder dict was used. The
# parenthesised single-argument print behaves the same on Python 2 and 3.
print(first_status.get('text', 'No tweets found.'))

In [None]:
sorted(first_status.keys())

In [None]:
first_status['user'].keys()

In [None]:
# okay: re_json['statuses']
re_json.get('statuses', [{}])[0] #advanced

## Streaming API

In [None]:
from itertools import islice

In [None]:
# Open the filtered status stream for tweets matching the '#data' track term.
r = requests.post(
    'https://stream.twitter.com/1.1/statuses/filter.json',
    params={'track': '#data'},
    auth=auth,
    stream=True,  # important
)

In [None]:
tweets = r.iter_lines()

In [None]:
import json

In [None]:
# Print the first 20 characters of each of the next 20 lines of the stream.
for tweet in islice(tweets, 20):
    # iter_lines() hands back an EMPTY line (never None) for the keep-alive
    # newlines the stream sends while no tweet matches, so test truthiness;
    # json.loads() on an empty line would raise.
    if tweet:
        # Not every stream message is a tweet (e.g. limit or delete notices),
        # so read 'text' with .get() instead of indexing.
        print(json.loads(tweet).get('text', '')[:20])
    else:
        print('Timeout.')

In [None]:
r.close()

In [None]:
# student question: What about the u in u'string'?
print u'Hi, Håvard!'

In [None]:
u'Hi, Håvard!' # good

In [None]:
'Hi, Håvard!' # less good!

In [None]:
type(u'')

In [None]:
type('')

# Storing things

## Files (e.g. .csv)

- Don't scale (could use HDFS)
- What about JSON?

## Databases

- SQL
- NoSQL

# MongoDB

- Windows: Download 'Community edition'
- MAC
    brew update
    brew install mongodb
    brew services start mongodb
- Linux
    sudo apt install mongodb
    
Plus:

    pip install pymongo

MongoDB stores BSON.

In [None]:
from datetime import datetime

In [None]:
from pymongo import MongoClient

In [None]:
MongoClient('localhost', 27017)

In [None]:
MongoClient()

In [None]:
# Handle on the collection used in the following cells:
# sb1 - Database
# music - Collection
c = MongoClient().sb1.music

In [None]:
c

In [None]:
# Example document: nested name sub-document, two datetime fields and a
# list of album sub-documents.
jackson = dict(
    name=dict(first='Michael', last='Jackson', middle='Joseph'),
    born=datetime(1958, 8, 29),
    died=datetime(2009, 6, 25),
    albums=[
        dict(name='Thriller', released=1982),
        dict(name='Bad', released=1987),
    ],
)

In [None]:
result = c.insert_one(jackson)

In [None]:
result

In [None]:
result.acknowledged

In [None]:
result.inserted_id

In [None]:
c.find_one(result.inserted_id)

In [None]:
c.find_one() # any object

In [None]:
list(c.find())

In [None]:
# Walk over everything find() returns and print each entry.
for document in c.find():
    print(document)

In [None]:
# Same loop as before, but with limit(10) applied to the cursor.
for document in c.find().limit(10):
    print(document)

In [None]:
c.find_one({'born': datetime(1958, 8, 29)})

In [None]:
c.find_one({'name.middle': 'Joseph'})

In [None]:
c.find_one({'albums.released': 1982})

In [None]:
c.find_one({'albums.released': {'$gt': 1980}})

In [None]:
# AND
c.find_one({'name.first': 'Michael',
             'name.last': 'Bowie'})

In [None]:
# OR
c.find_one({'$or': [{'name.first': 'Michael'},
                     {'name.last': 'Bowie'}]})

In [None]:
# Second example document; it has neither a 'died' nor an 'albums' field.
bowie = dict(
    name=dict(first='David', last='Bowie', middle='Robert'),
    born=datetime(1949, 1, 8),
)

In [None]:
c.insert_one(bowie)

In [None]:
list(c.find({}, {'name.first': 1,
                 'born': 1}))

In [None]:
list(c.find({}, {'name.first': 1,
                 'born': 1,
                 '_id': 0}))

In [None]:
list(c.find({}, {'_id': 0}))

In [None]:
# Range query on 'born'. Both bounds must live in ONE operator document:
# a dict literal cannot hold the key 'born' twice - the second entry would
# silently replace the first and the $lt condition would be lost.
c.find_one({'born': {'$lt': datetime(2000, 1, 1),  # AND
                     '$gt': datetime(1900, 1, 1)}})

# Modifying things

In [None]:
result = c.update_one({'name.last': 'Bowie'},
                      {'$set': {'albums': []}})

In [None]:
result.acknowledged

In [None]:
c.find_one({'name.last': 'Bowie'})

In [None]:
c.update_one({'name.last': 'Bowie'},
             {'$push': {'albums': {'name': "Let's Dance",
                                   'released': 1983}}})

In [None]:
c.find_one({'name.last': 'Bowie'})

In [None]:
c.update_one({'name.last': 'Bowie'},
             {'$inc': {'albums.0.released': 1}})

In [None]:
c.find_one({'name.last': 'Bowie'})

In [None]:
r = c.update_one({'name.last': 'Bowie'},
                 {'$inc': {'albums.0.released': -1}})

## Aggregation

We'll talk more about this next session.

In [None]:
agg = c.aggregate([{'$group': {'_id': '$born',
                               'people_count': {'$sum': 1}}}])

In [None]:
list(agg)

# Loading data into Pandas

In [None]:
import pandas as pd

In [None]:
pd.io.json.json_normalize(list(c.find({}, {'_id': 0})))

# Web Scraping

## Before we start

- Web scraping can lead to messy data.
- Read websites' terms and conditions.
- Respect websites' robots.txt.
- Be nice!
- The Firefox inspector is your friend!

# HTML 101

Webpages are written in HTML. You can read more in the [w3 html tutorial](http://www.w3schools.com/html/).

We will work with a simple [example page](example.html).

In [None]:
# Read the example page into a string; the with-block closes the file handle
# again instead of leaving it open.
with open("example.html") as html_file:
    html = html_file.read()

In [None]:
print html

In [None]:
from bs4 import BeautifulSoup

In [None]:
soup = BeautifulSoup(html, "lxml")

In [None]:
soup.body

In [None]:
soup.body.ul("li")

In [None]:
soup("li")

In [None]:
soup('img')

In [None]:
soup.img

In [None]:
# Print the attribute mapping of every <img> tag found in the document.
for image_tag in soup("img"):
    print(image_tag.attrs)

In [None]:
soup.img['src']

In [None]:
soup.li.parent

In [None]:
soup.li.parent.name

In [None]:
soup.li.parent.text

In [None]:
soup.ul.contents

In [None]:
soup.ul.children

In [None]:
list(soup.ul.children)

In [None]:
soup(lambda tag: tag.has_attr('alt'))

# Scrapy

In [None]:
import scrapy

In [None]:
try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin  # Python 2


class TestSpider(scrapy.Spider):
    """Spider that collects the text of every <li> and follows every link.

    Crawling starts at the example page listed in ``start_urls``; each page
    reached through a link is handled by the same ``parse`` callback.
    """

    name = 'test_spider'
    start_urls = ['http://localhost:8888/files/example.html']
    # can be done from command line
    custom_settings = {'FEED_FORMAT': 'json',
                       'FEED_URI': 'result.json'}

    def parse(self, response):
        """Yield one item per list entry and one request per link.

        Args:
            response: the downloaded page handed over by Scrapy.

        Yields:
            A dict with the keys 'list_item' and 'url' for every <li>
            element, followed by a ``scrapy.Request`` (with this method as
            callback) for every <a> element that carries an href.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        for list_item in soup('li'):
            yield {'list_item': list_item.text,
                   'url': response.url}
        # href=True skips anchors without a target (e.g. <a name="top">),
        # for which link['href'] would raise a KeyError.
        for link in soup('a', href=True):
            # Resolve relative links against the URL of the current page.
            target = urljoin(response.url, link['href'])
            yield scrapy.Request(target, callback=self.parse)

# Running

The spider could now be run using

    scrapy runspider test_spider.py
   
or

    scrapy runspider --output=results.csv test_spider.py

Also try

    scrapy shell

In [None]:
from scrapy.crawler import CrawlerProcess

In [None]:
process = CrawlerProcess()

In [None]:
process.crawl(TestSpider)

In [None]:
process.start()