<div id="toc"> </div>

# Setting up the notebook

In [6]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import numpy.random as rnd
import os

# to make this notebook's output stable across runs
rnd.seed(42)

# To plot figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [7]:
# Where to save the figures
PROJECT_ROOT_DIR = "."

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)


# Demonstrations

## Demo 1

In [8]:
import requests

We'll look to scrape from a website that has table and save it to a `.csv` file. We'll do a quick scrape of a [US Government public database](https://www.cia.gov/library/publications/the-world-factbook/fields/print_2085.html)

In [9]:
stats = requests.get("https://www.cia.gov/library/publications/the-world-factbook/fields/print_2085.html")

In [10]:
#Let's see if we got the content or not?
stats.status_code

200

In [11]:
#Printing content
stats.content

'\n        <!doctype html>\n\t<html>\n\t<head>\n        <script src=../js/jquery-1.8.3.min.js></script>\n\t<link rel=stylesheet type=text/css href=../styles/print.css>\n        </head>\n\t<body width=700px font-size=12px>\n        <div class=text-holder>\n        <div class=fbTitleRankOrder>FIELD LISTING :: <strong>ROADWAYS</strong></div>\n<div class=rankOrderDesc>This entry gives the <em>total</em> length of the road network and includes the length of the <em>paved</em> and <em>unpaved</em> portions.\n             <p class=comparison><br /><a alt=Country comparison to the world href=../rankorder/2085rank.html title=Country comparison to the world ><strong>Country Comparison to the World</strong></a></p></div>\n<table id=fieldListing>\n                 <tbody>\n                     <tr class=fieldHeading valign=middle bgcolor=#F8f8e7 height=25><th width=200px>Country</th><th>ROADWAYS(KM)</th></tr>\n<tr id=af><td class=country><a href=../geos/af.html>Afghanistan</td><td class=fieldData>

## Demo 3

**BS4 example:**

In [12]:
! pip install beautifulsoup4



In [13]:
from bs4 import BeautifulSoup

In [14]:
soup = BeautifulSoup(stats.content, 'html.parser')

In [15]:
# Randomly selecting a chunk of the html content
soup.prettify()

u'<!DOCTYPE doctype html>\n<html>\n <head>\n  <script src="../js/jquery-1.8.3.min.js">\n  </script>\n  <link href="../styles/print.css" rel="stylesheet" type="text/css"/>\n </head>\n <body font-size="12px" width="700px">\n  <div class="text-holder">\n   <div class="fbTitleRankOrder">\n    FIELD LISTING ::\n    <strong>\n     ROADWAYS\n    </strong>\n   </div>\n   <div class="rankOrderDesc">\n    This entry gives the\n    <em>\n     total\n    </em>\n    length of the road network and includes the length of the\n    <em>\n     paved\n    </em>\n    and\n    <em>\n     unpaved\n    </em>\n    portions.\n    <p class="comparison">\n     <br/>\n     <a alt="Country" comparison="" href="../rankorder/2085rank.html" the="" title="Country" to="" world="">\n      <strong>\n       Country Comparison to the World\n      </strong>\n     </a>\n    </p>\n   </div>\n   <table id="fieldListing">\n    <tbody>\n     <tr bgcolor="#F8f8e7" class="fieldHeading" height="25" valign="middle">\n      <th width

To separate out all the relevant tags, we need to taste the soup!

The list of cast is in the tag `<table class="cast_list">`

In [16]:
table = soup.find("table", attrs={'id':'fieldListing'})

In [17]:
rows = []
for row in table.find_all('tr'):
    rows.append([val.text.encode('utf8') for val in row.find_all('td')])

In [18]:
rows[1]

['Afghanistan',
 '\ntotal: 42,150 km\npaved: 12,350 km\nunpaved: 29,800 km (2006)\n']

## Demo 4

**Saving scapped data into a CSV file**

* We'll now save the raw data to a csv file

In [19]:
# Import csv package
import csv

In [20]:
with open('../data/facts.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows((row for row in rows if row))

Below are the first 5 rows in the saved csv file.

In [21]:
! cat ../data/facts.csv | head -5

Afghanistan,"
total: 42,150 km
paved: 12,350 km
unpaved: 29,800 km (2006)
"


There's lot more complex operations you can do with Beautiful Soup, but it would be better to move to `scrapy` Python package to be more productive and hassle free web scraping.

## Demo 5
    
**Query 1: _We want to get the general information of the user_**

In [22]:
import requests

In [23]:
url = "https://api.github.com/users/shwedosh"
request = requests.get(url)

In [24]:
# If 200 then we are doing it right!
request.status_code

200

In [25]:
request.json()

{u'avatar_url': u'https://avatars0.githubusercontent.com/u/22659624?v=4',
 u'bio': None,
 u'blog': u'',
 u'company': None,
 u'created_at': u'2016-10-06T12:13:50Z',
 u'email': None,
 u'events_url': u'https://api.github.com/users/shwedosh/events{/privacy}',
 u'followers': 6,
 u'followers_url': u'https://api.github.com/users/shwedosh/followers',
 u'following': 2,
 u'following_url': u'https://api.github.com/users/shwedosh/following{/other_user}',
 u'gists_url': u'https://api.github.com/users/shwedosh/gists{/gist_id}',
 u'gravatar_id': u'',
 u'hireable': None,
 u'html_url': u'https://github.com/shwedosh',
 u'id': 22659624,
 u'location': None,
 u'login': u'shwedosh',
 u'name': u'Shweta Doshi',
 u'organizations_url': u'https://api.github.com/users/shwedosh/orgs',
 u'public_gists': 0,
 u'public_repos': 17,
 u'received_events_url': u'https://api.github.com/users/shwedosh/received_events',
 u'repos_url': u'https://api.github.com/users/shwedosh/repos',
 u'site_admin': False,
 u'starred_url': u'ht

**Query 2: _We want to get the number of public repositories the user has_**

In [26]:
url = "https://api.github.com/users/karpathy/repos"
request = requests.get(url)

In [27]:
request.status_code

200

In [28]:
repositories = request.json()

In [29]:
repositories

[{u'archive_url': u'https://api.github.com/repos/karpathy/arxiv-sanity-preserver/{archive_format}{/ref}',
  u'assignees_url': u'https://api.github.com/repos/karpathy/arxiv-sanity-preserver/assignees{/user}',
  u'blobs_url': u'https://api.github.com/repos/karpathy/arxiv-sanity-preserver/git/blobs{/sha}',
  u'branches_url': u'https://api.github.com/repos/karpathy/arxiv-sanity-preserver/branches{/branch}',
  u'clone_url': u'https://github.com/karpathy/arxiv-sanity-preserver.git',
  u'collaborators_url': u'https://api.github.com/repos/karpathy/arxiv-sanity-preserver/collaborators{/collaborator}',
  u'comments_url': u'https://api.github.com/repos/karpathy/arxiv-sanity-preserver/comments{/number}',
  u'commits_url': u'https://api.github.com/repos/karpathy/arxiv-sanity-preserver/commits{/sha}',
  u'compare_url': u'https://api.github.com/repos/karpathy/arxiv-sanity-preserver/compare/{base}...{head}',
  u'contents_url': u'https://api.github.com/repos/karpathy/arxiv-sanity-preserver/contents/{+p

## Demo 6

**Querying the Twitter REST API**
- We'll be using `tweepy` package to navigate through the streaming API.
- We'll need to use credentials for the Twitter App that we creating in the Pre-Reading section.

**Query 1: _We need to get all the tweets for a specific topic_**

In [30]:
! pip install tweepy

Collecting tweepy
  Downloading tweepy-3.5.0-py2.py3-none-any.whl
Collecting requests-oauthlib>=0.4.1 (from tweepy)
  Downloading requests_oauthlib-0.8.0-py2.py3-none-any.whl
Collecting oauthlib>=0.6.2 (from requests-oauthlib>=0.4.1->tweepy)
  Downloading oauthlib-2.0.3.tar.gz (127kB)
[K    100% |████████████████████████████████| 133kB 490kB/s ta 0:00:01
[?25hBuilding wheels for collected packages: oauthlib
  Running setup.py bdist_wheel for oauthlib ... [?25ldone
[?25h  Stored in directory: /home/gyanendra/.cache/pip/wheels/92/a1/de/e81416b06ac105b68881838c777a89b456e953f543db72464b
Successfully built oauthlib
Installing collected packages: oauthlib, requests-oauthlib, tweepy
Successfully installed oauthlib-2.0.3 requests-oauthlib-0.8.0 tweepy-3.5.0


In [31]:
#Import the necessary methods from tweepy library
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

At this point we need to get all the required tokens that the Twitter App generated for us when we created it.

In [32]:
access_token = "ENTER YOUR ACCESS TOKEN"
access_token_secret = "ENTER YOUR ACCESS TOKEN SECRET"
consumer_key = "ENTER YOUR API KEY"
consumer_secret = "ENTER YOUR API SECRET"

In [33]:
#This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):

    def on_data(self, data):
        print ("The data collected is \n {}".format(data))
        return True

    def on_error(self, status):
        print ("The status is {}".format(status))

In [34]:
#This handles Twitter authetification and the connection to Twitter Streaming API
l = StdOutListener()
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = tweepy.Stream(auth, l)

#This line filter Twitter Streams to capture data by the keywords: 'python', 'javascript', 'ruby'
stream.filter(track=['football'])

NameError: name 'tweepy' is not defined

(_Note: You need to interrupt the kernel if running in notebook, it would otherwise continue running._)

**Query 2: _We need to get the tweets from a specific user._**

In [None]:
# We already have authorized the Twitter App
api = tweepy.API(auth)

#initialize a list to hold all the tweepy Tweets
alltweets = []    

#make initial request for most recent tweets (200 is the maximum allowed count)
new_tweets = api.user_timeline(screen_name = screen_name,count=200)

#save most recent tweets
alltweets.append(new_tweets)

# We'll be able to download max 200 tweets, there a non-trivial way in which you could download the rest!

## Demo 7

**Parsing CSV responses**
- CSV responses. If the API responses are in the CSV Format we take the help `csv` package to navigate through it. 

In [None]:
# We'll consider the Quandl API for our purposes
import requests
url = "https://www.quandl.com/api/v3/datasets/WIKI/AAPL.csv"
request = requests.get(url)

In [None]:
request.status_code

In [None]:
datacsv = str(request.content)

In [None]:
rows = [val for val in datacsv.split('\\n')]

In [None]:
# Sample row printed from the response recieved
rows[1]

## Demo 8

**Parsing JSON responses**
    
* We'll consider the Github API to understand the JSON Response Parsing

In [None]:
url = "https://api.github.com/users/karpathy"
request = requests.get(url)

In [None]:
# Dumping the JSON Response in a variable
userDetails = request.json()

In [None]:
# Printing out all the user details
userDetails

## Demo 9

In [None]:
# Getting the Name and Location of the user
print ("The name of the repository owner is {}.".format(userDetails.get('name')))
print ("The location of the repository owner is {}.".format(userDetails.get('location')))

## Demo 10

**Worked out example Reading YAML files into Python (Code snippet below):**

In [None]:
! pip install pyyaml

In [None]:
import yaml

In [None]:
# For demonstration purposes we would import a sample YAML file
with open("./data/sample.yml", 'r') as stream:
    try:
        sample = (yaml.load(stream))
    except yaml.YAMLError as exc:
        print(exc)

In [None]:
sample

## Demo 11

In [None]:
class FlatDict:        
    def flatDict(self, dictObj=None):
        '''Flatten a given dict
        '''
        #print('Arg received: ', dictObj)
        for key, value in dictObj.items():
            #print('Now iterating through: ', {key:value})
            if isinstance(value, dict):
                #print('Value: ', value, ', Is value a dictionary? ', isinstance(value, dict))
                for key2, value2 in value.items():
                    self.flatDict({'_'.join([key, key2]) : value2})
            elif isinstance(value, list) and isinstance(value[0], str):
                value = ', '.join(value)
                #print('The pair to be updated: ', {key:value})
                self.flatteneddict.update({key:value})
            else:
                #print('The pair to be updated: ', {key:value})
                self.flatteneddict.update({key:value})
        
    
    def __init__(self, dictObj=None):
        self.flatteneddict = {}
        if not isinstance(dictObj, dict):
            raise ValueError('Expected a dictionary object as input!')
        self.flatDict(dictObj)
    
    
    def __repr__(self):
        return(str(self.flatteneddict))

In [None]:
FlatDict(sample)

In [None]:
import pandas as pd
pd.set_option("display.max_columns", 101)

class CricDF(FlatDict):
    
    def __init__(self,  dictObj=None):
        super().__init__(dictObj)
        self.info = dictObj["info"]
#         print(self.flatteneddict)
        self.ballsDF = pd.DataFrame()

    def get_ballsDF(self):  
        for idx, inningsObj in enumerate(self.flatteneddict["innings"]):# idx = 0, 1; inningsObj = {'ist ininnings': dict}
            inningsDict = list(inningsObj.values())[0]                  # inningsDict = {'team': val, 'deliveries': dict}
            for ball in inningsDict['deliveries']:                      # a dict
                self.flatteneddict = {}                                 # clear out details of last delivery
                self.flatteneddict.update({'innings': idx + 1})
                self.flatteneddict.update({'batting_team': inningsDict['team']})
                self.flatDict(self.info)
#                 print(self.flatteneddict)
                
#                 print('Ball: ', ball)
                
                for ball_no, ball_details in ball.items():
#                     print('ball_no: ', ball_no, 'ball_details: ', ball_details)
                    self.flatDict(ball_details)
                    idx_df = int(1000*(idx+1) + 10*ball_no)
                    newDF = pd.DataFrame(self.flatteneddict, index=[idx_df])
                    self.ballsDF = pd.concat([self.ballsDF, newDF])
                    
        cols = ['competition', 'gender', 'match_type', 'dates','city', 'umpires', 'venue', 'teams',
                'toss_winner', 'toss_decision', 'outcome_by_runs', 'outcome_winner', 'player_of_match', 
                'innings', 'batting_team', 'batsman', 'non_striker', 'bowler', 'overs', 
                'runs_batsman', 'runs_extras', 'extras_byes', 'extras_legbyes', 
                'extras_wides', 'runs_total', 'wicket_fielders', 'wicket_kind', 'wicket_player_out']
        
        self.ballsDF = self.ballsDF[cols]

- A class to read `Cricsheet` YAMLs and produce CSVs

## Demo 12

In [None]:
z = CricDF(file)
z.get_ballsDF()
z.ballsDF