# Tools for Data Retrieval 

The goal is to present a few Python tools for retrieving remote files/data.
<BR>
We will cover:
<UL>
<LI> File Transfer Protocol (FTP)
<LI> WGET
<LI> JSON
</UL>

## FTP

<UL>
<LI> FTP (File Transfer Protocol) is a fast and convenient way to transfer files over the Internet. 
<LI> To make FTP work, you need a client (your machine) and a server (the machine to/from which you are putting/getting files).
</UL>

#### Basic ftp Session

In [None]:
import ftplib

# Connection settings; the later examples reuse these names.
ftp_server = "ftp.nluug.nl"
my_userid = "anonymous"
my_passwd = "ftplib-example-1"

# Minimal session: connect, log in, then disconnect straight away.
ftp_session = ftplib.FTP(host=ftp_server)
ftp_session.login(user=my_userid, passwd=my_passwd)
ftp_session.quit()

In [51]:
def open_ftp_session(ftp_server, my_userid, my_passwd):
    """
       Purpose: Connect to a ftp server and log in, handing the
                resulting session back to the caller.

       Arguments:
          - ftp_server: name of the ftp server (string)
          - my_userid:  user ID on the ftp server (string)
          - my_passwd:  user password on the ftp server (string)

       Returned Value:
          - The ftplib.FTP session object, after login
    """
    session = ftplib.FTP(host=ftp_server)
    session.login(user=my_userid, passwd=my_passwd)
    return session

# Open the session used by the examples that follow.
ftp_session = open_ftp_session(
    ftp_server=ftp_server, my_userid=my_userid, my_passwd=my_passwd
)

#### List Directories

In [52]:
def ftp_list_top_dirs(ftp_session):
    """
       Purpose: List the content of the current directory of an open
                ftp session (the top directory right after login).

       Argument:
          - ftp_session: ftp session object, already connected and
                         logged in

       Returned Value:
          - List of directories and files, one string per entry
           (similar to the Unix command 'ls -l')
    """
    data = []

    # Use the session that was passed in; opening a second connection
    # here would ignore the argument and never be closed.
    # dir() hands every listing line to the callback.
    ftp_session.dir(data.append)

    return data

In [None]:
# Show the listing, one entry per line.
data = ftp_list_top_dirs(ftp_session)
for line in data:
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print("- %s" % line)

#### Go to a Specific Directory

In [54]:
def ftp_dir_content(ftp_session, dir_name=None):
    """
       Purpose: List the content of a directory on a ftp server.
                If the directory is not provided, list the content
                of the session's current directory.

       Arguments:
          - ftp_session: ftp session object
          - dir_name:    name of the directory you want to access (string)

       Returned Value:
          - List of directories and files, one string per entry
           (similar to the Unix command 'ls -l')

       Note: when dir_name is given, the session remains in that
             directory after the call.
    """
    if dir_name is not None:
        # Change directory before listing
        ftp_session.cwd(dir_name)

    data = []

    # dir() hands every listing line to the callback
    ftp_session.dir(data.append)

    return data

In [None]:
# No directory given: list where the session currently is.
data = ftp_dir_content(ftp_session)
for line in data:
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print("- %s" % line)

In [None]:
# Move into 'pub' and list its content.
data = ftp_dir_content(ftp_session, dir_name='pub')
for line in data:
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print("- %s" % line)

#### Download a File

In [57]:
import sys
 
def ftp_get_file(ftp_session, file_name):
    """
         Purpose: Get a file from a ftp server and save it locally
                  under the same name.

         Arguments:
              - ftp_session: ftp session object
              - file_name: name of the file you want to download

         On an FTP or I/O error a message is printed and the
         incomplete local file is removed; nothing is raised.
    """
    import ftplib
    import os

    local_file_created = False
    try:
        # The with-block closes the local file even if the transfer fails.
        with open(file_name, 'wb') as local_file:
            local_file_created = True
            ftp_session.retrbinary("RETR " + file_name, local_file.write)
    except ftplib.all_errors:
        print("Error - Cannot obtain file: " + file_name)
        # Do not leave an empty or truncated file that looks like a
        # successful download.
        if local_file_created:
            os.remove(file_name)

In [58]:
# Remote directory and the file to fetch from it.
dir_name = '/pub/'
file_name = 'README.nluug'

# Move to the directory first: the file is requested by its bare name.
ftp_session.cwd(dir_name)
ftp_get_file(ftp_session=ftp_session, file_name=file_name)

#### Uploading a File

In [None]:
import os
 
def ftp_put_file(ftp_session, file_name):
    """
         Purpose: Put a file to a ftp server

         Arguments:
              - ftp_session: ftp session object
              - file_name: name of the file you want to upload

         Files ending in .txt, .htm or .html are sent in line (text)
         mode; every other file is sent in binary mode in 1024-byte
         blocks.
    """
    text_extensions = (".txt", ".htm", ".html")
    file_ext = os.path.splitext(file_name)[1]
    command = "STOR " + file_name

    # Open in binary mode for both transfer modes: storlines() reads
    # the lines as bytes on Python 3, and "rb" also works on Python 2.
    # The with-block closes the file once the upload ends.
    with open(file_name, "rb") as local_file:
        if file_ext in text_extensions:
            ftp_session.storlines(command, local_file)
        else:
            ftp_session.storbinary(command, local_file, 1024)

In [None]:
# Upload README.nluug (read from the local working directory).
ftp_put_file(ftp_session=ftp_session, file_name="README.nluug")

In [None]:
# All transfers are done: end the ftp session.
ftp_session.quit()

## wget

<UL>
<LI> Command line utility for downloading files from the Internet.
<LI> It supports:
    <OL> 
    <LI> Downloading multiple files
    <LI> Downloading in the background 
    <LI> Resuming downloads
    <LI> Limiting the bandwidth used for downloads and viewing headers.
    </OL>
</UL>

In [None]:
import urllib
# Remote archive to fetch, and the opener object that performs the transfer.
mylink = 'ftp://ftp.unidata.ucar.edu/pub/netcdf/netcdf-4.4.1.1.tar.gz'
testfile = urllib.URLopener()

# Save the download in the working directory under the archive's own name.
testfile.retrieve(url=mylink, filename='netcdf-4.4.1.1.tar.gz')

In [None]:
def wget_python(url_name, loc_file_name):
    """
         Purpose: Implementation of wget: download the file a url
                  points to and save it locally.

         Arguments:
              - url_name: url pointing to the remote file name
              - loc_file_name: local file name
    """
    try:
        # Python 2: URLopener lives directly in urllib.
        from urllib import URLopener
    except ImportError:
        # Python 3: urllib is a package; use urllib.request instead.
        from urllib.request import urlretrieve
        urlretrieve(url_name, loc_file_name)
    else:
        URLopener().retrieve(url_name, loc_file_name)

## JSON

<UL>
<LI> JSON (JavaScript Object Notation) is a text based format for computers
to exchange data.
<LI> It is built on two structures:
   <OL>
	<LI> A collection of name/value pairs
	<LI> An ordered list of values.
   </OL>
<LI> JSON takes these forms:
     <OL>
     <LI> <B>Objects</B>
          <UL>
          <LI> Unordered set of name/value pairs. 
	      <LI> Begins with { and ends with }. 
	      <LI> Each name is followed by : (colon) 
	      <LI> The name/value pairs are separated by , (comma).
          </UL>
     <LI> <B>Array</B>
          <UL>
          <LI> Ordered collection of values. 
	      <LI> Begins with [ and ends with ]. 
	      <LI> Values are separated by , (comma).
          </UL>
     <LI> <B>Value</B>
          <UL>
          <LI> A string in double quotes, number, or true or false or null,
	           or an object or an array.
          </UL>
     <LI> <B>String</B> 
          <UL>
          <LI> A sequence of zero or more Unicode characters, wrapped in double
	           quotes, using backslash escapes.
          </UL>
     <LI> <B>Number</B>
          <UL>
          <LI> Integer, Long, Float.
          </UL>
     </OL>
</UL>
<P>
<P>
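An example of JSON data:
<PRE>
{"acronym": "BLD", "name": "Boulder Colorado", "latitude": 40.00, "longitude": -105.25}
</PRE>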

The json module enables you to convert between JSON and Python Objects.

#### Convert JSON to Python Object (Dict)

In [None]:
import json
# A JSON object, received as text, is decoded into a Python dict.
json_data = '{"acronym": "BLD", "name": "Boulder Colorado", "latitude": 40.00, "longitude": -105.25}'
python_obj = json.loads(json_data)

# Single-argument print(...) gives the same output on Python 2 and 3.
print(python_obj["name"])
print(python_obj["acronym"])
print(python_obj["latitude"])
print(python_obj["longitude"])

#### Convert JSON to Python Object (List)

In [None]:
import json

# The JSON array under "drinks" is decoded into a Python list.
array = '{"drinks": ["coffee", "tea", "water"]}'
data = json.loads(array)

for element in data["drinks"]:
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(element)

#### Convert JSON to Python Object 

In [None]:
# One JSON object holding a list of two station records; adjacent string
# literals are joined by the parser into a single string.
json_input = (
    '{"stations": ['
    '{"acronym": "BLD", "name": "Boulder Colorado", '
    '"latitude": 40.00, "longitude": -105.25}, '
    '{"acronym": "BHD", "name": "Baring Head Wellington New Zealand", '
    '"latitude": -41.28, "longitude": 174.87}'
    ']}'
)

In [None]:
decoded = json.loads(json_input)

# Name of every station in the decoded list.
for x in decoded['stations']:
    print(x["name"])

# Re-encode the whole structure with sorted keys and 4-space indentation.
# Single-argument print(...) gives the same output on Python 2 and 3.
print(json.dumps(decoded, sort_keys=True, indent=4))

#### Convert Python Object (Dict) to JSON

In [None]:
# Build the dict, then encode it as a JSON string.
# Keys use plain ASCII double quotes on both sides; a typographic
# closing quote leaves the string literal unterminated.
d = {
    "name": "Boulder Colorado",
    "acronym": "BLD",
    "latitude": 40.00,
    "longitude": -105.25,
}
print(json.dumps(d, ensure_ascii=False))

## Web Scraping with json

In [16]:
import json
from contextlib import closing

try:
    import urllib2                      # Python 2
except ImportError:
    import urllib.request as urllib2    # Python 3: urlopen() lives here

url = "https://www.govtrack.us/data/congress/113/votes/2013/s11/data.json"

# closing() releases the connection as soon as the body has been read.
with closing(urllib2.urlopen(url)) as response:
    page = response.read()

# read() returns bytes; decode explicitly so json.loads() accepts the
# payload on every Python version (assumes UTF-8, the JSON default).
data = json.loads(page.decode("utf-8"))

In [None]:
# print the keys (iterating a dict yields its keys)
for x in data:
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(x)

In [None]:
# print the entire content, keys sorted, indented by 4 spaces
# (single-argument print(...) gives the same output on Python 2 and 3)
print(json.dumps(data, sort_keys=True, indent=4))

In [None]:
# List the Yea votes
#-------------------
y_dem = 0
y_rep = 0
for x in data["votes"]["Yea"]:
    # Strip once so the party that is printed and the party that is
    # counted are always the same value.
    party = x["party"].strip()
    print("%s %s ( %s )" % (x["first_name"], x["last_name"], party))
    if party == "D":
        y_dem += 1
    elif party == "R":
        y_rep += 1

# print("") emits the blank line on Python 2 and 3 alike; a bare
# `print` is a no-op expression on Python 3.
print("")
print(44*'-')
print('The number of Democrats   who votes Yea:  %d' % y_dem)
print('The number of Republicans who votes Yea:  %d' % y_rep)
print(44*'-')

#### Exercise

Write a script that provides:
<OL>
  <LI> The total number of Yea votes
  <LI> The total number of Nay votes
  <LI> The list (in alphabetical order) of all who voted and their party affiliation (last_name, first_name (party))
</OL>