# Dowloading multiple S275 files from state web-site

In [1]:
import urllib
import os
import urllib.request
import requests
from bs4 import BeautifulSoup

## Single file case

In [2]:
file_name ='C:/Users/jhernandez/Documents/S275/data_test.accdb'# change where this file gets dumped into 
url = 'http://www.k12.wa.us/safs/PUB/PER/1516/2015-2016_Final_S-275_Personnel_Database.accdb'

In [3]:
# Download the file from `url` and save it locally under `file_name`:
urllib.request.urlretrieve(url, file_name)

('C:/Users/jhernandez/Documents/S275/data_test.accdb',
 <http.client.HTTPMessage at 0x16b0d755c88>)

## Multiple file case using Beautiful Soup 

In [20]:
data_url = "http://www.k12.wa.us/safs/"
archive_url = "http://www.k12.wa.us/safs/db.asp"

### Create response object

In [21]:
r = requests.get(archive_url)

### create bs object

In [22]:
soup = BeautifulSoup(r.content,'html5lib')

#### Look at soup object same as "inspect element in chrome"

In [23]:
#print(soup)

<html xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:v="urn:schemas-microsoft-com:vml"><head>
<title>SAFS Access Databases and Excel Files</title>
<meta content="Superintendent of Public Instruction School Apportionment and Financial Services" name="title"/>
<meta content="This page provides links to the accounting manual published by School Financial Services for school districts in Washington State." name="description"/>
<meta content="State of Washington" name="originatorJurisdiction"/>
<meta content="Superintendent of Public Instructions" name="originatorDepartment"/>
<meta content="School Apportionment &amp; Financial Services" name="originatorOffice"/>
<meta content="current" name="timePeriodTextual"/>
<meta content="State government" name="govType"/>
<meta content="account manual, expenditure codes, revenue codes" name="keywords"/>
<meta content="Accounting Manual, School Financial Services" name="subjects"/>
<m

### Find all live links on web-site

In [24]:
links = soup.findAll('a')

#### only look at links that end with desired file type...in our case ".mdb" and ".accdb"...both access files

In [25]:
data_links = [data_url + link['href'] for link in links if link['href'].endswith(('.mdb', '.accdb'))]
print(data_links)
type(data_links)

['http://www.k12.wa.us/safs/INS/MIS/aF1951617.accdb', 'http://www.k12.wa.us/safs/INS/MIS/Item%20Number%20Multiple%20Year.mdb', 'http://www.k12.wa.us/safs/PUB/PER/1617/2016-2017_Preliminary_S-275_Personnel_Database.accdb', 'http://www.k12.wa.us/safs/PUB/PER/1617/2016-2017_Preliminary_S-275_Personnel_Database.accdb', 'http://www.k12.wa.us/safs/PUB/PER/1516/2015-2016_Final_S-275_Personnel_Database.accdb', 'http://www.k12.wa.us/safs/PUB/PER/1516/2015-2016_Final_S-275_Personnel_Database.accdb', 'http://www.k12.wa.us/safs/PUB/PER/1415/2014-2015S275FinalForPublic.accdb', 'http://www.k12.wa.us/safs/PUB/PER/1415/2014-2015S275FinalForPublic.accdb', 'http://www.k12.wa.us/safs/PUB/PER/1314/2013-2014S275FinalForPublic.mdb', 'http://www.k12.wa.us/safs/PUB/PER/1314/2013-2014S275FinalForPublic.mdb', 'http://www.k12.wa.us/safs/PUB/PER/1213/2012-2013S275FinalForPublic.mdb', 'http://www.k12.wa.us/safs/PUB/PER/1213/2012-2013S275FinalForPublic.mdb', 'http://www.k12.wa.us/safs/PUB/PER/1112/2011-2012S275Fina

list

## Looks like there are duplicate links:
### HTML table contains two link and its extracting all of them, the following is a quick fix..

In [None]:
l=()
i = set(data_links) 
for x in i:
    link = str(x)       
    l+=(x,)
print(l)

## Putting it all together by creating a function...

In [28]:
'''
URL of the OSPI web-page which provides link to
all the data files.
In this example, we first crawl the webpage to extract
all the links and then download the data.
'''
 
# specify the URL
data_url = "http://www.k12.wa.us/safs/"
archive_url = "http://www.k12.wa.us/safs/db.asp"

 
def get_data_links():
     
    # create response object
    r = requests.get(archive_url)
     
    # create beautiful-soup object
    soup = BeautifulSoup(r.content,'html5lib')
     
    # find all links on web-page
    links = soup.findAll('a')
 
    # filter the link sending with .mp4
    l_links = [data_url + link['href'] for link in links if link['href'].endswith(('.mdb', '.accdb'))]
    
    data_links=[]
    #remove the duplication...there must be a better way!
    i = set(l_links) 
    for x in i:
        link = str(x)       
        data_links+=(x,)
 
    return data_links
 

## Let's try to create a loop to do the job...

In [29]:
link = get_data_links()   
type(link)
get_data_links()

['http://www.k12.wa.us/safs/PUB/PER/1415/2014-2015S275FinalForPublic.accdb',
 'http://www.k12.wa.us/safs/PUB/PER/1112/2011-2012S275FinalForPublic.mdb',
 'http://www.k12.wa.us/safs/PUB/PER/0809/2008-2009S275FinalForPublic.mdb',
 'http://www.k12.wa.us/safs/PUB/PER/9596/1995-1996S275FinalForPublic.mdb',
 'http://www.k12.wa.us/safs/INS/MIS/Item%20Number%20Multiple%20Year.mdb',
 'http://www.k12.wa.us/safs/PUB/PER/1314/2013-2014S275FinalForPublic.mdb',
 'http://www.k12.wa.us/safs/PUB/PER/1213/2012-2013S275FinalForPublic.mdb',
 'http://www.k12.wa.us/safs/PUB/PER/0203/2002-2003S275FinalForPublic.mdb',
 'http://www.k12.wa.us/safs/PUB/PER/9899/1998-1999S275FinalForPublic.mdb',
 'http://www.k12.wa.us/safs/PUB/PER/0607/2006-2007S275FinalForPublic.mdb',
 'http://www.k12.wa.us/safs/PUB/PER/1516/2015-2016_Final_S-275_Personnel_Database.accdb',
 'http://www.k12.wa.us/safs/INS/MIS/aF1951617.accdb',
 'http://www.k12.wa.us/safs/PUB/PER/9900/1999-2000S275FinalForPublic.mdb',
 'http://www.k12.wa.us/safs/PU

In [30]:
#test = 'http://www.k12.wa.us/safs/PUB/PER/0405/2004-2005S275FinalForPublic.mdb'
#test.split('/')[-1]   
#'C:/Users/jhernandez/Documents/S275/'+test.split('/')[-1]
data_links = get_data_links()
type(data_links)

list

In [31]:
hd = 'C:/Users/jhernandez/Documents/S275/'
data_links = get_data_links()

for link in data_links:
 
        '''iterate through all links in data_links
        and download them one by one'''
         
        # 
        file_name = hd+link.split('/')[-1]   
 
        # 
        urllib.request.urlretrieve(link, file_name)