# In the grand scheme of notebooks, this one comes first.
## Purpose of this notebook
1. Take the data.sa.gov.au csv file
1. Download the SET of distinct URLs from TAFE SA site
1. Download the SET of distinct URLs from training.gov.au
1. Parse saved training.gov.au HTML files for ASCED narrow FOE data and attach to each course
1. Generate a JSON file containing all required information

# Following this notebook, go to TAFE Files and Course Types and parse TAFE HTML files

# 1. Import libraries, define input filename, file directory, training.gov.au base url, output directories for HTML files downloaded from training.gov.au and TAFE SA websites

In [176]:
from bs4 import BeautifulSoup
from copy import deepcopy
from csv import reader, writer
from pprint import pprint

import json
import os
import re
import requests


# Resolve the working directory to an absolute path BEFORE changing into it.
# Every path below is derived from wd; if wd stayed relative ('./tft'), those
# paths would point at './tft/tft/...' once os.chdir(wd) has run.
# NOTE: re-running this cell after the chdir resolves './tft' against the new
# working directory, so restart the kernel (or chdir back) before re-running.
wd = os.path.abspath('./tft')
CSV = os.path.join(wd, 'tafe-sa-qualifications---semester-2-2016.csv')
CSV_OUTPUT_FILE = 'new_tafe.csv'
JSON_OUTPUT_FILE = 'tafe-sa-qualifications1.json'
TRAINING_BASE_URL = 'http://training.gov.au/Training/Details/'
TAFE_BASE_URL = 'http://www.tafesa.edu.au/xml/course/aw/'
# Directories holding the HTML pages saved from TAFE SA (tf) and training.gov.au (tr)
TF_HTML_DIR = os.path.join(wd, "tf_html/")
TR_HTML_DIR = os.path.join(wd, "tr_html/")

os.chdir(wd)

# 2. Define helper functions

In [164]:
def flatmap(array):
    """
    Given a LIST of iterables (e.g. a two-dimensional list)
    Returns a single LIST holding every inner item, in the original order
    """
    flattened = []
    for inner in array:
        flattened.extend(inner)
    return flattened

def frequency(array):
    """
    Given a LIST of dictionaries that each have a 'url' key
    Returns a DICTIONARY mapping each distinct 'url' value to the number
    of times it occurs in the list
    """
    tally = {}
    for item in array:
        url = item['url']
        tally[url] = tally.get(url, 0) + 1
    return tally

def index_of_dict(obj, index):
    """
    Given a DICTIONARY and an integer index,
    Returns a single-entry DICTIONARY holding the key found at that index
    (after sorting the dictionary's keys) together with its value.
    """
    ordered_keys = list(obj)
    ordered_keys.sort()
    chosen = ordered_keys[index]
    return {chosen: obj[chosen]}

def dict_to_list(dictionary):
    """
    Given a DICTIONARY
    Returns a LIST of single-entry dictionaries, one per key, in the
    dictionary's iteration order
    """
    return [{key: value} for key, value in dictionary.items()]

# 3. Define functions to parse data.sa.gov.au CSV file

In [109]:
def csv_to_dict(csv_file):
    """
    Given a file path to a CSV file whose first row is a header row,
    Returns a LIST of dictionaries, one per data row, keyed by the header
    names. Each value is stripped of surrounding whitespace and each
    dictionary also gets an '_id' key holding its 1-based row number.

    The header row is printed as a quick sanity check.
    An empty file yields an empty list. A data row with fewer fields than
    the header raises IndexError.
    """
    courses = []

    # newline='' is what the csv module asks for: it lets the reader do its
    # own newline handling so quoted fields containing line breaks survive.
    with open(csv_file, 'r', newline='') as f:
        rows = reader(f)
        headers = next(rows, None)
        if headers is None:
            # Completely empty file: no header, therefore no courses.
            return courses
        print(headers)

        for _id, row in enumerate(rows, start=1):
            course = {'_id': _id}
            for index, key in enumerate(headers):
                course[key] = row[index].strip()
            courses.append(course)

    return courses
        
    
def add_training_url_to_courses(courses, base_url):
    """
    Given a LIST of course dictionaries and a base url,
    Adds a 'training_url' key (base url + the course's 'National Code')
    to every course in place and Returns the same list
    """
    for entry in courses:
        entry.update(training_url=base_url + entry['National Code'])
    return courses
       

def add_tafe_filename_to_courses(courses, base_url):
    """
    Given a LIST of course dictionaries and a base url,
    Adds a 'tafe_filename' key to every course in place: whatever follows
    the first len(base_url) characters of the course's 'URL', plus '.html'.
    Returns the same list
    """
    prefix_length = len(base_url)
    for entry in courses:
        page_code = entry['URL'][prefix_length:]
        entry['tafe_filename'] = page_code + '.html'
    return courses

    
def parse_tafe_csv_to_dict(csv_file, training_base_url, tafe_base_url):
    """
    Given a CSV file 'csv_file' and the two base urls,
    Calls csv_to_dict THEN add_training_url_to_courses THEN add_tafe_filename_to_courses
    Returns the resulting list of course dictionaries
    """
    # Both add_* helpers annotate the list in place and hand it back,
    # so the calls can simply be chained.
    return add_tafe_filename_to_courses(
        add_training_url_to_courses(csv_to_dict(csv_file), training_base_url),
        tafe_base_url,
    )


# 4. Define functions that build the lists of distinct URLs to download

In [151]:
def get_list_of_set_of_training_urls(courses, base_url):
    """
    Given a list of courses and a base url,
    finds the distinct course['National Code'] values and
    Returns a list of dictionaries with keys 'National Code' and 'training_url'
    (base url + code), sorted by National Code.
    """
    # Sorting makes the order reproducible between runs (plain set iteration
    # order is not) and matches get_list_of_set_of_tafe_urls.
    unique_codes = sorted({course['National Code'] for course in courses})

    return [{
            'National Code': code,
            'training_url': base_url + code
        } for code in unique_codes]


def get_list_of_set_of_tafe_urls(courses, base_url):
    """
    Given a list of courses and a base url,
    Returns a list of dictionaries with keys 'url' and 'tafe_filename', one per
    distinct course['tafe_filename'], sorted by filename. The url is the base
    url + the filename with its last five characters ('.html') replaced by '.aspx'.
    """
    unique_filenames = {course['tafe_filename'] for course in courses}

    url_records = []
    for filename in sorted(unique_filenames):
        stem = filename[:-5]
        url_records.append({
            'url': base_url + stem + '.aspx',
            'tafe_filename': filename,
        })
    return url_records


def inverse_map_url_to_course(courses):
    """
    Given a LIST of courses (each with 'tafe_filename' and '_id' keys)
    Returns a DICTIONARY with unique tafe_codes (the filename minus its
    '.html' suffix) as keys and a sorted LIST of distinct _ids as values

    Raises KeyError naming the missing key and the offending course when a
    course lacks 'tafe_filename' or '_id'.
    """
    ids_by_code = {}
    for course in courses:
        try:
            filename = course['tafe_filename']
            _id = course['_id']
        except KeyError as missing:
            # Fail loudly with context instead of printing and carrying on.
            raise KeyError(
                "{} key not present in course: {!r}".format(missing, course)
            ) from missing

        # A set keeps the _ids unique without scanning a list every time.
        ids_by_code.setdefault(filename[:-5], set()).add(_id)

    return {code: sorted(ids) for code, ids in ids_by_code.items()}

# 5. Generate list of courses and lists of URLs

In [111]:
# Load the data.sa.gov.au CSV into a list of course dictionaries, each
# annotated with its 'training_url' and 'tafe_filename'.
tafe_csv_courses = parse_tafe_csv_to_dict(CSV, TRAINING_BASE_URL, TAFE_BASE_URL)

# De-duplicated download targets: one entry per distinct TAFE SA page (tf)
# and one per distinct National Code on training.gov.au (tr).
tf_urls = get_list_of_set_of_tafe_urls(tafe_csv_courses, TAFE_BASE_URL)
tr_urls = get_list_of_set_of_training_urls(tafe_csv_courses, TRAINING_BASE_URL)

['Course Name', 'National Code', 'Campus', 'Semester', 'Duration Number', 'Duration Scale', 'Study Mode', 'Study Place', 'URL']


# 6. Define functions that download HTML files from training.gov.au and TAFE SA

In [165]:
def not_in_directory(filename, directory):
    """
    Given a filename and a directory path,
    Returns True when no entry with exactly that name exists in the
    directory, False otherwise.

    The directory is listed directly; the process working directory is
    never changed, so a failure cannot leave the caller somewhere else.
    """
    return filename not in os.listdir(directory)
        

def request_generator_comprehension_for_training_urls(urls, directory, timeout=30):
    """
    Takes a list of dictionaries with keys: 'training_url' and 'National Code',
    the directory where pages are saved, and a request timeout in seconds.
    Returns a generator that, for every url whose '<National Code>.html' is
    not already in the directory, downloads the page when iterated and yields
    a dictionary with keys: 'training_url', 'filename' and 'body'.

    Nothing is downloaded until the generator is consumed.
    """
    # The timeout stops one unresponsive server from hanging the whole run;
    # requests waits indefinitely when none is given.
    return ({
        'training_url': url['training_url'],
        'filename': url['National Code'] + '.html',
        'body': requests.get(url['training_url'], stream=True, timeout=timeout).text.strip()
        } for url in urls if not_in_directory(url['National Code'] + '.html', directory))


def get_list_of_html_files_in_current_directory():
    """
    Returns a LIST of the names in the current working directory that end
    with '.html'
    """
    html_files = []
    for name in os.listdir(os.getcwd()):
        if name.endswith('.html'):
            html_files.append(name)
    return html_files


def download_tf_htmls(iterator_of_requests, directory):
    """
    Given an iterator of dictionaries with keys 'filename' and 'body'
    and a directory path,
    Writes each body to <directory>/<filename> unless an .html file of that
    name was already present when the function started, printing progress
    and a final summary. Returns Nothing.

    The working directory is switched to 'directory' for the duration of the
    call and is always restored, even when a download or write fails.
    NOTE(review): the request generators call not_in_directory(directory)
    lazily, i.e. while the working directory is already 'directory' -- this
    presumably requires 'directory' to be an absolute path; confirm.
    """
    previous_directory = os.getcwd()
    os.chdir(directory)
    print("Changing directories from " + previous_directory + " to " + directory)

    try:
        # Snapshot taken once, up front; a set gives cheap membership tests.
        already_saved = set(get_list_of_html_files_in_current_directory())
        starting_number_of_html_files_in_directory = len(already_saved)

        for i, r in enumerate(iterator_of_requests, start=1):
            filename = r['filename']
            if filename in already_saved:
                continue

            with open(filename, 'w') as f:
                f.write(r['body'])
            print("(" + str(i) + ")", "Written", filename, "to disk")

        final_number_of_html_files_in_directory = len(get_list_of_html_files_in_current_directory())
        number_of_files_saved = final_number_of_html_files_in_directory - starting_number_of_html_files_in_directory
        print(str(number_of_files_saved) + " files saved. Number of files in directory " + str(final_number_of_html_files_in_directory))
    finally:
        print("Returning to " + previous_directory)
        os.chdir(previous_directory)

            
def request_generator_comprehension_for_tafe_pages(urls, directory, timeout=30):
    """
    Takes a list of dictionaries with keys: 'url' and 'tafe_filename',
    the directory where pages are saved, and a request timeout in seconds.
    Returns a generator that, for every url whose 'tafe_filename' is not
    already in the directory, downloads the page when iterated and yields
    a dictionary with keys: 'tafe_url', 'filename' and 'body'.

    Nothing is downloaded until the generator is consumed.
    """
    # The timeout stops one unresponsive server from hanging the whole run;
    # requests waits indefinitely when none is given.
    return ({
        'tafe_url': url['url'],
        'filename': url['tafe_filename'],
        'body': requests.get(url['url'], stream=True, timeout=timeout).text.strip()
        } for url in urls if not_in_directory(url['tafe_filename'], directory))


## Download training.gov.au pages

In [79]:
# Lazily fetch every training.gov.au page that is not yet saved in TR_HTML_DIR
# and write it there (download_tf_htmls is used for both sites despite its name).
rs = request_generator_comprehension_for_training_urls(tr_urls, TR_HTML_DIR)
download_tf_htmls(rs, TR_HTML_DIR)

Changing directories from /Users/emilkloeden/Documents/Programming/Python/iPython Notebooks/tft/tr_html to /Users/emilkloeden/Documents/Programming/Python/iPython Notebooks/tft/tr_html/
0 files saved. Number of files in directory 393
Returning to /Users/emilkloeden/Documents/Programming/Python/iPython Notebooks/tft/tr_html


## Download TAFE SA pages

In [82]:
# Build the download list from the CSV-derived courses: they are the ones that
# carry 'tafe_filename'. (The name tafe_courses is only assigned further down,
# to the parsed HTML pages, which have no 'tafe_filename' key.)
tf_urls = get_list_of_set_of_tafe_urls(tafe_csv_courses, TAFE_BASE_URL)

# Lazily fetch every TAFE SA page that is not yet saved in TF_HTML_DIR and write it there.
rs = request_generator_comprehension_for_tafe_pages(tf_urls, TF_HTML_DIR)
download_tf_htmls(rs, TF_HTML_DIR)

{'Course Name': 'Advanced Diploma of Accounting', 'Study Place': 'On & Off Campus', 'Duration Scale': 'months', 'URL': 'http://www.tafesa.edu.au/xml/course/aw/aw_TP00726', 'Duration Number': '18', 'Semester': 'Semester 2, 2016', 'National Code': 'FNS60215', 'tafe_filename': 'aw_TP00726.html', 'Campus': 'Adelaide City', '_id': 1, 'Study Mode': 'Full Time & Part Time', 'training_url': 'http://training.gov.au/Training/Details/FNS60215'}
{'url': 'http://www.tafesa.edu.au/xml/course/aw/aw_AC00004.aspx', 'tafe_filename': 'aw_AC00004.html'}
407


# 7. Parse the saved training.gov.au files to get the ASCED code for each National Code

In [97]:
def parse_tr_html(filename):
    """
    Given the name of a saved training.gov.au page (a '.html' file),
    Returns a dictionary with two keys:
    * 'course': 'National Code' (the filename minus '.html') plus, where the
      page lists them, 'ANZSCO', 'ASCED' and 'level', each a dictionary with
      'code' and 'description'
    * 'errors': a list with one entry when the classifications table could
      not be located, otherwise empty
    """
    # Label prefix found in a table cell -> key it is stored under
    label_to_key = (
        ('ANZSCO', 'ANZSCO'),
        ('ASCED', 'ASCED'),
        ('Qualification/Course', 'level'),
    )

    result = {'National Code': filename[:-5]}  # drop the '.html'
    problems = []

    with open(filename, 'r') as handle:
        soup = BeautifulSoup(handle.read(), "html.parser")

    try:
        cells = (
            soup.find('div', id='tableClassifications')
            .find('table')
            .find('tbody')
            .find_all('td')
        )
    except AttributeError:
        # One of the lookups returned None: leave the cell list empty so
        # the loop below has nothing to do.
        cells = []
        print(filename, "threw an Attribute error. Check it manually.")
        problems.append({'filename': filename, 'error': 'AttributeError'})

    for position, cell in enumerate(cells):
        label = cell.text.strip()
        for prefix, key in label_to_key:
            if label.startswith(prefix):
                # The code and description sit in the two cells after the label.
                result[key] = {
                    'code': cells[position + 1].text.strip(),
                    'description': cells[position + 2].text.strip(),
                }
                break

    return {'course': result, 'errors': problems}
    

def parse_list_of_tr_html(directory):
    """
    Given a directory of saved training.gov.au html files,
    Calls parse_tr_html on every '.html' file in it (in sorted order) and
    Returns a dictionary with keys:
    * 'courses': the 'course' dictionary of every parsed file
    * 'errors': the 'errors' list of every parsed file (a list of lists)
    * 'what': names of files for which parse_tr_html returned nothing

    The working directory is switched to 'directory' while parsing and is
    always restored afterwards.
    """
    previous_directory = os.getcwd()
    os.chdir(directory)
    output = []
    what = []

    try:
        for f in sorted(os.listdir()):
            # Skip anything that is not a saved page (e.g. hidden OS files);
            # parse_tr_html assumes a '.html' name.
            if not f.endswith('.html'):
                continue

            file_output = parse_tr_html(f)
            if file_output:
                output.append(file_output)
            else:
                what.append(f)
    finally:
        # Go back where we came from, so later relative paths keep working.
        os.chdir(previous_directory)

    courses = [o['course'] for o in output]
    errors = [o['errors'] for o in output]

    return {'courses': courses, 'errors': errors, 'what': what}

## Run parsing functions on training.gov.au files

In [98]:
# Because i don't know how to do destructuring in python
tr_output = parse_list_of_tr_html(TR_HTML_DIR)

training_courses = tr_output['courses']
training_errors = flatmap(tr_output['errors']) #[subitem for item in tr_output['errors'] for subitem in item]
training_what = tr_output['what']

print(len(training_courses))
print(training_errors)
print(training_what)

10251NAT.html threw an Attribute error. Check it manually.
10252NAT.html threw an Attribute error. Check it manually.
10253NAT.html threw an Attribute error. Check it manually.
10254NAT.html threw an Attribute error. Check it manually.
10255NAT.html threw an Attribute error. Check it manually.
10257NAT.html threw an Attribute error. Check it manually.
10260NAT.html threw an Attribute error. Check it manually.
10262NAT.html threw an Attribute error. Check it manually.
10263NAT.html threw an Attribute error. Check it manually.
10266NAT.html threw an Attribute error. Check it manually.
10267NAT.html threw an Attribute error. Check it manually.
10268NAT.html threw an Attribute error. Check it manually.
10269NAT.html threw an Attribute error. Check it manually.
10270NAT.html threw an Attribute error. Check it manually.
40649SA.html threw an Attribute error. Check it manually.
40650SA.html threw an Attribute error. Check it manually.
CRS1400029.html threw an Attribute error. Check it manuall

# 8. Define functions that will parse saved .html files from TAFE SA website

In [99]:
def get_element_by_string(soup, element_type, string):
    """
    Given a soup, an HTML element type 'element_type' and a pattern 'string'
    Returns whatever soup.find gives for the first element of that type
    whose string matches the regular expression compiled from 'string'
    """
    pattern = re.compile(string)
    return soup.find(element_type, string=pattern)


def get_course_types_no_try(soup):
    """
    Given a soup,
    Returns a LIST with the stripped text of every li inside the element
    whose id is "course-type".
    Raises AttributeError when that element is missing.
    """
    container = soup.find(id="course-type")
    course_types = []
    for item in container.find_all("li"):
        course_types.append(item.text.strip())
    return course_types


def get_admission_requirements_no_try(soup):
    """
    Given a soup,
    Returns a LIST of Course Admission Requirements: the stripped text of
    every li found in the node that directly follows the first h3 inside
    the first div with class 'course_table_box'.
    Raises AttributeError when any of those elements is missing.
    """
    # first h3 of the very specific div
    heading = soup.find('div', class_='course_table_box').find('h3')

    requirements = []
    for item in heading.next_sibling.find_all('li'):
        requirements.append(item.text.strip())
    return requirements


def get_TAFE_SA_code_no_try(soup):
    """
    Given a soup,
    Returns a STRING: the stripped text of the node following the th that
    matches "TAFE SA Code" in the "course_summary" table.
    Raises AttributeError when the table or the th is missing.
    """
    summary = soup.find("table", "course_summary")
    label_cell = summary.find("th", string=re.compile("TAFE SA Code"))
    return label_cell.next_sibling.text.strip()


def get_course_notes_no_try(soup):
    """
    Given a soup,
    Returns a LIST of course notes: the stripped text of every li found in
    the node that directly follows the h2 matching "Course Notes".
    Raises AttributeError when the heading is missing or the following node
    cannot be searched.
    """
    heading = get_element_by_string(soup, "h2", "Course Notes")
    return [item.text.strip() for item in heading.next_sibling.find_all('li')]
    

def parse_file(filename):
    """
    Given a filename and assuming the file exists in the current directory,
    Returns a dictionary containing 'filename' plus each of the following
    keys whose extraction succeeded:
    * TAFE_SA_code
    * course_notes
    * course_types
    * admission_requirements

    A failed extraction (AttributeError) is reported with a printed message
    and its key is left out.
    """
    # (key to store, extractor to call, message printed when it fails)
    extraction_steps = (
        ('TAFE_SA_code', get_TAFE_SA_code_no_try,
         ": unable to find TAFE SA Code. Investigate manually."),
        ('course_notes', get_course_notes_no_try,
         ": unable to find course notes. Investigate manually."),
        ('course_types', get_course_types_no_try,
         ": unable to find course type. Investigate manually."),
        ('admission_requirements', get_admission_requirements_no_try,
         ": unable to find Course Admission Requirements. Investigate manually."),
    )

    with open(filename, 'r') as handle:
        soup = BeautifulSoup(handle.read(), "html.parser")

    course = {'filename': filename}
    for key, extract, complaint in extraction_steps:
        try:
            course[key] = extract(soup)
        except AttributeError:
            print(filename, complaint)

    return course


def parse_files(files):
    """
    Given a list of filenames
    Returns a LIST of course objects created by calling parse_file on each
    filename ending in '.html'; other names are skipped. Progress is printed
    using each file's 1-based position in the full list.
    """
    total = len(files)
    parsed = []

    for position, name in enumerate(files, start=1):
        if not name.endswith('.html'):  # Only open html files
            continue
        print("Parsing " + name + " (" + str(position) + " of " + str(total) + " files).")
        parsed.append(parse_file(name))

    return parsed


def parse_directory(directory):
    """
    (Wrapper over parse_files)
    Given a directory path,
    Returns a LIST of course objects created by calling parse_file on each
    file with '.html' extension in that directory, in sorted filename order.

    The working directory is switched to 'directory' while parsing and is
    always restored, even when parsing raises.
    """
    previous_directory = os.getcwd()
    os.chdir(directory)
    try:
        # Sorted so the progress output and result order are reproducible.
        return parse_files(sorted(os.listdir(os.getcwd())))
    finally:
        os.chdir(previous_directory)

# 9. Parse the saved TAFE SA HTML files into an in-memory list of dictionaries

In [175]:
# Parse every saved TAFE SA page in TF_HTML_DIR into one dictionary per file.
tafe_courses = parse_directory(TF_HTML_DIR)
# Map each TAFE code (filename minus '.html') to the sorted CSV row _ids that use it.
tafe_filenames_and_their_courses = inverse_map_url_to_course(tafe_csv_courses)

Parsing aw_AC00004.html (0 of 407 files).
Parsing aw_AC00024.html (1 of 407 files).
Parsing aw_AC00034.html (2 of 407 files).
aw_AC00034.html : unable to find course notes. Investigate manually.
Parsing aw_AC00035.html (3 of 407 files).
aw_AC00035.html : unable to find course notes. Investigate manually.
Parsing aw_AC00040.html (4 of 407 files).
Parsing aw_AC00041.html (5 of 407 files).
Parsing aw_AC00042.html (6 of 407 files).
Parsing aw_AC00043.html (7 of 407 files).
Parsing aw_AC00045.html (8 of 407 files).
Parsing aw_AC00057.html (9 of 407 files).
Parsing aw_AC00058.html (10 of 407 files).
Parsing aw_AC00059.html (11 of 407 files).
Parsing aw_AC00061.html (12 of 407 files).
Parsing aw_AC00062.html (13 of 407 files).
aw_AC00062.html : unable to find course notes. Investigate manually.
Parsing aw_AC00063.html (14 of 407 files).
aw_AC00063.html : unable to find course notes. Investigate manually.
Parsing aw_AC00064.html (15 of 407 files).
aw_AC00064.html : unable to find course notes.

# 10. Merge all three datasets (CSV, training.gov.au, and TAFE SA webpages)

In [186]:
def merge_three_datasets(tafe_csv_courses, training_courses, tafe_courses):
    """
    Given the CSV courses, the parsed training.gov.au courses and the parsed
    TAFE SA pages,
    Returns a LIST with one merged dictionary for every CSV course that has
    both a training course with the same 'National Code' and a TAFE page whose
    'filename' equals the CSV course's 'tafe_filename'.

    On key clashes the CSV value wins, then the training.gov.au value, then
    the TAFE page value. CSV courses without both matches are left out.
    Results follow the order of tafe_csv_courses.
    """
    # Index both lookup datasets once instead of rescanning them per CSV row.
    training_by_code = {}
    for training_course in training_courses:
        training_by_code.setdefault(training_course['National Code'], []).append(training_course)

    tafe_by_filename = {}
    for tafe_course in tafe_courses:
        tafe_by_filename.setdefault(tafe_course['filename'], []).append(tafe_course)

    merged_courses = []
    for tafe_csv_course in tafe_csv_courses:
        matching_training = training_by_code.get(tafe_csv_course['National Code'])
        if not matching_training:
            continue
        matching_tafe = tafe_by_filename.get(tafe_csv_course['tafe_filename'], ())

        for training_course in matching_training:
            for tafe_course in matching_tafe:
                merged_courses.append({**tafe_course, **training_course, **tafe_csv_course})
    return merged_courses
                    
# Join each CSV row with its training.gov.au data and its TAFE SA page data.
merged_courses = merge_three_datasets(tafe_csv_courses, training_courses, tafe_courses)

In [188]:
# Number of merged course records (shown as the cell's output).
len(merged_courses)

1329

# 11. Output merged data to JSON file

In [189]:
def output_to_json_file(obj, json_output_file, directory=None):
    """
    Saves an object (LIST/DICTIONARY) to a JSON file named 'json_output_file'
    inside 'directory', indented by 2 with keys sorted.
    Returns Nothing

    'directory' defaults to the working directory at the time of the call
    (a default of os.getcwd() in the signature would be frozen at the moment
    the function is defined). The working directory is never changed.
    """
    if directory is None:
        directory = os.getcwd()

    with open(os.path.join(directory, json_output_file), 'w') as f:
        json.dump(obj, f, indent=2, sort_keys=True)
    
# Write the merged records to JSON_OUTPUT_FILE inside the working directory wd.
output_to_json_file(merged_courses, JSON_OUTPUT_FILE, wd)

### Some notes
46: CRS1400029 - Not on training.gov.au (searched on keywords and code)
1325: TAESS00009 - A skill set not a qual (thus no pertinent ASCED)
1326: UEESS00048 - A skill set not a qual (thus no pertinent ASCED)
1327: UEESS00051 - A skill set not a qual (thus no pertinent ASCED)
1328: UEESS00052 - A skill set not a qual (thus no pertinent ASCED)
1329: MEASS00443 - A skill set not a qual (thus no pertinent ASCED)


