# CAO Points Analysis

http://www.cao.ie/index.php?page=points&p=2021

***

In [1]:
# Regular expressions
import re

# Convenient package for making HTTP requests. Would normally need to install with Python but comes with Anaconda. You

import requests as rq

# Dates and Times
import datetime as dt


In [2]:
# Fetch the CAO points URL. resp is short for response
resp = rq.get('http://www2.cao.ie/points/l8.php')
# Have a quick peek.
resp       # Response 200 means all is okay. 404 means not found

<Response [200]>

<br>

## Save original dataset

In [3]:
# Get current date and time
now = dt.datetime.now()

# Format as a string.
nowstr = now.strftime('%Y%m%d_%H%M%S')

In [4]:
# Create a filepath for original data
path = 'data/cao2021_' +  nowstr + '.html'

In [5]:
# The server uses the wrong encoding, fix it
original_encoding = resp.encoding
# Change it to cp1252
resp.encoding = 'cp1252'

In [6]:
# Save the original html file
with open(path, 'w') as f:
    f.write(resp.text)

In [7]:
resp.text

'<html>\n<BODY>\n<a name="deg"><hr>\n<h3><center> Points Required for Entry to 2021 Level 8 Courses</h3></center><p>\n<A HREF="l76.php">Go to Level 7/6 List of Institutions.</a><p>\n<PRE><b>\n*     Not all on this points score were offered places.\n#     Test / Interview / Portfolio / Audition\nAQA   All qualified applicants\nv     New competition for available places\n \n</PRE>\n<dl>\n<p>\n<dt><a href="#ac8"><b>American College Dublin</b></a>\n<dt><a href="#al8"><b>Athlone Institute of Technology</b></a>\n<dt><a href="#by8"><b>IBAT College Dublin</b></a>\n<dt><a href="#cm8"><b>Marino Institute of Education</b></a>\n<dt><a href="#pc8"><b>Carlow College</b></a>\n<dt><a href="#ct8"><b>CCT College Dublin</b></a>\n<dt><a href="#cr8"><b>Cork Institute of Technology</b></a>\n<dt><a href="#db8"><b>Dublin Business School</b></a>\n<dt><a href="#dc8"><b>Dublin City University</b></a>\n<dt><a href="#dk8"><b>Dundalk Institute of Technology</b></a>\n<dt><a href="#dl8"><b>Dun Laoghaire Institute of 

<br>

# Error on server

***

Technically the server says we should decode as iso-8859

```
Error
```

However, one line uses \x96 which isnt defined in iso-8859-1
Therefore we use the similar decoding standard cp1252 which is similar but  includes \x96


# Use regular expressions to select lines we want

***

In [8]:
# Compile the regular expression for matching lines.
re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *') # the .* is filler, can be basically anything. we are using it for the course name and spaces at end
# the 'r' at front means 'raw'. If you put at front it means 'dont evaluate backslashes'. Treat as a raw string
# (\*?) Asterik means 'zero or more of'. + means 1 or more of. "88*", asterik applies to second 8. this meams "8+" one or more eights
# (\*?) Backslah means dont treat asterik as a quantifer, treat as a backslash. Question mark is the quantifier, it means zero or one of.
 # Space with asterik means any number of spaces


### Loop through the lines of the response

***

In [9]:
#The filepath for the csv file.

# Data almost in correct format for analysis. Unicode, decoding, errors, look at 2021 points for CAO.
# 2020 DATA IS IN A SPREADSHEET SO CAN SAVE STRAIGHT OFF THE BAT. 2019 in PDF format if all else fails save manualy.


path = 'data/cao2021_csv_' +  nowstr + '.csv'

# keep track of how many courses we process
no_lines = 0

# Open the csv file for writing.

with open (path, 'w') as f:
    # Loop through the lines of the response.
    for line in resp.iter_lines(): # took from google stack overflow.
        # Decode line, using the wrong encoding!
        dline = line.decode('cp1252')
        # match only the lines we want - the ones representing courses.
        if re_course.fullmatch(dline):    # getting set up for regular expression. 'pattern is the regular expression itself'
            # Add one to the lines counter
            no_lines = no_lines + 1
            # Uncomment next line to see the original
            #print(line)   # the .* is filler, can be basically anything. we are using it for the course name and spaces at end
            # Pick out the relevant parts of the matched line
            #csv_version = re_course.sub(r'\1,\2,\3,\4', dline) # go in and one reg expres, what we want back isgroups of regular expression (/1 is first part, /2 is second part and so on. Reg exp above has 4 parts.)  
            # Print the CSV-style line
            #print(csv_version)
            # split the line on two or more spaces
            linesplit = re.split('  +', dline) # twos space and plus means two or more space
            #print(','.join(linesplit))
            # rejoin the substrings with commas in between.
            f.write(','.join(linesplit) + '\n')
            
# Print the totalnumber of processed lines.
print(f"Total number of lines is {no_lines}.")

# consider sense checking

Total number of lines is 922.
