# CAO Points Analysis


***

In [1]:
# CSV writing with proper quoting.
import csv

# Dates and times.
import datetime as dt

# Regular expressions.
import re

# For downloading.
import urllib.request as urlrq

# Data frames. (dataframes is another way to say spreadsheets)
import pandas as pd

# Convenient HTTP requests.
import requests as rq

<br>

## 2021 Points

http://www.cao.ie/index.php?page=points&p=2021


***

In [2]:
# Fetch the CAO points URL. resp is short for response.
# The timeout stops the notebook hanging forever if the server never answers.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)
# Stop here on a 4xx/5xx reply so an error page is never parsed as course data.
resp.raise_for_status()
# Have a quick peek. Response 200 means all is okay. 404 means not found.
resp

<Response [200]>

<br>

## Save original dataset

In [3]:
# Capture the moment the notebook is run.
now = dt.datetime.now()

# Render it as a YYYYMMDD_HHMMSS stamp for use in file names.
nowstr = format(now, '%Y%m%d_%H%M%S')

In [4]:
# Timestamped file path for the untouched 2021 HTML download.
path = f'data/cao2021_{nowstr}.html'

<br>

# Error on server

***

Technically, the server says we should decode as ISO-8859-1.

```
Error
```

However, one line uses \x96, which isn't defined in ISO-8859-1.
Therefore we use the decoding standard cp1252, which is similar but includes \x96.

In [5]:
# The server reports the wrong encoding. Remember what it claimed, then
# switch the response over to cp1252 so resp.text is decoded with that instead.
original_encoding, resp.encoding = resp.encoding, 'cp1252'

In [6]:
# Save the original html file.
# The encoding is stated explicitly: without it open() falls back to the
# platform default, which differs between machines and may not be able to
# represent every character on the page. Read the file back as UTF-8 too.
with open(path, 'w', encoding='utf-8') as f:
    f.write(resp.text)

# Use regular expressions to select lines we want

***

In [7]:
# Compile the regular expression for matching lines.
re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *') # groups: course code, filler, three digits, optional asterisk
# The 'r' at the front means 'raw': backslashes in the string are not evaluated by Python, they are passed to the regex engine as written.
# ([A-Z]{2}[0-9]{3}) is two capital letters then three digits; it must be followed by exactly two spaces.
# (.*) is filler and can be basically anything. We are using it for the course name and the spacing after it.
# ([0-9]{3}) is exactly three digits.
# (\*?) The backslash means don't treat the asterisk as a quantifier, treat it as a literal '*'. The question mark is the quantifier: it means zero or one of.
# ' *' A space followed by an asterisk means any number of spaces. Here the asterisk is a quantifier: zero or more of ('+' would mean one or more of).
# NOTE(review): a line that does not end in three digits (plus an optional '*' and spaces) cannot match - confirm no course rows are dropped.


### Loop through the lines of the response

***

In [8]:
# Notes on the sources: the 2021 points are almost in the correct format for
# analysis once the encoding problem is handled. The 2020 data is in a
# spreadsheet, so it can be saved straight away. The 2019 data is in PDF
# format; if all else fails, save it manually.

# The filepath for the csv file.
path = 'data/cao2021_csv_' +  nowstr + '.csv'

# Keep track of how many courses we process.
no_lines = 0

# Open the csv file for writing. newline='' leaves line endings to the csv
# module, and the explicit encoding avoids depending on the platform default.
with open(path, 'w', newline='', encoding='utf-8') as f:
    # csv.writer quotes any field that contains a comma, so a comma inside a
    # course title no longer splits the row into extra columns.
    writer = csv.writer(f, lineterminator='\n')
    # Loop through the lines of the response (raw bytes, one line at a time).
    for line in resp.iter_lines():
        # Decode the line as cp1252 rather than the encoding the server reports.
        dline = line.decode('cp1252')
        # Match only the lines we want - the ones representing courses.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines += 1
            # Split the line on two or more spaces.
            linesplit = re.split('  +', dline)
            # Write the substrings as one CSV row.
            writer.writerow(linesplit)

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

# TODO: consider sense checking this count.

Total number of lines is 922.


We need to separate points in this file

<br>

## 2020 Points

http://www.cao.ie/index.php?page=points&p=2020

***


<br>

### Save Original File

***

In [9]:
# Timestamped file path for the untouched 2020 spreadsheet (nowstr is the run's date-time stamp).
path = f'data/cao2020_{nowstr}.xlsx'

In [10]:
# Download the 2020 spreadsheet and store it, unmodified, at path.
urlrq.urlretrieve(
    url='http://www2.cao.ie/points/CAOPointsCharts2020.xlsx',
    filename=path,
)

('data/cao2020_20211031_184506.xlsx',
 <http.client.HTTPMessage at 0x2b8936078b0>)

<br>

#### Load Spreadsheet using Pandas

***

In [11]:
# Parse the spreadsheet that was just saved to disk instead of downloading it
# a second time, so the dataframe always matches the archived copy.
# path must still point at the .xlsx file written by urlretrieve.
# read_excel turns it into a pandas dataframe in memory, so we are dealing
# with a pandas df from here on; the first 10 rows of the sheet are skipped.
df = pd.read_excel(path, skiprows=10)

In [12]:
# Display the parsed spreadsheet (pandas shows a truncated preview of a large frame).
df

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,...,,,,,,,,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,...,,,,,,,,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,...,,,,,,,,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,...,,,,,,,,,,


In [13]:
df.iloc[753] # Spot check of an arbitrary row. In pandas, iloc selects by integer position (i-location); loc selects by label.

CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Road Transport Technology and Management
COURSE CODE2                                                           LC286
R1 POINTS                                                                264
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      264
EOS Random *                                                             NaN
EOS Mid-point                                                            360
LEVEL                                                                      7
HEI                                         Limerick Institute of Technology
Test/Interview #                                                         NaN

In [14]:
# Spot check the final row. Position -1 always means the last row, so this
# keeps working if the spreadsheet gains or loses rows.
df.iloc[-1]

CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Mechanical and Manufacturing Engineering
COURSE CODE2                                                           WD230
R1 POINTS                                                                253
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      253
EOS Random *                                                             NaN
EOS Mid-point                                                            369
LEVEL                                                                      8
HEI                                        Waterford Institute of Technology
Test/Interview #                                                         NaN

In [15]:
# Timestamped file path for the CSV copy of the 2020 dataframe.
path = f'data/cao2020_{nowstr}.csv'

In [16]:
# Save the Pandas dataframe to disk. index=False leaves out the row index,
# which would otherwise be written as an extra unnamed first column.
df.to_csv(path, index=False)