# CAO 
***
## Author: Fionn McCarthy

## Introduction and Background
***

Below shows the libraries imported to carry out the relevant analysis

In [1]:
# Regular expressions 
import re

# Working with HTTP 
import requests as rq

# Data frames 
import pandas as pd

# Date and time library
import datetime as dt 

# For downloading from URl
import urllib.request as urlrq

### The link to the CAO points 2021 can be acccessed **[here](http://www.cao.ie/index.php?page=points&p=2021)**.

In [2]:
# Retrieve the CAO points from the URl below
CAO1 = rq.get('http://www2.cao.ie/points/l8.php')

# View the above is okay, 200 means all okay. 
CAO1 

<Response [200]>

### Saving the Original Data Set 
The datetime library is usesd in order to take a back up of the data retrieved from the CAO website.The reason the datetime library is used is to back up the data each time the notebook is run as to not overwrite the data. 

In [3]:
# date and time of right now.
now = dt.datetime.now()

# Formatting now as a string.
nowstring = now.strftime('%Y%m%d_%H%M%S')

The server advises to utilize charset 'iso-8859-1' in order to decode the but from research this charset is unable to decode \x96 as it is not defined so we use 'cp1252' instead.

In [4]:
# Setting the charset to decode with 
CAO1.encoding = 'cp1252'

In [5]:
# Setting the charset to decode with 
CAO1.encoding = 'cp1252'

# Creating a file path for the data.
path2021_html = 'data/cao2021_' + nowstring + '.html'

# Saving the original html file before going any further. 
with open(path2021_html, 'w') as f:
    f.write(CAO1.text)

### Regular expressions selects the lines in the we want to keep.

In [6]:
# The regular expression is compiled below 
re_expr = re.compile(r'([A-Z]{2}[0-9]{3})(.*)')

In [7]:
# This fucntion will split out the points that had either an '#' or '*' associated with them.
def points_to_array(s):
    portfolio = ''
    if s[0] == '#':
        portfolio = '#'
    random = ''
    if s[-1] == '*':
        random = '*'
    points = ''
    for i in s:
        if i.isdigit():
            points = points + i
    # Return an array with points, portfolio, random        
    return [points, portfolio, random]

In [8]:
# Path creation for the csv file.
path2021_csv = 'data/cao2021_csv_' + nowstring + '.csv'

Looping through lines 

In [22]:
# Keep track of how many courses we process.
no_lines = 0

# Open and write to the cvs file
with open(path2021_csv, 'w') as f:
    # Write a header row for data set
    f.write(','.join(['code', 'title', 'points_r1', 'points_r2']) + '\n')
    # Loop through lines of the response
    for line in CAO1.iter_lines():
        # Decode using cp1252
        dline = line.decode('cp1252')
        # Match only the lines representing courses.
        if re_expr.fullmatch(dline):
            # Add one to the lines counter.
            no_lines = no_lines + 1
            # The course code, first five characters in string
            course_code = dline[:5]
            # The course title
            course_title = dline[7:57]
            # Course points split on one or more spaces after character 60
            course_points = re.split('  +', dline[60:]) 
            if len(course_points) != 2:
                course_points = course_points[:2]
            # join the fields using a comma 
            linesplit = [course_code, course_title, course_points[0], course_points[1]]
            # Rejoin the substrings with commas in between.
            f.write(','.join(linesplit) + '\n')
        
print(no_lines)

949


#### The above count of 949 courses was verified by a manual count that was carried out in visual studio code on original html file.

In [23]:
df2021 = pd.read_csv(path2021_csv, encoding='cp1252')

### The link to the CAO points 2020 can be acccessed **[here](http://www.cao.ie/index.php?page=points&p=2020)**.

#### Save Original 2020 Data

In [11]:
# Creating a file path for the excel data.
path2020_excel = 'data/cao2020_' + nowstring + '.xlsx'

In [12]:
# https://stackoverflow.com/questions/19602931/basic-http-file-downloading-and-saving-to-disk-in-python
# Save original file
urlrq.urlretrieve("http://www2.cao.ie/points/CAOPointsCharts2020.xlsx", path2020_excel) 

('data/cao2020_20211201_230138.xlsx',
 <http.client.HTTPMessage at 0x24d8e168040>)

#### Load Spreadhseet using Pandas

In [13]:
# Download and parse the excel spreadsheet
# From examining the excel file I noted that needed to skip the first 10 trows of excel file that is loaded into dataframe
df2020 = pd.read_excel('http://www2.cao.ie/points/CAOPointsCharts2020.xlsx', skiprows = 10)

In [14]:
df2020

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,...,,,,,,,,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,...,,,,,,,,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,...,,,,,,,,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,...,,,,,,,,,,


In [15]:
df2020.iloc[-1]

CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Mechanical and Manufacturing Engineering
COURSE CODE2                                                           WD230
R1 POINTS                                                                253
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      253
EOS Random *                                                             NaN
EOS Mid-point                                                            369
LEVEL                                                                      8
HEI                                        Waterford Institute of Technology
Test/Interview #                                                         NaN

#### Save Pandas Dataframe

In [16]:
# Creating a file path for the pandas data.
path2020_csv = 'data/cao2020_' + nowstring + '.csv'

In [17]:
# Save pandas datafram to folder
df2020.to_csv(path2020_csv)

### The link to the CAO points 2019 can be acccessed **[here](http://www.cao.ie/index.php?page=points&p=2019)**.

#### In order to reproduce the data these are the steps I followed 
1. Download the original PDF file from the CAO website (link above). 
2. Open the original file using Microsoft Word. 
3. When open in Word, save the document as a word document. 
4. Re-save the Word document for editing.
5. Delete headers and page numbers in footer.
6. Select all in document and copy. 
7. Paste into Notepad++.
8. Remove institution name lines and blank lines.
9. In column headings, replace 'COURSE AND INSTITUTION' with 'Course'.
10. Change backticks to apostrophes. 
11. Delete tabs at end of lines: 61, 64, 169, 171, 172, 200, 335, 352, 425, 433, 700, 701, 752, 793, 830, 837, 869 and 894. 
12. Remove tab after 'Mid' first line.
13. Remove double tab on line 37.
14. Delete tabs at end of lines: 28, 107, 178, 201, 266, 370, 373, 391, 415, 437, 464, 494, 518, 535, 557, 571, 604, 625, 670, 679, 689, 785, 795




In [18]:
df2019 = pd.read_csv('data/cao2019_20211130_205230_edited.csv', sep='\t')

In [19]:
df2019

Unnamed: 0,Course Code,Course,EOS,Mid
0,AL801,Software Design with Virtual Reality and Gaming,304,328.0
1,AL802,Software Design with Cloud Computing,301,306.0
2,AL803,Software Design with Mobile Apps and Connected...,309,337.0
3,AL805,Network Management and Cloud Infrastructure,329,442.0
4,AL810,Quantity Surveying,307,349.0
...,...,...,...,...
925,WD200,Arts (options),221,296.0
926,WD210,Software Systems Development,271,329.0
927,WD211,Creative Computing,275,322.0
928,WD212,Recreation and Sport Management,274,311.0


#### Load Spreadhseet using Pandas

In [24]:
df2021

Unnamed: 0,code,title,points_r1,points_r2
0,AL801,Software Design for Virtual Reality and Gaming...,300,
1,AL802,Software Design in Artificial Intelligence for...,313,
2,AL803,Software Design for Mobile Apps and Connected ...,350,
3,AL805,Computer Engineering for Network Infrastructur...,321,
4,AL810,Quantity Surveying ...,328,
...,...,...,...,...
944,WD211,Creative Computing ...,270,
945,WD212,Recreation and Sport Management ...,262,
946,WD230,Mechanical and Manufacturing Engineering ...,230,230
947,WD231,Early Childhood Care and Education ...,266,


In [25]:
df2020

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,...,,,,,,,,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,...,,,,,,,,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,...,,,,,,,,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,...,,,,,,,,,,


In [26]:
df2019

Unnamed: 0,Course Code,Course,EOS,Mid
0,AL801,Software Design with Virtual Reality and Gaming,304,328.0
1,AL802,Software Design with Cloud Computing,301,306.0
2,AL803,Software Design with Mobile Apps and Connected...,309,337.0
3,AL805,Network Management and Cloud Infrastructure,329,442.0
4,AL810,Quantity Surveying,307,349.0
...,...,...,...,...
925,WD200,Arts (options),221,296.0
926,WD210,Software Systems Development,271,329.0
927,WD211,Creative Computing,275,322.0
928,WD212,Recreation and Sport Management,274,311.0


## Concat join

In [27]:
courses2021 = df2021[['code', 'title']]
courses2021

Unnamed: 0,code,title
0,AL801,Software Design for Virtual Reality and Gaming...
1,AL802,Software Design in Artificial Intelligence for...
2,AL803,Software Design for Mobile Apps and Connected ...
3,AL805,Computer Engineering for Network Infrastructur...
4,AL810,Quantity Surveying ...
...,...,...
944,WD211,Creative Computing ...
945,WD212,Recreation and Sport Management ...
946,WD230,Mechanical and Manufacturing Engineering ...
947,WD231,Early Childhood Care and Education ...


In [30]:
courses2020 = df2020[['COURSE CODE2', 'COURSE TITLE']]
courses2020.columns = ['code', 'title']
courses2020

Unnamed: 0,code,title
0,AC120,International Business
1,AC137,Liberal Arts
2,AD101,"First Year Art & Design (Common Entry,portfolio)"
3,AD102,Graphic Design and Moving Image Design (portfo...
4,AD103,Textile & Surface Design and Jewellery & Objec...
...,...,...
1459,WD208,Manufacturing Engineering
1460,WD210,Software Systems Development
1461,WD211,Creative Computing
1462,WD212,Recreation and Sport Management


In [32]:
courses2019 = df2019[['Course Code', 'Course']]
courses2019.columns = ['code', 'title']
courses2019

Unnamed: 0,code,title
0,AL801,Software Design with Virtual Reality and Gaming
1,AL802,Software Design with Cloud Computing
2,AL803,Software Design with Mobile Apps and Connected...
3,AL805,Network Management and Cloud Infrastructure
4,AL810,Quantity Surveying
...,...,...
925,WD200,Arts (options)
926,WD210,Software Systems Development
927,WD211,Creative Computing
928,WD212,Recreation and Sport Management


In [33]:
allcourses = pd.concat([courses2021, courses2020, courses2019], ignore_index=True)
allcourses

Unnamed: 0,code,title
0,AL801,Software Design for Virtual Reality and Gaming...
1,AL802,Software Design in Artificial Intelligence for...
2,AL803,Software Design for Mobile Apps and Connected ...
3,AL805,Computer Engineering for Network Infrastructur...
4,AL810,Quantity Surveying ...
...,...,...
3338,WD200,Arts (options)
3339,WD210,Software Systems Development
3340,WD211,Creative Computing
3341,WD212,Recreation and Sport Management


In [34]:
allcourses.sort_values('code')

Unnamed: 0,code,title
175,AC120,International Business ...
949,AC120,International Business
2581,AC120,International Business
950,AC137,Liberal Arts
2582,AC137,Liberal Arts
...,...,...
2412,WD230,Mechanical and Manufacturing Engineering
946,WD230,Mechanical and Manufacturing Engineering ...
3342,WD230,Mechanical and Manufacturing Engineering
947,WD231,Early Childhood Care and Education ...


In [37]:
# Finds all extra copies of duplicated rows.
allcourses[allcourses.duplicated()]

Unnamed: 0,code,title
952,AD102,Graphic Design and Moving Image Design (portfo...
1145,CR220,Fine Art at CIT Crawford College of Art and De...
1195,CW068,Applied Social Studies in Professional Social ...
1762,LM076,Product Design and Technology (portfolio requi...
2049,TR034,Management Science and Information Systems Stu...
...,...,...
3338,WD200,Arts (options)
3339,WD210,Software Systems Development
3340,WD211,Creative Computing
3341,WD212,Recreation and Sport Management


In [38]:
# Returns a copy of the data frame with duplciates removed.
allcourses.drop_duplicates()

Unnamed: 0,code,title
0,AL801,Software Design for Virtual Reality and Gaming...
1,AL802,Software Design in Artificial Intelligence for...
2,AL803,Software Design for Mobile Apps and Connected ...
3,AL805,Computer Engineering for Network Infrastructur...
4,AL810,Quantity Surveying ...
...,...,...
3282,TL802,"TV, Radio and New Media"
3283,TL803,Music Technology
3286,TL812,Computing with Digital Media
3290,TL842,Construction Management


In [39]:
# Finds all extra copies of duplicated rows.
allcourses[allcourses.duplicated(subset=['code'])]

Unnamed: 0,code,title
949,AC120,International Business
950,AC137,Liberal Arts
951,AD101,"First Year Art & Design (Common Entry,portfolio)"
952,AD102,Graphic Design and Moving Image Design (portfo...
953,AD103,Textile & Surface Design and Jewellery & Objec...
...,...,...
3338,WD200,Arts (options)
3339,WD210,Software Systems Development
3340,WD211,Creative Computing
3341,WD212,Recreation and Sport Management


In [40]:
# Returns a copy of the data frame with duplciates removed - based only on code.
allcourses.drop_duplicates(subset=['code'], inplace=True, ignore_index=True)

In [41]:
allcourses

Unnamed: 0,code,title
0,AL801,Software Design for Virtual Reality and Gaming...
1,AL802,Software Design in Artificial Intelligence for...
2,AL803,Software Design for Mobile Apps and Connected ...
3,AL805,Computer Engineering for Network Infrastructur...
4,AL810,Quantity Surveying ...
...,...,...
1646,SG441,Environmental Science
1647,SG446,Applied Archaeology
1648,TL803,Music Technology
1649,TL812,Computing with Digital Media


---

## References 