## This notebook will hold an overview of how to load and analyse CAO Points data from the CAO website. 

### CAO Points 2021
http://www.cao.ie/index.php?page=points&p=2021

In [1]:
# Import the regex library
import re

import pandas as pd

# Import the web requests library
import requests as rq

# Import the datetime library, will use for timestamping files
import datetime as dt

# Import os library, will use this to generate folder paths for saving files. 
import os



In [2]:
# Set up the url for the CAO points for 2021
cao2021url = 'http://www2.cao.ie/points/l8.php'

In [3]:
#Get the points data from the CAO website.
resp = rq.get(cao2021url)

#Check the response status, if it is 200 all ok. 
resp

<Response [200]>

In [4]:
# Set the encoding type for the data, needed to decode the non standard characters in the data set.
resp.encoding = 'cp1252'

In [5]:
# Use os.cwd to generate the current working directory for saving the output files from the analysis.
cwd = os.getcwd()
print(cwd)

C:\Users\donne\OneDrive\Documents\GMIT Data\Fundamentals of Data Analysis\Assessment\data_analysis_assessment


In [6]:
now = dt.datetime.now()
#path = cwd + '/data'+'/cao2021_' + now.strftime('%Y%m%d_%H%M%S')+'.csv'
path = cwd + '/CAO_Data'+'/cao2021.csv'
print(path)

C:\Users\donne\OneDrive\Documents\GMIT Data\Fundamentals of Data Analysis\Assessment\data_analysis_assessment/CAO_Data/cao2021.csv


In [7]:
# Compile the regex expression that will be used to extract the course and points data.
# Trying a number of different options
#re_course = re.compile('([\w]{5})  (.*)([\d]{3})(\*?) *')
#re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *')
#re_course = re.compile(r'([A-Z]{2}[0-9]{3})')

In [8]:
# Pull and display the CAO points data. 
linecount = 0
with open(path,'w') as f:
    f.write(','.join(['ccode','ctitle']) +'\n')
    for line in resp.iter_lines():
        #if re_course.fullmatch(line.decode('cp1252')):  
        if re.match('[A-Z]{2}[0-9]{3}', line.decode('cp1252')):
            dline = (line.decode('cp1252'))
            dcode = str(dline[:5])
            ccode = str(dline[7:58])
            course = dcode +', '+ccode
            #print(course)
            linecount = linecount+1
            #csv_ver = re_course.sub(r'\1,\2,\3,\4', line.decode('cp1252'))
            #csv_ver = re.split('  +',line.decode('cp1252'))
            #f.write(','.join(csv_ver) + '\n')
            f.write(course + '\n')
            #print(','.join(csv_ver))

In [9]:
linecount

949

In [10]:
#Read in the 2021 points CSV file.
courses2021 = pd.read_csv(path)
courses2021

Unnamed: 0,ccode,ctitle
0,AL801,Software Design for Virtual Reality and Gamin...
1,AL802,Software Design in Artificial Intelligence fo...
2,AL803,Software Design for Mobile Apps and Connected...
3,AL805,Computer Engineering for Network Infrastructu...
4,AL810,Quantity Surveying ...
...,...,...
944,WD211,Creative Computing ...
945,WD212,Recreation and Sport Management ...
946,WD230,Mechanical and Manufacturing Engineering ...
947,WD231,Early Childhood Care and Education ...


### CAO Points 2020

http://www2.cao.ie/points/CAOPointsCharts2020.xlsx

In [11]:
# Set up the url for the 2020 points data.It is already in excel formst.
cao2020url = 'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx'

In [12]:
# Read in the excel file for the 2020 points data.
df2020 = pd.read_excel(cao2020url, skiprows=10, usecols='A:O')

In [13]:
# Set up the datafrome to select 2 columns from the data.
courses2020 = df2020[['COURSE CODE2', 'COURSE TITLE']]
# Give the dataframe standard column names, will use the same ones as the 2021 set.
courses2020.columns = ['ccode', 'ctitle']
courses2020

Unnamed: 0,ccode,ctitle
0,AC120,International Business
1,AC137,Liberal Arts
2,AD101,"First Year Art & Design (Common Entry,portfolio)"
3,AD102,Graphic Design and Moving Image Design (portfo...
4,AD103,Textile & Surface Design and Jewellery & Objec...
...,...,...
1459,WD208,Manufacturing Engineering
1460,WD210,Software Systems Development
1461,WD211,Creative Computing
1462,WD212,Recreation and Sport Management


In [14]:
#df.to_csv("caopoints_2020.csv", index=False)

### CAO Points 2019

In [15]:
# Set up the url for the 2019 points data, note it is a .pdf file. 
cao2019url = 'http://www2.cao.ie/points/lvl8_19.pdf'

#### Steps to extract the pdf points data from the 2019 CAO file. 
There are a number of ways to turn the pdf file into excel. To save time following was done:
- Open the file from the link above in Chrome.
- The built in Adobe Acrobat Chrome extension has a selection of file conversion tools, one of which is pdf to excel conversion. 
- The resulting excel file opens in the browser and can be downloaded to a system folder. 
- The non needed rows are deleted directly in the resulting file. 

In [25]:
# Read in the 2019 points excel file.
df2019 = pd.read_excel('lvl8_2019.xlsx')
df2019

Unnamed: 0,Course Code,Course,EOS,Mid
0,AL801,Software Design with Virtual Reality and Gaming,304,328.0
1,AL802,Software Design with Cloud Computing,301,306.0
2,AL803,Software Design with Mobile Apps and Connected...,309,337.0
3,AL805,Network Management and Cloud Infrastructure,329,442.0
4,AL810,Quantity Surveying,307,349.0
...,...,...,...,...
925,WD200,Arts (options),221,296.0
926,WD210,Software Systems Development,271,329.0
927,WD211,Creative Computing,275,322.0
928,WD212,Recreation and Sport Management,274,311.0


In [26]:
# Select the first 2 columns and standardise the column names.
courses2019 = df2019[['Course Code', 'Course']]
courses2019.columns = ['ccode', 'ctitle']
courses2019

Unnamed: 0,ccode,ctitle
0,AL801,Software Design with Virtual Reality and Gaming
1,AL802,Software Design with Cloud Computing
2,AL803,Software Design with Mobile Apps and Connected...
3,AL805,Network Management and Cloud Infrastructure
4,AL810,Quantity Surveying
...,...,...
925,WD200,Arts (options)
926,WD210,Software Systems Development
927,WD211,Creative Computing
928,WD212,Recreation and Sport Management


In [18]:
# Join the 2021 and 2020 file.
allcourses = pd.concat([courses2021,courses2020])
allcourses

Unnamed: 0,ccode,ctitle
0,AL801,Software Design for Virtual Reality and Gamin...
1,AL802,Software Design in Artificial Intelligence fo...
2,AL803,Software Design for Mobile Apps and Connected...
3,AL805,Computer Engineering for Network Infrastructu...
4,AL810,Quantity Surveying ...
...,...,...
1459,WD208,Manufacturing Engineering
1460,WD210,Software Systems Development
1461,WD211,Creative Computing
1462,WD212,Recreation and Sport Management


In [19]:
# Check which course codes are duplicated.
allcourses[allcourses.duplicated(subset=['ccode'])]

Unnamed: 0,ccode,ctitle
0,AC120,International Business
1,AC137,Liberal Arts
2,AD101,"First Year Art & Design (Common Entry,portfolio)"
3,AD102,Graphic Design and Moving Image Design (portfo...
4,AD103,Textile & Surface Design and Jewellery & Objec...
...,...,...
1455,WD200,Arts (options)
1460,WD210,Software Systems Development
1461,WD211,Creative Computing
1462,WD212,Recreation and Sport Management


In [20]:
# Only show the unique course codes. 
allcourses.drop_duplicates(subset=['ccode'])

Unnamed: 0,ccode,ctitle
0,AL801,Software Design for Virtual Reality and Gamin...
1,AL802,Software Design in Artificial Intelligence fo...
2,AL803,Software Design for Mobile Apps and Connected...
3,AL805,Computer Engineering for Network Infrastructu...
4,AL810,Quantity Surveying ...
...,...,...
1449,WD188,Applied Health Care
1456,WD205,Molecular Biology with Biopharmaceutical Science
1457,WD206,Electronic Engineering
1458,WD207,Mechanical Engineering
