## An overview of how to load and analyse CAO points data from the CAO website

### CAO Points 2021
http://www.cao.ie/index.php?page=points&p=2021

In [1]:
# Import the csv library, used to write correctly quoted CSV rows
import csv

# Import the datetime library, will use for timestamping files
import datetime as dt

# Import os library, will use this to generate folder paths for saving files. 
import os

# Import the regex library
import re

# Import pandas, used to load the Excel and CSV points files
import pandas as pd

# Import the web requests library
import requests as rq



In [2]:
# URL of the CAO points page to download for 2021.
# NOTE(review): "l8" presumably stands for Level 8 courses - confirm.
cao2021url = 'http://www2.cao.ie/points/l8.php'

In [3]:
# Download the 2021 points page from the CAO website.
# The timeout stops the cell hanging forever if the server never answers.
resp = rq.get(cao2021url, timeout=30)

# Stop here with a clear error on any 4xx/5xx reply, so that the cells below
# never run against an error page.
resp.raise_for_status()

# Show the response; <Response [200]> means the download succeeded.
resp

<Response [200]>

In [4]:
# Treat the response body as cp1252 (Windows-1252) text.
# NOTE(review): presumably the page contains characters that are not valid in
# the encoding reported by the server - confirm.
resp.encoding = 'cp1252'

In [5]:
# Use os.getcwd() to get the current working directory; it is used below as the
# folder for the output file of the analysis.
cwd = os.getcwd()
print(cwd)

C:\Users\donne\OneDrive\Documents\GMIT Data\Fundamentals of Data Analysis\Assessment\data_analysis_assessment


In [6]:
# Build the output file path: a file name stamped with the current date and
# time, placed in the working directory. os.path.join inserts the correct
# separator for the platform instead of a hard-coded '/'.
now = dt.datetime.now()
path = os.path.join(cwd, 'cao2021_' + now.strftime('%Y%m%d_%H%M%S') + '.csv')
print(path)

C:\Users\donne\OneDrive\Documents\GMIT Data\Fundamentals of Data Analysis\Assessment\data_analysis_assessment/cao2021_20211101_204959.csv


In [7]:
# Earlier regex attempts at extracting the course and points data, kept for reference.
# Neither is used: the next cell matches on the course code and splits on runs of spaces instead.
#re_course = re.compile('([\w]{5})  (.*)([\d]{3})(\*?) *')
#re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *')

In [8]:
# Extract the course rows from the CAO response and save them to the CSV file.
# A line is treated as a course row when it starts with a course code:
# two capital letters followed by three digits.
linecount = 0
# newline='' leaves line endings to the csv writer, as the csv module requires;
# os.linesep keeps the platform's usual line ending in the output file.
with open(path, 'w', newline='') as f:
    writer = csv.writer(f, lineterminator=os.linesep)
    for line in resp.iter_lines():
        # The lines arrive as bytes, so decode each one once up front.
        dline = line.decode('cp1252')
        if re.match('[A-Z]{2}[0-9]{3}', dline):
            linecount = linecount + 1
            # Columns in the page are separated by runs of two or more spaces.
            fields = re.split('  +', dline)
            # csv.writer quotes any field that contains a comma (some course
            # titles do), so such a field no longer splits into extra columns.
            writer.writerow(fields)

### CAO Points 2020

http://www2.cao.ie/points/CAOPointsCharts2020.xlsx

In [9]:
# URL of the 2020 CAO points spreadsheet (Excel format).
cao2020url = 'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx'

In [10]:
# Load the 2020 spreadsheet straight from the URL.
# skiprows=10 skips the first 10 rows of the sheet (presumably notes above the
# header row - confirm against the file); usecols='A:O' keeps columns A to O.
df = pd.read_excel(cao2020url, skiprows=10, usecols='A:O')

In [11]:
# Display the 2020 points data.
df

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,LEVEL,HEI,Test/Interview #,avp,v
0,Business and administration,International Business,AC120,209,,,,209,,280,8,American College,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,8,American College,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,7,Waterford Institute of Technology,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,8,Waterford Institute of Technology,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,8,Waterford Institute of Technology,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,8,Waterford Institute of Technology,,,


In [12]:
# Save the 2020 data as CSV; index=False leaves out the DataFrame row index.
df.to_csv("caopoints_2020.csv", index=False)

### CAO Points 2019

In [13]:
# URL of the 2019 points PDF.
# NOTE(review): not fetched anywhere in the code shown; the 2019 data is read
# from a local CSV file instead.
cao2019url = 'http://www2.cao.ie/points/lvl8_19.pdf'

### Steps to extract the pdf points data from the 2019 CAO file. 

In [16]:
# Load the 2019 points from a local CSV file (path relative to the working directory).
# NOTE(review): the code that creates Level8_2019.csv is not shown here - presumably
# it was converted from the 2019 PDF; confirm and record the steps.
# NOTE(review): in the output below, the "Institution" column holds the institution
# name and the "INSTITUTION" column holds course titles, so that header looks
# mislabelled - confirm against the source file.
df2019 = pd.read_csv('Level8_2019.csv')

In [17]:
# Display the 2019 points data.
df2019

Unnamed: 0,Course Code,Institution,INSTITUTION,EOS,Mid
0,AL801,Athlone Institute of Technology,Software Design with Virtual Reality and Gaming,304,328.0
1,AL802,Athlone Institute of Technology,Software Design with Cloud Computing,301,306.0
2,AL803,Athlone Institute of Technology,Software Design with Mobile Apps and Connected...,309,337.0
3,AL805,Athlone Institute of Technology,Network Management and Cloud Infrastructure,329,442.0
4,AL810,Athlone Institute of Technology,Quantity Surveying,307,349.0
...,...,...,...,...,...
925,WD200,Waterford Institute of Technology,Arts (options),221,296.0
926,WD210,Waterford Institute of Technology,Software Systems Development,271,329.0
927,WD211,Waterford Institute of Technology,Creative Computing,275,322.0
928,WD212,Waterford Institute of Technology,Recreation and Sport Management,274,311.0


In [18]:
# Save the 2019 data as CSV; index=False leaves out the DataFrame row index.
df2019.to_csv("caopoints_2019.csv", index=False)