# **ETL for CourseData.db**
A step-by-step walkthrough

## Preliminaries: Extensions, Imports, and Database Connections

In [1]:
%load_ext sql
import pandas as pd
import sqlite3

In [2]:
# Point the %sql / %%sql magics at the SQLite database file used by every SQL cell below.
%sql sqlite:///CourseData.db
# Open a second, plain sqlite3 connection to the same file; pandas' to_sql() writes
# the CSV data through this handle in the extract step.
conn = sqlite3.connect('CourseData.db')

## 1. Create Tables from ERD

In [3]:
%%sql

-- Schema for CourseData.db.
-- Each table is dropped before it is created, so this cell can be re-run to
-- rebuild an empty schema (dropping a table also drops its indexes).

-- Courses table: catalog-level course identity.
DROP TABLE IF EXISTS COURSES;
CREATE TABLE COURSES (
    CourseID INTEGER PRIMARY KEY,
    ProgramCode TEXT NOT NULL,
    CatalogCode TEXT NOT NULL
);

-- Instructors table, indexed by Name because the load step joins on it.
DROP TABLE IF EXISTS INSTRUCTORS;
CREATE TABLE INSTRUCTORS (
    InstructorID INTEGER PRIMARY KEY,
    Name TEXT NOT NULL
);
CREATE INDEX ix_instructors_name on INSTRUCTORS(Name);

-- Course Offerings table: one section of a course in one term.
-- The CourseID foreign key points at COURSES; it previously named
-- CATALOG_COURSES, a table that this schema never creates.
DROP TABLE IF EXISTS COURSE_OFFERINGS;
CREATE TABLE COURSE_OFFERINGS (
    CourseOfferingID INTEGER PRIMARY KEY,
    CourseID INTEGER,
    CatalogID TEXT NOT NULL,
    Term TEXT NOT NULL,
    CRN INTEGER NOT NULL,
    Section TEXT NOT NULL,
    Credits REAL,
    Title TEXT NOT NULL,
    Timecodes TEXT,
    PrimaryInstructorID INTEGER,
    Instructors TEXT,
    Capacity INTEGER NOT NULL,
    Actual INTEGER NOT NULL,
    Remaining INTEGER NOT NULL,
    FOREIGN KEY (CourseID) REFERENCES COURSES(CourseID),
    FOREIGN KEY (PrimaryInstructorID) REFERENCES INSTRUCTORS(InstructorID)
);
CREATE INDEX ix_course_offerings_alt_key on COURSE_OFFERINGS(Term,CatalogID,Section);

-- Locations table: lookup of location codes.
DROP TABLE IF EXISTS LOCATIONS;
CREATE TABLE LOCATIONS (
    LocationID INTEGER PRIMARY KEY,
    LocationCode TEXT NOT NULL
);

-- Course Meetings table: individual meeting time slots of an offering at a location.
DROP TABLE IF EXISTS COURSE_MEETINGS;
CREATE TABLE COURSE_MEETINGS (
    CourseMeetingID INTEGER PRIMARY KEY,
    CourseOfferingID INTEGER NOT NULL,
    LocationID INTEGER NOT NULL,
    StartDateTime TEXT NOT NULL,
    EndDateTime TEXT NOT NULL,
    FOREIGN KEY (CourseOfferingID) REFERENCES COURSE_OFFERINGS(CourseOfferingID),
    FOREIGN KEY (LocationID) REFERENCES LOCATIONS(LocationID)
);

 * sqlite:///CourseData.db
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.


[]

## 2. Extract data from CSV files
_The following code uses Python to automate the dirty work that you might do in `sqlite3`._

In [4]:
# Course Offering and Course Meeting Data
# Each term has its own folder under SourceData/ holding courses.csv and
# course_meetings.csv; both are staged into IMPORT_ tables for the SQL steps below.
terms = [   'Fall2014', 'Winter2015', 'Spring2015', 'Summer2015',
            'Fall2015', 'Winter2016', 'Spring2016', 'Summer2016',
            'Fall2016', 'Winter2017', 'Spring2017', 'SpringBreak2017', 'Summer2017',
            'Fall2017', 'Winter2018','Spring2018','Summer2018',
            'Fall2018', 'Winter2019','Spring2019','SummerI2019','SummerII2019','Summer2019',
            'Fall2019', 'Winter2020','Spring2020','Summer2020',
            'Fall2020', 'Winter2021','Spring2021','Summer2021',
            'Fall2021', 'Winter2022','Spring2022','Summer2022',
            'Fall2022'
        ]

# CSV file name -> staging table it is appended to (loaded in this order for each term).
staging_tables = {
    'courses.csv': 'IMPORT_COURSE_OFFERINGS',
    'course_meetings.csv': 'IMPORT_COURSE_MEETINGS',
}

# to_sql() below appends, so start from empty staging tables; otherwise re-running
# this cell would load every CSV row a second time and inflate all later counts.
for table in staging_tables.values():
    conn.execute('DROP TABLE IF EXISTS ' + table)
conn.commit()

for term in terms:
    for filename, table in staging_tables.items():
        filepath = 'SourceData/'+term+'/'+filename
        data = pd.read_csv(filepath)
        data.to_sql(table,conn,if_exists='append',index=False)

In [5]:
%%sql
-- Staged course offerings: total rows loaded vs. rows remaining once exact duplicates are removed.
SELECT total.n AS RawCount, deduped.n AS DistinctCount
FROM (SELECT COUNT(*) AS n FROM IMPORT_COURSE_OFFERINGS) AS total
CROSS JOIN (
    SELECT COUNT(*) AS n
    FROM (SELECT DISTINCT * FROM IMPORT_COURSE_OFFERINGS)
) AS deduped;

 * sqlite:///CourseData.db
Done.


RawCount,DistinctCount
28971,28971


In [6]:
%%sql 
-- Staged course meetings: total rows loaded vs. rows remaining once exact duplicates are removed.
SELECT total.n AS RawCount, deduped.n AS DistinctCount
FROM (SELECT COUNT(*) AS n FROM IMPORT_COURSE_MEETINGS) AS total
CROSS JOIN (
    SELECT COUNT(*) AS n
    FROM (SELECT DISTINCT * FROM IMPORT_COURSE_MEETINGS)
) AS deduped;

 * sqlite:///CourseData.db
Done.


RawCount,DistinctCount
503907,503819


__Note: the counts above show 88 exact duplicate rows in the course_meetings.csv data (503,907 raw rows vs. 503,819 distinct rows). We'll have to take care when loading the COURSE_MEETINGS table.__

In [7]:
%%sql 
-- Show which (term, crn, location, day, start) combinations appear more than once in the staged meetings.
SELECT term, crn, location, day, start
FROM IMPORT_COURSE_MEETINGS
GROUP BY 1, 2, 3, 4, 5
HAVING COUNT(*) > 1;

 * sqlite:///CourseData.db
Done.


term,crn,location,day,start
Fall2014,73073,MCA 102,M,2014-09-08T18:30:00
Fall2014,73073,MCA 102,M,2014-09-15T18:30:00
Fall2014,73073,MCA 102,M,2014-09-22T18:30:00
Fall2014,73073,MCA 102,M,2014-09-29T18:30:00
Fall2014,73073,MCA 102,M,2014-10-06T18:30:00
Fall2014,73073,MCA 102,M,2014-10-20T18:30:00
Fall2014,73073,MCA 102,M,2014-10-27T18:30:00
Fall2014,73073,MCA 102,M,2014-11-03T18:30:00
Fall2014,73073,MCA 102,M,2014-11-10T18:30:00
Fall2014,73073,MCA 102,M,2014-11-17T18:30:00


**After consulting the original `banner.html` files, it appears that the duplicate meeting times are in Banner! Ugh. We'll just filter out the duplicates when populating our tables with data.**

## 3 & 4. Transform and Load Data Into ERD Tables

In [8]:
%%sql
-- Rebuild INSTRUCTORS from the distinct primary-instructor names in the staged offerings.
DELETE FROM INSTRUCTORS;

-- Skip the 'TBA' placeholder and any value containing a '/'.
INSERT INTO INSTRUCTORS (Name)
SELECT DISTINCT src.primary_instructor
FROM import_course_offerings AS src
WHERE src.primary_instructor != 'TBA'
  AND instr(src.primary_instructor, '/') = 0;

 * sqlite:///CourseData.db
0 rows affected.
1570 rows affected.


[]

In [9]:
%%sql
-- Rebuild COURSES: one row per distinct catalog_id found in the staged offerings.
DELETE FROM COURSES;

-- ProgramCode is the part of catalog_id before the first space (e.g. 'IS' from 'IS 0100').
-- substr(..., 1, instr(...)) includes the space itself, so rtrim() removes it;
-- a catalog_id without a space still yields an empty string, as before.
-- Rows are inserted in catalog_code order, which fixes the CourseID assignment.
INSERT INTO COURSES (ProgramCode,CatalogCode)
SELECT DISTINCT
    rtrim(substr(catalog_id, 1, instr(catalog_id, ' '))) AS program_code,
    catalog_id AS catalog_code
FROM IMPORT_COURSE_OFFERINGS
ORDER BY catalog_code;

 * sqlite:///CourseData.db
0 rows affected.
4063 rows affected.


[]

In [10]:
%%sql
-- Rebuild COURSE_OFFERINGS from the staged rows, swapping the instructor name and
-- catalog code for their surrogate keys.
DELETE FROM COURSE_OFFERINGS;

-- LEFT JOINs keep offerings whose instructor or course has no match (the key is then NULL).
INSERT INTO COURSE_OFFERINGS (
    CourseID, Term, CRN, CatalogID, Section, Credits, Title, Timecodes,
    PrimaryInstructorID, Capacity, Actual, Remaining
)
SELECT DISTINCT
    c.CourseID,
    o.term,
    o.crn,
    o.catalog_id,
    o.section,
    o.credits,
    o.title,
    o.timecodes,
    i.InstructorID,
    o.cap,
    o.act,
    o.rem
FROM import_course_offerings AS o
    LEFT JOIN INSTRUCTORS AS i ON (o.primary_instructor = i.Name)
    LEFT JOIN COURSES AS c ON (o.catalog_id = c.CatalogCode);

 * sqlite:///CourseData.db
0 rows affected.
28971 rows affected.


[]

In [11]:
%%sql 
-- Rebuild the LOCATIONS lookup: one row per distinct location value in the staged
-- meetings, inserted in sorted order.
DELETE FROM LOCATIONS;

INSERT INTO LOCATIONS (LocationCode)
SELECT DISTINCT m.Location
FROM import_course_meetings AS m
ORDER BY m.Location;

 * sqlite:///CourseData.db
0 rows affected.
247 rows affected.


[]

In [12]:
%%sql
-- Rebuild COURSE_MEETINGS from the staged meeting rows.
DELETE FROM COURSE_MEETINGS;

-- Each staged meeting is matched to its offering on (Term, CRN) and to its location
-- by code; DISTINCT collapses the duplicate meeting rows present in the source data.
INSERT INTO COURSE_MEETINGS (CourseOfferingID, LocationID, StartDateTime, EndDateTime)
SELECT DISTINCT
    COURSE_OFFERINGS.CourseOfferingID,
    loc.LocationID,
    m."Start",
    m."End"
FROM import_course_meetings AS m
    JOIN COURSE_OFFERINGS USING (Term, CRN)
    LEFT JOIN LOCATIONS AS loc ON (m.Location = loc.LocationCode);

 * sqlite:///CourseData.db
0 rows affected.
503819 rows affected.


[]

In [13]:
%%sql
-- Largest offering Capacity seen at each location, listed by location code.
SELECT LocationCode, max(Capacity)
FROM LOCATIONS
    JOIN COURSE_MEETINGS USING (LocationID)
    JOIN COURSE_OFFERINGS USING (CourseOfferingID)
GROUP BY LocationID
ORDER BY LocationCode;

 * sqlite:///CourseData.db
Done.


LocationCode,max(Capacity)
BCC 200,35
BD,6
BH,11
BH BY ARR,16
BLM 112,15
BLM LL105,30
BNW 124,40
BNW 127,38
BNW 128,25
BNW 129B,30


## 5. Integrity Checks

### Domain Integrity
The SQLite data types are pretty limited, so there is not much to see here. A few specific value errors were corrected on import. 

### Entity Integrity

In [14]:
%%sql
-- Loaded row counts; expected values are 28971 course offerings and 503819 course meetings.
SELECT offerings.n AS CourseOfferings, meetings.n AS CourseMeetings
FROM (SELECT COUNT(*) AS n FROM COURSE_OFFERINGS) AS offerings
CROSS JOIN (SELECT COUNT(*) AS n FROM COURSE_MEETINGS) AS meetings;

 * sqlite:///CourseData.db
Done.


CourseOfferings,CourseMeetings
28971,503819


### Relational Integrity

In [19]:
%%sql 
-- Spot-check COURSE_OFFERINGS --> INSTRUCTORS and COURSE_OFFERINGS --> COURSES
-- by listing the offerings of one instructor (name ending in 'Huntley').
SELECT
    CourseID,
    Term,
    CRN,
    Section,
    COURSE_OFFERINGS.CatalogID AS CatID,
    Title,
    Capacity,
    Actual,
    Remaining,
    substr(Term, -4) AS Year   -- last four characters of Term, e.g. '2014'
FROM COURSE_OFFERINGS
    LEFT JOIN INSTRUCTORS
        ON (INSTRUCTORS.InstructorID = COURSE_OFFERINGS.PrimaryInstructorID)
    LEFT JOIN COURSES USING (CourseID)
WHERE Name LIKE '%Huntley'
ORDER BY Year, Term DESC, CatID, Section;

 * sqlite:///CourseData.db
Done.


CourseID,Term,CRN,Section,CatID,Title,Capacity,Actual,Remaining,Year
1997,Fall2014,70369,E,IS 0100,Intro to Information Systems,25,26,-1,2014
1998,Fall2014,73060,A,IS 0135,Fundamentals of Web Design,25,26,-1,2014
2005,Fall2014,73061,A,IS 0320,Systems Design and Implementation,25,16,9,2014
1997,Fall2015,75231,E,IS 0100,Intro to Information Systems,29,28,1,2015
1997,Fall2015,75246,F,IS 0100,Intro to Information Systems,29,28,1,2015
1998,Fall2015,76388,A,IS 0135,Fundamentals of Web Design,25,21,4,2015
2005,Fall2015,76389,A,IS 0320,Systems Design and Implementation,25,13,12,2015
2017,Spring2016,38780,01,IS 0585,Contemporary Topics: Information Systems and Data,20,15,5,2016
2894,Spring2016,37253,B,OM 0101,Operations Management,29,28,1,2016
2894,Spring2016,37254,C,OM 0101,Operations Management,29,29,0,2016


In [20]:
%%sql
-- Spot-check COURSE_MEETINGS --> COURSE_OFFERINGS: number of meetings attached to one known offering.
SELECT Term, CourseOfferingID, Count(CourseMeetingID)
FROM COURSE_MEETINGS
    JOIN COURSE_OFFERINGS USING (CourseOfferingID)
WHERE Term = 'Spring2019'
  AND CRN = 39006
GROUP BY CourseOfferingID;

 * sqlite:///CourseData.db
Done.


Term,CourseOfferingID,Count(CourseMeetingID)
Spring2019,14710,9


In [21]:
%%sql
-- Check the course meetings for a known course offering (CRN 39006 in Spring2019).
-- The term is written as a single-quoted string literal: SQLite reads a
-- double-quoted token as an identifier first and only falls back to a string.
SELECT CourseOfferingID, CourseMeetingID, StartDateTime
FROM COURSE_OFFERINGS JOIN COURSE_MEETINGS USING (CourseOfferingID)
WHERE CRN=39006 AND Term='Spring2019';

 * sqlite:///CourseData.db
Done.


CourseOfferingID,CourseMeetingID,StartDateTime
14710,262389,2019-01-29T18:30:00
14710,262390,2019-02-05T18:30:00
14710,262391,2019-02-12T18:30:00
14710,262392,2019-02-26T18:30:00
14710,262393,2019-03-05T18:30:00
14710,262394,2019-03-12T18:30:00
14710,262395,2019-02-02T09:00:00
14710,262396,2019-02-16T09:00:00
14710,262397,2019-03-09T09:00:00


## 6. Drop the `IMPORT_` tables to reclaim storage space

In [23]:
%%sql
-- Drop the staging tables now that their rows have been loaded into the ERD tables.
-- IF EXISTS keeps this cell from failing when it is re-run or when the staging
-- tables are already gone.
DROP TABLE IF EXISTS IMPORT_COURSE_OFFERINGS;
DROP TABLE IF EXISTS IMPORT_COURSE_MEETINGS;

 * sqlite:///CourseData.db
Done.
Done.


[]

In [24]:
%%sql
-- Rebuild the database file so the space freed by the dropped tables is returned
-- and the file is as small as possible.
VACUUM;

 * sqlite:///CourseData.db
Done.


[]