# **ELT for `CourseDataWarehouse.db`**
Step-by-step from `CourseData.db` to `CourseDataWarehouse.db`.
- There are of course many possible warehouse designs and build processes. One could, for example, do most of the ETL work in python. 
- What follows is an ELT (Extract, Load, Transform) script that builds a data warehouse from a live data warehouse.

## Preliminaries: Extensions, Imports, and Database Connections

In [1]:
%%bash
# delete old copy of the database file
rm -rf CourseDataWarehouse.db

In [2]:
# Load %%sql magic
%load_ext sql

import pandas as pd
import sqlite3

# Create/Connect to CDW
%sql sqlite:///CourseDataWarehouse.db

## 1. Create Dimension Tables from ERDs

The following tables work for all the ERD designs. 

In [3]:
%%sql

-- Program Dimension
DROP TABLE IF EXISTS PROGRAMS_DIM;
CREATE TABLE PROGRAMS_DIM (
    ProgramID INTEGER PRIMARY KEY,
    ProgCode TEXT NOT NULL,
    ProgName TEXT NOT NULL,
    School TEXT,
    Aliases TEXT DEFAULT ''
);
CREATE INDEX ix_program_name on PROGRAMS_DIM (ProgCode);

-- Location Dimension
DROP TABLE IF EXISTS LOCATIONS_DIM;
CREATE TABLE LOCATIONS_DIM (
    LocationID INTEGER PRIMARY KEY,
    LocationCode TEXT NOT NULL,
    Building TEXT,
    Room TEXT,
    Capacity INTEGER DEFAULT 0,
    SeatsMax INTEGER DEFAULT 0
);

-- Time Segment Utility Dimension
-- Each time segment is 5 minutes long
-- Used to divide up timecodes into segments
DROP TABLE IF EXISTS TIME_SEGMENTS_UTIL;
CREATE TABLE TIME_SEGMENTS_UTIL (
    TimeSegmentID INTEGER PRIMARY KEY,
    StartSegTime TEXT NOT NULL,
    EndSegTime TEXT NOT NULL
);

-- Days of Week Utility Dimension
-- Used to expand DayCode to human readable names
DROP TABLE IF EXISTS DAYS_OF_WEEK_UTIL;
CREATE TABLE DAYS_OF_WEEK_UTIL (
    DayCode TEXT PRIMARY KEY,
    DayShort TEXT NOT NULL,
    DayLong TEXT NOT NULL
);

-- TimeCode Dimension
DROP TABLE IF EXISTS TIMECODES_DIM;
CREATE TABLE TIMECODES_DIM (
    TimeCodeID INTEGER PRIMARY KEY,
    DayCode TEXT NOT NULL,
    DayNum INTEGER NOT NULL,
    StartTime TEXT,
    EndTime TEXT,
    DurationMins INTEGER,
    FOREIGN KEY (DayCode) REFERENCES DAY_OF_WEEK_UTIL (DayCode)
);
CREATE INDEX ix_timecode on TIMECODES_DIM (DayCode,StartTime,EndTime);

-- TIMECODE_SEGMENTS_INTERSECT
DROP TABLE IF EXISTS TIMECODE_SEGMENTS_INTERSECT;
CREATE TABLE TIMECODE_SEGMENTS_INTERSECT (
    TimeCodeID INTEGER NOT NULL,
    TimeSegmentID INTEGER NOT NULL,
    PRIMARY KEY (TimeCodeID,TimeSegmentID),
    FOREIGN KEY (TimeCodeID) REFERENCES TIMECODES_DIM (TimeCodeID),
    FOREIGN KEY (TimeSegmentID) REFERENCES TIME_SEGMENTS_UTIL (TimeSegmentID)
);

-- Term Dimension
DROP TABLE IF EXISTS TERMS_DIM;
CREATE TABLE TERMS_DIM (
    TermID INTEGER PRIMARY KEY,
    TermCode TEXT NOT NULL,
    TermName TEXT NOT NULL,
    TermSeq INTEGER DEFAULT 0,
    CalendarYear INTEGER,
    SchoolYearEnd INTEGER,
    SchoolYearText TEXT NOT NULL
);

-- Instructor Dimension
DROP TABLE IF EXISTS INSTRUCTORS_DIM;
CREATE TABLE INSTRUCTORS_DIM (
    InstructorID INTEGER PRIMARY KEY,
    InstructorName TEXT NOT NULL,
    instructorEmail TEXT
);


-- Course Offering Dimension
DROP TABLE IF EXISTS COURSE_OFFERINGS_DIM;
CREATE TABLE COURSE_OFFERINGS_DIM (
    CourseOfferingID INTEGER PRIMARY KEY,
    CRN INTEGER NOT NULL,
    TermCode TEXT NOT NULL,
    CourseCode TEXT NOT NULL,
    Section TEXT NOT NULL,
    CourseTitle TEXT NOT NULL,
    Credits REAL,
    NumStudents INTEGER NOT NULL,
    CapStudents INTEGER NOT NULL,
    ScheduleSpec TEXT,
    DeliveryStyle TEXT,
    ScheduleType TEXT
);





 * sqlite:///CourseDataWarehouse.db
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.


[]

## 2. Populate Dimension Tables

### Conforming Dimensions from CSV Files

These are slow-moving, fairly small dimensions that are defined in advance.  

Utility tables are used to extend the dimensions with optional details. They are typically only joined in when needed.

In [6]:
# sqlAlchemy database connection
conn = sqlite3.connect('CourseDataWarehouse.db')

# Import Programs dimension data from CSV file
prog_df = pd.read_csv('SourceData/Dimensions/ProgramsDim.csv')
prog_df.to_sql("PROGRAMS_IMPORT",conn,if_exists="replace", index=False)

# Import the TeachingMode dimension from CSV file
modes_df = pd.read_csv('SourceData/Dimensions/TeachingModesDim.csv')
modes_df.to_sql("TEACHING_MODES_IMPORT",conn,if_exists="replace", index=False)

# Import the Location dimension from CSV file
locations_df = pd.read_csv('SourceData/Dimensions/LocationsDim.csv')
locations_df.to_sql("LOCATIONS_IMPORT",conn,if_exists="replace", index=False)

# Import the Time Segment utility dimension from CSV file
locations_df = pd.read_csv('SourceData/Dimensions/TimeSegmentsUtil.csv')
locations_df.to_sql("TIME_SEGMENTS_IMPORT",conn,if_exists="replace", index=False)

# Import the Days of Week utility dimension from CSV file
locations_df = pd.read_csv('SourceData/Dimensions/DaysOfWeekUtil.csv')
locations_df.to_sql("DAYS_OF_WEEK_IMPORT",conn,if_exists="replace", index=False)

# Import the course catalog dimension from CSV file
# catalog_df = pd.read_csv('SourceData/Dimensions/CoursesDim.csv')
# catalog_df.to_sql("CATALOG_IMPORT",conn,if_exists="replace", index=False)

7

In [7]:
%%sql 
-- Populate the PROGRAMS_DIM table from the import
DELETE FROM PROGRAMS_DIM;
INSERT INTO PROGRAMS_DIM (ProgCode, ProgName, School)
  SELECT ProgramCode, ProgramName, School FROM PROGRAMS_IMPORT;
SELECT * FROM PROGRAMS_DIM LIMIT 10;

 * sqlite:///CourseDataWarehouse.db
Done.
177 rows affected.
Done.


ProgramID,ProgCode,ProgName,School,Aliases
1,AC,Accounting,DSB,
2,ACCT,Accounting,DSB,
3,AMST,American Studies,CAS,
4,AS,American Studies,CAS,
5,ASAH,American Studies,CAS,
6,ASEN,American Studies,CAS,
7,ASHI,American Studies,CAS,
8,ASMU,American Studies,CAS,
9,ASRS,American Studies,CAS,
10,ASSO,American Studies,CAS,


In [8]:
%%sql
DELETE FROM LOCATIONS_DIM;
INSERT INTO LOCATIONS_DIM (LocationCode,Room,Building,Capacity,SeatsMax) 
  SELECT LocationCode, Room, Building, Capacity, SeatsMax
  FROM LOCATIONS_IMPORT 
  ORDER BY Building,Room;
SELECT * FROM LOCATIONS_DIM LIMIT 50;

 * sqlite:///CourseDataWarehouse.db
0 rows affected.
247 rows affected.
Done.


LocationID,LocationCode,Building,Room,Capacity,SeatsMax
1,BD,,,0,6
2,BH,,,0,11
3,BH BY ARR,,,0,16
4,BPS,,,0,10
5,BW,,,0,8
6,BY ARR,,,0,35
7,BY ARR X,,,0,30
8,CMC,,,0,6
9,DH,,,25,25
10,FPS,,,0,10


In [9]:
%%sql
DELETE FROM TIME_SEGMENTS_UTIL;
INSERT INTO TIME_SEGMENTS_UTIL (StartSegTime, EndSegTime)
  SELECT substr('0'||StartSegTime,-5), substr('0'||EndSegTime,-5)
  FROM TIME_SEGMENTS_IMPORT;
SELECT * FROM TIME_SEGMENTS_UTIL LIMIT 10;

 * sqlite:///CourseDataWarehouse.db
0 rows affected.
167 rows affected.
Done.


TimeSegmentID,StartSegTime,EndSegTime
1,08:00,08:05
2,08:05,08:10
3,08:10,08:15
4,08:15,08:20
5,08:20,08:25
6,08:25,08:30
7,08:30,08:35
8,08:35,08:40
9,08:40,08:45
10,08:45,08:50


In [10]:
%%sql
DELETE FROM DAYS_OF_WEEK_UTIL;
INSERT INTO DAYS_OF_WEEK_UTIL (DayCode, DayShort, DayLong) 
  SELECT DayCode, DayShort, DayLong 
  FROM DAYS_OF_WEEK_IMPORT;
SELECT * FROM DAYS_OF_WEEK_UTIL;

 * sqlite:///CourseDataWarehouse.db
0 rows affected.
7 rows affected.
Done.


DayCode,DayShort,DayLong
U,Sun,Sunday
M,Mon,Monday
T,Tue,Tuesday
W,Wed,Wednesday
R,Thu,Thursday
F,Fri,Friday
S,Sat,Saturday


In [11]:
%%sql 

-- DROP THE IMPORT TABLES
DROP TABLE PROGRAMS_IMPORT;
DROP TABLE LOCATIONS_IMPORT;
DROP TABLE TIME_SEGMENTS_IMPORT;
DROP TABLE DAYS_OF_WEEK_IMPORT;

 * sqlite:///CourseDataWarehouse.db
Done.
Done.
Done.
Done.


[]

### Conforming Dimensions from Banner data

These are fairly static and follow a general pattern but there are special ad hoc cases in the data. 
> For example, though there are only a couple dozen official timecodes listed in the calss catalog, there are over 1000 distinct timecodes found in the data. 

In [12]:
%sql ATTACH DATABASE 'CourseData.db' AS banner;

 * sqlite:///CourseDataWarehouse.db
Done.


[]

In [13]:
%%sql

-- Populate TIMECODES_DIM based on actual class meetings
DELETE FROM TIMECODES_DIM;
INSERT INTO TIMECODES_DIM (StartTime,EndTime,DayCode,DayNum, DurationMins)
  SELECT  DISTINCT
          time(StartDateTime) as s,
          time(EndDateTime) as e, 
          substr("UMTWRFS",strftime("%w",StartDateTime)+1,1) as 'DayCode',
          strftime("%w",StartDateTime) as 'DayNum',
          (strftime("%s",EndDateTime)-strftime("%s",StartDateTime))/60 as DurationMins 
  FROM banner.COURSE_MEETINGS;
SELECT * FROM TIMECODES_DIM LIMIT 10;



 * sqlite:///CourseDataWarehouse.db
Done.
1386 rows affected.
Done.


TimeCodeID,DayCode,DayNum,StartTime,EndTime,DurationMins
1,T,2,08:00:00,09:15:00,75
2,F,5,08:00:00,09:15:00,75
3,T,2,09:30:00,10:45:00,75
4,F,5,09:30:00,10:45:00,75
5,T,2,12:30:00,13:45:00,75
6,F,5,12:30:00,13:45:00,75
7,R,4,11:00:00,12:15:00,75
8,M,1,11:00:00,12:15:00,75
9,R,4,12:30:00,13:45:00,75
10,M,1,12:30:00,13:45:00,75


In [14]:
%%sql

-- Populate TERMS_DIM based on actual class offerings

DELETE FROM TERMS_DIM;
INSERT INTO TERMS_DIM (TermCode, TermName,TermSeq,CalendarYear,SchoolYearEnd,SchoolYearText)
    SELECT DISTINCT
        Term AS TermCode,
        substr(Term,1,length(Term)-4) AS TermName,
        CASE substr(Term,1,length(Term)-4)
          WHEN "Fall" THEN 1
          WHEN "Winter" THEN 2
          WHEN "Spring" THEN 3
          WHEN "SpringBreak" THEN 4
          WHEN "Summer" THEN 5
          ELSE 0
        END AS TermSeq,
        substr(Term,-4) AS CalendarYear,
        CASE
          WHEN Term LIKE "Fall%" OR Term LIKE "Summer%" THEN substr(Term,-4)+1
          ELSE substr(Term,-4)
        END AS SchoolYearEnd,
        CASE
          WHEN Term LIKE "Fall%" OR Term LIKE "Summer%" THEN substr(Term,-4) || "-" ||quote(substr(Term,-4)+1)
          ELSE quote(substr(Term,-4)-1) || "-" || substr(Term,-4)
        END AS SchoolYearText
    FROM banner.COURSE_OFFERINGS 
    ORDER BY SchoolYearEnd, TermCode;
SELECT * FROM TERMS_DIM;

 * sqlite:///CourseDataWarehouse.db
Done.
36 rows affected.
Done.


TermID,TermCode,TermName,TermSeq,CalendarYear,SchoolYearEnd,SchoolYearText
1,Fall2014,Fall,1,2014,2015,2014-2015
2,Fall2015,Fall,1,2015,2016,2015-2016
3,Summer2015,Summer,5,2015,2016,2015-2016
4,Fall2016,Fall,1,2016,2017,2016-2017
5,Summer2016,Summer,5,2016,2017,2016-2017
6,Fall2017,Fall,1,2017,2018,2017-2018
7,Summer2017,Summer,5,2017,2018,2017-2018
8,Fall2018,Fall,1,2018,2019,2018-2019
9,Summer2018,Summer,5,2018,2019,2018-2019
10,Fall2019,Fall,1,2019,2020,2019-2020


In [15]:
%%sql

-- Populate INSTRUCTORS_DIM from actual course offerings
DELETE FROM INSTRUCTORS_DIM;
INSERT INTO INSTRUCTORS_DIM (InstructorName)
  SELECT DISTINCT Name FROM banner.INSTRUCTORS;
SELECT * FROM INSTRUCTORS LIMIT 20;

 * sqlite:///CourseDataWarehouse.db
Done.
1570 rows affected.
Done.


InstructorID,Name
1,Michael P. Coyne
2,Rebecca I. Bloch
3,Paul Caster
4,Jo Ann Drusbosky
5,Arleen N. Kardos
6,Scott M Brenner
7,Kevin C. Cassidy
8,Bruce Bradford
9,Milo W. Peck
10,Stephen E. Yost


In [16]:
%%sql

-- Populate TIMECODE_SEGMENTS_INTERSECT
DELETE FROM TIMECODE_SEGMENTS_INTERSECT;
INSERT INTO TIMECODE_SEGMENTS_INTERSECT (TimeCodeID,TimeSegmentID)
  SELECT TimeCodeID,TimeSegmentID 
  FROM TIMECODES_DIM AS tc
      JOIN TIME_SEGMENTS_UTIL AS ts ON (time(ts.StartSegTime) >= tc.StartTime AND time(ts.EndSegTime) <= tc.EndTime);

SELECT * FROM TIMECODE_SEGMENTS_INTERSECT JOIN TIME_SEGMENTS_UTIL USING (TimeSegmentID) LIMIT 20;  

 * sqlite:///CourseDataWarehouse.db
Done.
53967 rows affected.
Done.


TimeCodeID,TimeSegmentID,StartSegTime,EndSegTime
894,1,08:00,08:05
891,1,08:00,08:05
384,1,08:00,08:05
244,1,08:00,08:05
214,1,08:00,08:05
749,1,08:00,08:05
1218,1,08:00,08:05
1246,1,08:00,08:05
755,1,08:00,08:05
1256,1,08:00,08:05


In [17]:
%%sql
SELECT * FROM banner.COURSE_OFFERINGS LIMIT 10;

 * sqlite:///CourseDataWarehouse.db
Done.


CourseOfferingID,CourseID,CatalogID,Term,CRN,Section,Credits,Title,Timecodes,PrimaryInstructorID,Instructors,Capacity,Actual,Remaining
1,1,AC 0011,Fall2014,70384,C01,3.0,Introduction to Financial Accounting,['TF 0800am-0915am 09/02-12/08 DSB 105'],1,,0,31,-31
2,1,AC 0011,Fall2014,70385,C02,3.0,Introduction to Financial Accounting,['TF 0930am-1045am 09/02-12/08 DSB 105'],1,,0,31,-31
3,1,AC 0011,Fall2014,70382,C03,3.0,Introduction to Financial Accounting,['TF 1230pm-0145pm 09/02-12/08 DSB 105'],1,,0,31,-31
4,1,AC 0011,Fall2014,70291,C04,3.0,Introduction to Financial Accounting,['MR 1100am-1215pm 09/02-12/08 DSB 111'],2,,0,29,-29
5,1,AC 0011,Fall2014,70350,C05,3.0,Introduction to Financial Accounting,['MR 1230pm-0145pm 09/02-12/08 DSB 111'],2,,0,30,-30
6,1,AC 0011,Fall2014,70381,C06,3.0,Introduction to Financial Accounting,['MR 0330pm-0445pm 09/02-12/08 DSB 110A'],2,,0,31,-31
7,1,AC 0011,Fall2014,70383,G,3.0,Introduction to Financial Accounting,['MR 0930am-1045am 09/02-12/08 DSB 111'],3,,30,31,-1
8,1,AC 0011,Fall2014,70391,H,3.0,Introduction to Financial Accounting,['TF 0930am-1045am 09/02-12/08 DSB 110A'],4,,30,32,-2
9,1,AC 0011,Fall2014,71105,I,3.0,Introduction to Financial Accounting,['TF 1100am-1215pm 09/02-12/08 DSB 110A'],4,,30,33,-3
10,1,AC 0011,Fall2014,71123,J,3.0,Introduction to Financial Accounting,['TF 0200pm-0315pm 09/02-12/08 DSB 105'],4,,30,32,-2


In [18]:
%%sql

-- Populate COURSE_OFFERINGS_DIM from banner data
DELETE FROM COURSE_OFFERINGS_DIM;
INSERT INTO COURSE_OFFERINGS_DIM (CRN,TermCode,CourseCode,Section,CourseTitle,Credits,NumStudents,CapStudents,ScheduleSpec,DeliveryStyle,ScheduleType)
  SELECT DISTINCT CRN,Term, CatalogID, Section, Title,Credits, Actual, Capacity, 
        CASE  
          WHEN Timecodes IS NULL OR Timecodes LIKE '%[]%' THEN NULL 
          ELSE Timecodes
        END as ScheduleSpec,
        CASE 
          WHEN Title LIKE '%IND%STUDY%' THEN 'Ind Study'
          WHEN Title LIKE '%INTERN%' THEN 'Internship'
          WHEN Title LIKE '%SEM%' THEN 'Seminar'
          WHEN Title LIKE '%PRACTICUM%' THEN 'Practicum'
          WHEN Timecodes IS NULL OR Timecodes LIKE '%[]%' THEN 'Online' 
          ELSE 'Classroom'
        END as DeliveryStyle,
        CASE  
          WHEN Timecodes IS NULL OR Timecodes LIKE '%[]%' THEN 'Async' 
          ELSE 'Sync'
        END as ScheduleType

  FROM banner.COURSE_OFFERINGS;
SELECT * FROM COURSE_OFFERINGS_DIM LIMIT 10;

 * sqlite:///CourseDataWarehouse.db
Done.
28971 rows affected.
Done.


CourseOfferingID,CRN,TermCode,CourseCode,Section,CourseTitle,Credits,NumStudents,CapStudents,ScheduleSpec,DeliveryStyle,ScheduleType
1,70384,Fall2014,AC 0011,C01,Introduction to Financial Accounting,3.0,31,0,['TF 0800am-0915am 09/02-12/08 DSB 105'],Classroom,Sync
2,70385,Fall2014,AC 0011,C02,Introduction to Financial Accounting,3.0,31,0,['TF 0930am-1045am 09/02-12/08 DSB 105'],Classroom,Sync
3,70382,Fall2014,AC 0011,C03,Introduction to Financial Accounting,3.0,31,0,['TF 1230pm-0145pm 09/02-12/08 DSB 105'],Classroom,Sync
4,70291,Fall2014,AC 0011,C04,Introduction to Financial Accounting,3.0,29,0,['MR 1100am-1215pm 09/02-12/08 DSB 111'],Classroom,Sync
5,70350,Fall2014,AC 0011,C05,Introduction to Financial Accounting,3.0,30,0,['MR 1230pm-0145pm 09/02-12/08 DSB 111'],Classroom,Sync
6,70381,Fall2014,AC 0011,C06,Introduction to Financial Accounting,3.0,31,0,['MR 0330pm-0445pm 09/02-12/08 DSB 110A'],Classroom,Sync
7,70383,Fall2014,AC 0011,G,Introduction to Financial Accounting,3.0,31,30,['MR 0930am-1045am 09/02-12/08 DSB 111'],Classroom,Sync
8,70391,Fall2014,AC 0011,H,Introduction to Financial Accounting,3.0,32,30,['TF 0930am-1045am 09/02-12/08 DSB 110A'],Classroom,Sync
9,71105,Fall2014,AC 0011,I,Introduction to Financial Accounting,3.0,33,30,['TF 1100am-1215pm 09/02-12/08 DSB 110A'],Classroom,Sync
10,71123,Fall2014,AC 0011,J,Introduction to Financial Accounting,3.0,32,30,['TF 0200pm-0315pm 09/02-12/08 DSB 105'],Classroom,Sync


# 3. Create and Populate Fact Tables

### The `CLASS_MEETING_FACTS` Table

Each row of the `CLASS_MEETING_FACTS` table represents a single class meeting for a specific course offering. If you were to poke your head into a crowded classroom, this schema represents the contextual kinds of questions one could ask about the class. Course offerings without class meetings (i.e., online classes or independent studies) are each represented with a single row without a date, time code, or location. 

> This is the finest granularity possible without either adding new dimensions or splitting the class meeting dimension into smaller time segments. 

**We'll start with the table DDL. To allow maximum flexibility for all the possible situations, only the `CourseOfferingID` is required.**

In [19]:
%%sql

-- The CLASS_MEETING_FACTS table
DROP TABLE IF EXISTS CLASS_MEETING_FACTS;
CREATE TABLE CLASS_MEETING_FACTS (
  ClassMeetingFactID INTEGER PRIMARY KEY,
  CourseOfferingID INTEGER NOT NULL,
  ProgramID INTEGER,
  PrimaryInstructorID INTEGER,
  LocationID INTEGER,
  TimeCodeID INTEGER,
  TermID INTEGER,
  ClassDate TEXT,
  FOREIGN KEY (CourseOfferingID) REFERENCES COURSE_OFFERINGS_DIM (CourseOfferingID),
  FOREIGN KEY (ProgramID) REFERENCES PROGRAMS_DIM (ProgramID),
  FOREIGN KEY (PrimaryInstructorID) REFERENCES INSTRUCTORS_DIM (InstructorID),
  FOREIGN KEY (LocationID) REFERENCES LOCATIONS_DIM (LocationID),
  FOREIGN KEY (TimeCodeID) REFERENCES TIME_CODES_DIM (TimeCodeID),
  FOREIGN KEY (TermID) REFERENCES TERMS_DIM (TermID)
);

 * sqlite:///CourseDataWarehouse.db
Done.
Done.


[]

**Populate with the data ...**

In [20]:
%%sql

DELETE FROM CLASS_MEETING_FACTS;
INSERT INTO CLASS_MEETING_FACTS (CourseOfferingID, ProgramID, PrimaryInstructorID, TimeCodeID, TermID, ClassDate, LocationID) 
  SELECT COURSE_OFFERINGS_DIM.CourseOfferingID, PROGRAMS_DIM.ProgramID,INSTRUCTORS_DIM.InstructorID, TimeCodeID, TermID, date(StartDateTime) AS ClassDate,LOCATIONS_DIM.LocationID as LocationID 
  FROM COURSE_OFFERINGS_DIM
    LEFT JOIN banner.COURSE_OFFERINGS ON (banner.COURSE_OFFERINGS.Term = COURSE_OFFERINGS_DIM.TermCode AND banner.COURSE_OFFERINGS.CRN = COURSE_OFFERINGS_DIM.CRN)
    LEFT JOIN banner.COURSE_MEETINGS ON (banner.COURSE_OFFERINGS.CourseOfferingID = banner.COURSE_MEETINGS.CourseOfferingID)
    LEFT JOIN banner.LOCATIONS USING (LocationID)
    LEFT JOIN banner.INSTRUCTORS ON (banner.INSTRUCTORS.InstructorID = banner.COURSE_OFFERINGS.PrimaryInstructorID)
    LEFT JOIN TIMECODES_DIM ON (strftime("%w",StartDateTime) = DayNum AND time(StartDateTime) = StartTime AND time(EndDateTime) = EndTime)
    LEFT JOIN TERMS_DIM USING (TermCode)
    LEFT JOIN INSTRUCTORS_DIM ON (INSTRUCTORS_DIM.InstructorName = INSTRUCTORS.Name)
    LEFT JOIN LOCATIONS_DIM USING (LocationCode)
    LEFT JOIN PROGRAMS_DIM ON (substr(COURSE_OFFERINGS_DIM.CourseCode,1,instr(COURSE_OFFERINGS_DIM.CourseCode," ")-1)=ProgCode);
 

 * sqlite:///CourseDataWarehouse.db
0 rows affected.
509152 rows affected.


[]

### The `COURSE_SECTION_FACTS` Table

This version is a rollup of the `CLASS_MEETING_FACTS` table. It is less fine grained but easier to work with as a data mart. To account for the missing class meeting data, it includes a few time-related aggregate measures.  

In [21]:
%%sql

-- The COURSE_SECTION_FACTS table
DROP TABLE IF EXISTS COURSE_SECTION_FACTS;
CREATE TABLE COURSE_SECTION_FACTS (
  CourseSectionFactID INTEGER PRIMARY KEY,
  CourseOfferingID INTEGER NOT NULL,
  ProgramID INTEGER,
  PrimaryInstructorID INTEGER,
  TermID INTEGER,
  FirstClass TEXT,
  LastClass TEXT,
  NumMeetings INTEGER,
  MeetingHours REAL,
  FOREIGN KEY (CourseOfferingID) REFERENCES COURSE_OFFERINGS_DIM (CourseOfferingID),
  FOREIGN KEY (ProgramID) REFERENCES PROGRAMS_DIM (ProgramID),
  FOREIGN KEY (PrimaryInstructorID) REFERENCES INSTRUCTORS_DIM (InstructorID),
  FOREIGN KEY (TermID) REFERENCES TERMS_DIM (TermID)
);

 * sqlite:///CourseDataWarehouse.db
Done.
Done.


[]

In [22]:
%%sql
DELETE FROM COURSE_SECTION_FACTS;
INSERT INTO COURSE_SECTION_FACTS (CourseOfferingID,ProgramID,PrimaryInstructorID,TermID,FirstClass,LastClass,NumMeetings,MeetingHours)
  SELECT CourseOfferingID,ProgramID,PrimaryInstructorID,TermID,
        min(ClassDate) AS StartDate, -- min(date(StartDateTime))
        max(ClassDate) AS EndDate,   -- max(date(StartDateTime))
        count(ClassDate) AS NumMeetings,
        coalesce(sum(DurationMins)/60,0.0) AS MeetingHours
  FROM CLASS_MEETING_FACTS
      LEFT JOIN TIMECODES_DIM USING (TimeCodeID)
  GROUP BY CourseOfferingID,ProgramID,PrimaryInstructorID,TermID
  ORDER BY CourseOfferingID;

SELECT * FROM COURSE_SECTION_FACTS LIMIT 10;

 * sqlite:///CourseDataWarehouse.db
0 rows affected.
28971 rows affected.
Done.


CourseSectionFactID,CourseOfferingID,ProgramID,PrimaryInstructorID,TermID,FirstClass,LastClass,NumMeetings,MeetingHours
1,1,1,1086,1,2014-09-02,2014-12-05,26,32.0
2,2,1,1086,1,2014-09-02,2014-12-05,26,32.0
3,3,1,1086,1,2014-09-02,2014-12-05,26,32.0
4,4,1,1263,1,2014-09-04,2014-12-08,26,32.0
5,5,1,1263,1,2014-09-04,2014-12-08,26,32.0
6,6,1,1263,1,2014-09-04,2014-12-08,26,32.0
7,7,1,1203,1,2014-09-04,2014-12-08,26,32.0
8,8,1,647,1,2014-09-02,2014-12-05,26,32.0
9,9,1,647,1,2014-09-02,2014-12-05,26,32.0
10,10,1,647,1,2014-09-02,2014-12-05,26,32.0


## 4. Integrity Checks

### Domain Integrity
SQLite data types are pretty limited, so there is not much to see here. A few specific value errors were corrected on import. A few more were found while populating tables. 

### Entity Integrity

In [23]:
%%sql
-- There should be 28971 Course Offerings, and 503819 Course Meetings
SELECT 
    (SELECT Count(*) FROM COURSE_OFFERINGS) as CourseOfferings,
    (SELECT Count(*) FROM COURSE_MEETINGS) as CourseMeetings;

 * sqlite:///CourseDataWarehouse.db
Done.


CourseOfferings,CourseMeetings
28971,503819


### Relational Integrity

In [24]:
%%sql 
-- A sanity check of the central star schema relationships
SELECT DISTINCT SchoolYearText,TERMS_DIM.TermCode,CRN, Section, COURSE_OFFERINGS_DIM.CourseCode, CourseTitle,LocationCode,CapStudents, NumStudents
FROM CLASS_MEETING_FACTS 
    LEFT JOIN PROGRAMS_DIM USING (ProgramID)
    LEFT JOIN TERMS_DIM USING (TermID)
    LEFT JOIN INSTRUCTORS_DIM ON (PrimaryInstructorID = InstructorID)
    LEFT JOIN LOCATIONS_DIM USING (LocationID)
    LEFT JOIN TIMECODES_DIM USING (TimeCodeID)
    LEFT JOIN COURSE_OFFERINGS_DIM USING (CourseOfferingID)
WHERE InstructorName like '%Huntley'
ORDER BY SchoolYearEnd, TermSeq, CourseTitle;

 * sqlite:///CourseDataWarehouse.db
Done.


SchoolYearText,TermCode,CRN,Section,CourseCode,CourseTitle,LocationCode,CapStudents,NumStudents
2014-2015,Fall2014,73060,A,IS 0135,Fundamentals of Web Design,DSB 107,25,26
2014-2015,Fall2014,70369,E,IS 0100,Intro to Information Systems,DSB 109,25,26
2014-2015,Fall2014,73061,A,IS 0320,Systems Design and Implementation,DSB 106,25,16
2015-2016,Fall2015,76388,A,IS 0135,Fundamentals of Web Design,DSB 107,25,21
2015-2016,Fall2015,75231,E,IS 0100,Intro to Information Systems,DSB 114,29,28
2015-2016,Fall2015,75246,F,IS 0100,Intro to Information Systems,DSB 114,29,28
2015-2016,Fall2015,76389,A,IS 0320,Systems Design and Implementation,DSB 109,25,13
2015-2016,Spring2016,38780,01,IS 0585,Contemporary Topics: Information Systems and Data,DSB 108,20,15
2015-2016,Spring2016,37253,B,OM 0101,Operations Management,DSB 110B,29,28
2015-2016,Spring2016,37254,C,OM 0101,Operations Management,DSB 111,29,29


## 5. Cleanup

In [25]:
%%sql

-- Detach from the banner database
DETACH DATABASE banner;

-- Force SQLite to rebuild the database file to minimize file size
vacuum;

 * sqlite:///CourseDataWarehouse.db
Done.
Done.


[]