# CourseData ETL

## Step 1: Load SQL and create CourseData database

In [1]:
%load_ext sql
import pandas as pd
import sqlite3

In [2]:
%sql sqlite:///CourseData.db
conn = sqlite3.connect('CourseData.db')

## Step 2: Create tables as designed in ERD

In [3]:
%%sql

DROP TABLE IF EXISTS PROFESSORS;
CREATE TABLE PROFESSORS (
    Professor_id INTEGER PRIMARY KEY,
    Name TEXT NOT NULL
);

 * sqlite:///CourseData.db
Done.
Done.


[]

In [4]:
%%sql
DROP TABLE IF EXISTS PROGRAMS;
CREATE TABLE PROGRAMS (
    Program_id INTEGER PRIMARY KEY,
    program_code TEXT NOT NULL,
    program_name TEXT NOT NULL
);

 * sqlite:///CourseData.db
Done.
Done.


[]

In [5]:
%%sql
DROP TABLE IF EXISTS LOCATIONS;
CREATE TABLE LOCATIONS (
    Location_id INTEGER PRIMARY KEY,
    location TEXT NOT NULL
);

 * sqlite:///CourseData.db
Done.
Done.


[]

In [6]:
%%sql
DROP TABLE IF EXISTS COURSES;
CREATE TABLE COURSES (
    Course_id INTEGER PRIMARY KEY, 
    CatalogYear TEXT NOT NULL,
    Catalog_id TEXT NOT NULL,
    Course_title TEXT NOT NULL,
    Credits TEXT NOT NULL,
    Attributes TEXT,
    Prereqs TEXT,
    Coreqs TEXT,
    Description TEXT,
    Fee TEXT,
    Program_id INTEGER NOT NULL,
    FOREIGN KEY (Program_id) REFERENCES PROGRAMS(Program_id)
);

 * sqlite:///CourseData.db
Done.
Done.


[]

In [7]:
%%sql
DROP TABLE IF EXISTS COURSE_OFFERINGS;
CREATE TABLE COURSE_OFFERINGS (
    Offering_id INTEGER PRIMARY KEY,
    CatalogYear TEXT,
    Term TEXT NOT NULL,
    Section TEXT NOT NULL,
    Crn TEXT NOT NULL,
    Title TEXT NOT NULL,
    Credits REAL,
    Cap INTEGER NOT NULL,
    Actual INTEGER NOT NULL,
    Remaining INTEGER NOT NULL,
    Timecodes TEXT,
    Meetings TEXT,
    Course_id INTEGER,
    Professor_id INTEGER,
    FOREIGN KEY (Professor_id) REFERENCES PROFESSORS(Professor_id)
    FOREIGN KEY (Course_id) REFERENCES COURSES(Course_id)
);

 * sqlite:///CourseData.db
Done.
Done.


[]

In [8]:
%%sql
DROP TABLE IF EXISTS MEETINGS;
CREATE TABLE MEETINGS (
    Meetings_id INTEGER PRIMARY KEY,
    Day TEXT,
    Start TEXT NOT NULL,
    End TEXT NOT NULL,
    Location_id INTEGER NOT NULL,
    Offering_id INTEGER,
    FOREIGN KEY (Location_id) REFERENCES LOCATIONS(Location_id),
    FOREIGN KEY (Offering_id) REFERENCES COURSE_OFFERINGS(Offering_id)
);

 * sqlite:///CourseData.db
Done.
Done.


[]

In [9]:
%%sql
DROP TABLE IF EXISTS CATALOG_YEAR;
CREATE TABLE CATALOG_YEAR (
    CatalogYear TEXT,
    Term TEXT
);

 * sqlite:///CourseData.db
Done.
Done.


[]

In [10]:
%%sql
INSERT INTO Catalog_Year (Term, CatalogYear) VALUES 
('Fall2017','2017_2018'),
('Winter2018','2017_2018'),
('Spring2018','2017_2018'),
('Summer2018','2017_2018'),
('Fall2018','2018_2019'),
('Winter2019','2018_2019'),
('Spring2019','2018_2019');

 * sqlite:///CourseData.db
7 rows affected.


[]

In [11]:
%%sql
SELECT * FROM CATALOG_YEAR;

 * sqlite:///CourseData.db
Done.


CatalogYear,Term
2017_2018,Fall2017
2017_2018,Winter2018
2017_2018,Spring2018
2017_2018,Summer2018
2018_2019,Fall2018
2018_2019,Winter2019
2018_2019,Spring2019


## Step 3: Import CSV files

In [12]:
# importing course catalog csvs (taken from professsor's example )
catalog_years = ['2017_2018', '2018_2019']

# loop using pandas
for cat_year in catalog_years:
    path = 'SourceData/Catalogs/CourseCatalog'+cat_year+'.csv' # path to directory
    data = pd.read_csv(path) # read all csvs in path
    data['cat_year'] = cat_year # add new column 'Term' to differentiate between 2017-18, 2018-19
    data.to_sql('IMPORT_CATALOG_COURSES',conn,if_exists='append',index=False) # converting to sql & unique

In [13]:
# meetings df

semesters = ['Fall2014','Fall2015','Fall2016','Fall2017','Fall2018','Spring2015', 'Spring2016','Spring2017','Spring2018','Spring2019','SpringBreak2017',
                   'Summer2015', 'Summer2016','Summer2017','Summer2018','Winter2015','Winter2016', 'Winter2017','Winter2018']

for semester in semesters: 
    path = 'SourceData/'+semester+'/course_meetings.csv'
    data = pd.read_csv(path)
    data.to_sql('IMPORT_COURSE_MEETINGS',conn,if_exists='append',index=False) 

In [14]:
# courses df
semesters = ['Fall2014','Fall2015','Fall2016','Fall2017','Fall2018','Spring2015', 'Spring2016','Spring2017','Spring2018','Spring2019','SpringBreak2017',
                   'Summer2015', 'Summer2016','Summer2017','Summer2018','Winter2015','Winter2016', 'Winter2017','Winter2018']

for semester in semesters:
    path = 'SourceData/'+semester+'/courses.csv'
    data = pd.read_csv(path)
    data['cat_year'] = cat_year # add new column 'cat_year' to differentiate between 2017-18, 2018-19
    data.to_sql('IMPORT_COURSE_OFFERINGS',conn,if_exists='append',index=False) 

In [15]:
%%sql
--check for duplication
SELECT 
    (SELECT Count(*) FROM IMPORT_COURSE_OFFERINGS) as 'Count',
    (SELECT Count(*) FROM (SELECT DISTINCT * FROM IMPORT_COURSE_OFFERINGS)) as 'DistinctCount';

 * sqlite:///CourseData.db
Done.


Count,DistinctCount
15937,15937


In [16]:
%%sql
-- check for duplication in course meetings

SELECT 
    (SELECT Count(*) FROM IMPORT_COURSE_MEETINGS) as 'Count',
    (SELECT Count(*) FROM (SELECT DISTINCT * FROM IMPORT_COURSE_MEETINGS)) as 'DistinctCount';

 * sqlite:///CourseData.db
Done.


Count,DistinctCount
284907,284847


In [17]:
%%sql
--check for duplication in catalog courses

SELECT 
    (SELECT Count(*) FROM IMPORT_CATALOG_COURSES) as 'Count',
    (SELECT Count(*) FROM (SELECT DISTINCT * FROM IMPORT_CATALOG_COURSES)) as 'DistinctCount';

 * sqlite:///CourseData.db
Done.


Count,DistinctCount
4440,4440


## Step 4: Populating the Tables

In [18]:
%%sql
DELETE FROM PROFESSORS;
INSERT INTO PROFESSORS(Name)
    SELECT DISTINCT primary_instructor
    FROM IMPORT_COURSE_OFFERINGS;
SELECT * FROM PROFESSORS LIMIT 5;

 * sqlite:///CourseData.db
0 rows affected.
1104 rows affected.
Done.


Professor_id,Name
1,Michael P. Coyne
2,Rebecca I. Bloch
3,Paul Caster
4,Jo Ann Drusbosky
5,Arleen N. Kardos


In [19]:
%%sql
DELETE FROM PROGRAMS;
INSERT INTO PROGRAMS(program_code, program_name)
    SELECT DISTINCT program_code, program_name
    FROM IMPORT_CATALOG_COURSES
    ORDER BY program_code;
SELECT * FROM PROGRAMS LIMIT 5;

 * sqlite:///CourseData.db
0 rows affected.
83 rows affected.
Done.


Program_id,program_code,program_name
1,AC,Accounting
2,AE,Applied Ethics
3,AH,Art History
4,AN,Asian Studies
5,AR,Arabic


In [20]:
%%sql
DELETE FROM COURSES;
INSERT INTO COURSES(CatalogYear, Catalog_id, Course_title, Credits, Attributes, Prereqs, Coreqs, Description, Fee, Program_id)
    SELECT DISTINCT cat_year, Catalog_id, course_title, credits, attributes, prereqs, coreqs, description, fees, Program_id 
    FROM IMPORT_CATALOG_COURSES
        JOIN PROGRAMS USING (program_code);
SELECT * FROM COURSES LIMIT 2;

 * sqlite:///CourseData.db
0 rows affected.
4440 rows affected.
Done.


Course_id,CatalogYear,Catalog_id,Course_title,Credits,Attributes,Prereqs,Coreqs,Description,Fee,Program_id
1,2017_2018,AN 0301,Independent Study,1-3 Credits,,,,Students undertake an individualized program of study in consultation with a director from the Asian studies faculty.,,4
2,2017_2018,AN 0310,Asian Studies Seminar,3 Credits,,,,"This seminar examines selected topics concerning Asia. This course is taught in conjunction with another 100-300 level course from a rotation of course offerings. Consult the Asian Studies director to identify the conjoined course for a given semester. The seminar concentrates on topics within the parameters of the conjoined course syllabus but adds research emphasis. Students registered for this course must complete a research project, to include 300-level research, in addition to the regular research requirements of the conjoined course, and a 25-50 page term paper in substitution of some portion of the conjoined course requirements, as determined by the instructor. Open to juniors and seniors only.",,4


In [21]:
%%sql
DELETE FROM LOCATIONS;
INSERT INTO LOCATIONS(location)
    SELECT DISTINCT location
    FROM IMPORT_COURSE_MEETINGS
    ORDER BY location;
SELECT * FROM LOCATIONS LIMIT 5;

 * sqlite:///CourseData.db
0 rows affected.
207 rows affected.
Done.


Location_id,location
1,BCC 200
2,BD
3,BH
4,BH BY ARR
5,BLM 112


In [22]:
%%sql
DELETE FROM COURSE_OFFERINGS;
INSERT INTO COURSE_OFFERINGS (CatalogYear, Term, Section, Crn, Title, Credits, Cap, Actual, Remaining, Timecodes, Meetings, Course_id, Professor_id)
    SELECT DISTINCT cat_year, term, section, crn, title, import_course_offerings.credits, cap, act, rem, timecodes, Meetings, Course_id, Professor_id
    FROM IMPORT_COURSE_OFFERINGS
        LEFT JOIN CATALOG_YEAR USING (Term)
        LEFT JOIN PROFESSORS ON (primary_instructor = PROFESSORS.Name)
        LEFT JOIN COURSES ON (IMPORT_COURSE_OFFERINGS.catalog_id = COURSES.Catalog_id AND COURSES.CatalogYear = CATALOG_YEAR.CatalogYear);
        
SELECT DISTINCT * FROM COURSE_OFFERINGS
LIMIT 2;

 * sqlite:///CourseData.db
0 rows affected.
15937 rows affected.
Done.


Offering_id,CatalogYear,Term,Section,Crn,Title,Credits,Cap,Actual,Remaining,Timecodes,Meetings,Course_id,Professor_id
1,2018_2019,Fall2014,C01,70384,Introduction to Financial Accounting,3.0,0,31,-31,['TF 0800am-0915am 09/02-12/08 DSB 105'],"[{'days': 'TF', 'times': '0800am-0915am', 'dates': '09/02-12/08', 'location': 'DSB 105'}]",,1
2,2018_2019,Fall2014,C02,70385,Introduction to Financial Accounting,3.0,0,31,-31,['TF 0930am-1045am 09/02-12/08 DSB 105'],"[{'days': 'TF', 'times': '0930am-1045am', 'dates': '09/02-12/08', 'location': 'DSB 105'}]",,1


In [23]:
%%sql
DELETE FROM MEETINGS;
INSERT INTO MEETINGS(Day, Start, End, Location_id, Offering_id)
    SELECT DISTINCT day, start, end, Location_id, Offering_id
    FROM IMPORT_COURSE_MEETINGS
        JOIN COURSE_OFFERINGS USING (Term, Crn) 
        LEFT JOIN LOCATIONS ON (IMPORT_COURSE_MEETINGS.Location = LOCATIONS.location);
SELECT * FROM MEETINGS LIMIT 5;

 * sqlite:///CourseData.db
0 rows affected.
284847 rows affected.
Done.


Meetings_id,Day,Start,End,Location_id,Offering_id
1,T,2014-09-02T08:00:00,2014-09-02T09:15:00,99,1
2,F,2014-09-05T08:00:00,2014-09-05T09:15:00,99,1
3,T,2014-09-09T08:00:00,2014-09-09T09:15:00,99,1
4,F,2014-09-12T08:00:00,2014-09-12T09:15:00,99,1
5,T,2014-09-16T08:00:00,2014-09-16T09:15:00,99,1


In [24]:
%%sql
SELECT Location_id, COUNT(*) FROM MEETINGS GROUP BY Location_id LIMIT 5; 

 * sqlite:///CourseData.db
Done.


Location_id,COUNT(*)
1,26
2,11
3,381
4,15
5,13


## Step 5: Clear and Regain Storage

In [None]:
%%sql
DELETE FROM IMPORT_CATALOG_COURSES;
DELETE FROM IMPORT_COURSE_OFFERINGS;
DELETE FROM IMPORT_COURSE_MEETINGS;

In [None]:
%%sql
DROP TABLE IMPORT_CATALOG_COURSES;
DROP TABLE IMPORT_COURSE_OFFERINGS;
DROP TABLE IMPORT_COURSE_MEETINGS;

In [None]:
%%sql
vacuum;