# CourseData ETL

## STEP 1: Load SQL and create CourseData database

In [1]:
%load_ext sql
import pandas as pd
import sqlite3

In [2]:
%sql sqlite:///CourseData.db
conn = sqlite3.connect('CourseData.db')

## STEP 2: Create tables as designed in Lucidchart

In [3]:
%%sql

DROP TABLE IF EXISTS PROFESSORS;
CREATE TABLE PROFESSORS (
    Professor_id INTEGER PRIMARY KEY,
    Name TEXT NOT NULL
);

 * sqlite:///CourseData.db
Done.
Done.


[]

In [4]:
%%sql
DROP TABLE IF EXISTS PROGRAMS;
CREATE TABLE PROGRAMS (
    Program_id INTEGER PRIMARY KEY,
    program_code TEXT(2) NOT NULL,
    program_name TEXT NOT NULL
);

 * sqlite:///CourseData.db
Done.
Done.


[]

In [5]:
%%sql
DROP TABLE IF EXISTS LOCATIONS;
CREATE TABLE LOCATIONS (
    Location_id INTEGER PRIMARY KEY,
    location TEXT NOT NULL
);

 * sqlite:///CourseData.db
Done.
Done.


[]

In [6]:
%%sql
DROP TABLE IF EXISTS MEETINGS;
CREATE TABLE MEETINGS (
    Meetings_id INTEGER PRIMARY KEY, 
    Meetings TEXT,
    Day TEXT,
    Start TEXT NOT NULL,
    End TEXT NOT NULL,
    Location_id INTEGER NOT NULL,
    FOREIGN KEY (Location_id) REFERENCES LOCATIONS(Location_id)
);

 * sqlite:///CourseData.db
Done.
Done.


[]

In [7]:
%%sql
DROP TABLE IF EXISTS COURSES;
CREATE TABLE COURSES (
    Course_id INTEGER PRIMARY KEY, 
    Catalog_id TEXT NOT NULL,
    Term TEXT NOT NULL,
    Title TEXT NOT NULL,
    Credits TEXT NOT NULL,
    Attributes TEXT,
    Prereqs TEXT,
    Coreqs TEXT,
    Description TEXT,
    Fee TEXT,
    Program_id INTEGER NOT NULL,
    FOREIGN KEY (Program_id) REFERENCES PROGRAMS(Program_id)
);

 * sqlite:///CourseData.db
Done.
Done.


[]

In [8]:
%%sql
DROP TABLE IF EXISTS COURSE_OFFERINGS;
CREATE TABLE COURSE_OFFERINGS (
    Offering_id INTEGER PRIMARY KEY,
    Cap INTEGER NOT NULL,
    Section TEXT NOT NULL,
    Crn INTEGER NOT NULL,
    Actual INTEGER NOT NULL,
    Remaining INTEGER NOT NULL,
    Timecodes TEXT,
    Course_id INTEGER NOT NULL,
    Professor_id INTEGER,
    Meetings_id INTEGER NOT NULL,
    FOREIGN KEY (Course_id) REFERENCES COURSES(Course_id),
    FOREIGN KEY (Professor_id) REFERENCES PROFESSORS(Professor_id),
    FOREIGN KEY (Meetings_id) REFERENCES MEETINGS(Meetings_id)
);

 * sqlite:///CourseData.db
Done.
Done.


[]

## STEP 3: Import CSV files

In [9]:
# importing course catalog csvs (taken from professsor's example )
course_catalog = ['2017_2018', '2018_2019']

# loop using pandas
for cat_year in course_catalog:
    path = 'SourceData/Catalogs/CourseCatalog'+cat_year+'.csv' # path to directory
    data = pd.read_csv(path) # read all csvs in path
    data['Term'] = cat_year # add new column 'Term' to differentiate between 2017-18, 2018-19
    data.to_sql('IMPORT_CATALOG_COURSES',conn,if_exists='append',index=False) # converting to sql & unique

In [10]:
# meetings df

semesters = ['Fall2014','Fall2015','Fall2016','Fall2017','Fall2018','Spring2015', 'Spring2016','Spring2017','Spring2018','Spring2019','SpringBreak2017',
                   'Summer2015', 'Summer2016','Summer2017','Summer2018','Winter2015','Winter2016', 'Winter2017','Winter2018']

for semester in semesters: 
    path = 'SourceData/'+semester+'/course_meetings.csv'
    data = pd.read_csv(path)
    data.to_sql('IMPORT_COURSE_MEETINGS',conn,if_exists='append',index=False) 

In [11]:
# courses df
semesters = ['Fall2014','Fall2015','Fall2016','Fall2017','Fall2018','Spring2015', 'Spring2016','Spring2017','Spring2018','Spring2019','SpringBreak2017',
                   'Summer2015', 'Summer2016','Summer2017','Summer2018','Winter2015','Winter2016', 'Winter2017','Winter2018']

for semester in semesters:
    path = 'SourceData/'+semester+'/courses.csv'
    data = pd.read_csv(path)
    data.to_sql('IMPORT_COURSE_OFFERINGS',conn,if_exists='append',index=False) 

In [12]:
%%sql
--check for duplication
SELECT 
    (SELECT Count(*) FROM IMPORT_COURSE_OFFERINGS) as 'Count',
    (SELECT Count(*) FROM (SELECT DISTINCT * FROM IMPORT_COURSE_OFFERINGS)) as 'DistinctCount';

 * sqlite:///CourseData.db
Done.


Count,DistinctCount
15937,15937


In [13]:
%%sql
-- check for duplication in course meetings

SELECT 
    (SELECT Count(*) FROM IMPORT_COURSE_MEETINGS) as 'Count',
    (SELECT Count(*) FROM (SELECT DISTINCT * FROM IMPORT_COURSE_MEETINGS)) as 'DistinctCount';

 * sqlite:///CourseData.db
Done.


Count,DistinctCount
284907,284847


In [14]:
%%sql
--check for duplication in catalog courses

SELECT 
    (SELECT Count(*) FROM IMPORT_CATALOG_COURSES) as 'Count',
    (SELECT Count(*) FROM (SELECT DISTINCT * FROM IMPORT_CATALOG_COURSES)) as 'DistinctCount';

 * sqlite:///CourseData.db
Done.


Count,DistinctCount
4440,4440


## STEP 4: POPULATING THE TABLES

In [15]:
%%sql
DELETE FROM PROFESSORS;
INSERT INTO PROFESSORS(Name)
    SELECT DISTINCT primary_instructor
    FROM IMPORT_COURSE_OFFERINGS;
SELECT * FROM PROFESSORS LIMIT 5;

 * sqlite:///CourseData.db
0 rows affected.
1104 rows affected.
Done.


Professor_id,Name
1,Michael P. Coyne
2,Rebecca I. Bloch
3,Paul Caster
4,Jo Ann Drusbosky
5,Arleen N. Kardos


In [16]:
%%sql
DELETE FROM PROGRAMS;
INSERT INTO PROGRAMS(program_code, program_name)
    SELECT DISTINCT program_code, program_name
    FROM IMPORT_CATALOG_COURSES;
SELECT * FROM PROGRAMS LIMIT 5;

 * sqlite:///CourseData.db
0 rows affected.
83 rows affected.
Done.


Program_id,program_code,program_name
1,AN,Asian Studies
2,BU,Business
3,BL,Black Studies
4,BEN,Bioengineering
5,AR,Arabic


In [17]:
%%sql
DELETE FROM LOCATIONS;
INSERT INTO LOCATIONS(location)
    SELECT DISTINCT location
    FROM IMPORT_COURSE_MEETINGS
    ORDER BY location;
SELECT * FROM LOCATIONS LIMIT 5;

 * sqlite:///CourseData.db
0 rows affected.
207 rows affected.
Done.


Location_id,location
1,BCC 200
2,BD
3,BH
4,BH BY ARR
5,BLM 112


In [18]:
%%sql
DELETE FROM COURSES;
INSERT INTO COURSES(Catalog_id, Term, Title, Credits, Attributes, Prereqs, Coreqs, Description, Fee, Program_id)
    SELECT DISTINCT Catalog_id, Term, course_title, credits, attributes, prereqs, coreqs, description, fees, Program_id 
    FROM IMPORT_CATALOG_COURSES
        JOIN PROGRAMS USING (Program_code);
SELECT * FROM COURSES LIMIT 2;

 * sqlite:///CourseData.db
0 rows affected.
4440 rows affected.
Done.


Course_id,Catalog_id,Term,Title,Credits,Attributes,Prereqs,Coreqs,Description,Fee,Program_id
1,AN 0301,2017_2018,Independent Study,1-3 Credits,,,,Students undertake an individualized program of study in consultation with a director from the Asian studies faculty.,,1
2,AN 0310,2017_2018,Asian Studies Seminar,3 Credits,,,,"This seminar examines selected topics concerning Asia. This course is taught in conjunction with another 100-300 level course from a rotation of course offerings. Consult the Asian Studies director to identify the conjoined course for a given semester. The seminar concentrates on topics within the parameters of the conjoined course syllabus but adds research emphasis. Students registered for this course must complete a research project, to include 300-level research, in addition to the regular research requirements of the conjoined course, and a 25-50 page term paper in substitution of some portion of the conjoined course requirements, as determined by the instructor. Open to juniors and seniors only.",,1


In [19]:
%%sql
DELETE FROM COURSE_OFFERINGS;
INSERT INTO COURSE_OFFERINGS(Offering_id, Cap, Section, Crn, Actual, Remaining, Timecodes, Course_id, Professor_id, Meetings_id)
    SELECT DISTINCT Offering_id, cap, section, crn, act, rem, timecodes, Course_id, Professor_id, Meetings_id 
    FROM IMPORT_COURSE_OFFERINGS
        JOIN COURSES USING (Catalog_id,Term) # join on cat_year & id
        JOIN PROGRAMS USING (Program_code) 
        JOIN PROFESSORS USING (Name);
SELECT * FROM COURSE_OFFERINGS LIMIT 5;

 * sqlite:///CourseData.db
0 rows affected.
(sqlite3.OperationalError) unrecognized token: "#"
[SQL: INSERT INTO COURSE_OFFERINGS(Offering_id, Cap, Section, Crn, Actual, Remaining, Timecodes, Course_id, Professor_id, Meetings_id)
    SELECT DISTINCT Offering_id, cap, section, crn, act, rem, timecodes, Course_id, Professor_id, Meetings_id 
    FROM IMPORT_COURSE_OFFERINGS
        JOIN COURSES USING (Catalog_id,Term) # join on cat_year & id
        JOIN PROGRAMS USING (Program_code) 
        JOIN PROFESSORS USING (Name);]
(Background on this error at: http://sqlalche.me/e/e3q8)


In [20]:
%%sql
DELETE FROM MEETINGS;
--INSERT INTO MEETINGS(Meetings, Day, Start, End, Location_id)
    SELECT DISTINCT meetings, day, start, end, Location_id
    FROM IMPORT_CATALOG_COURSES, 
        JOIN COURSES USING (term) 
        JOIN LOCATIONS ON (LOCATIONS.location_id = IMPORT_CATALOG_COURSES.location);
--SELECT * FROM MEETINGS LIMIT 5;  

 * sqlite:///CourseData.db
0 rows affected.
(sqlite3.OperationalError) near "JOIN": syntax error
[SQL: --INSERT INTO MEETINGS(Meetings, Day, Start, End, Location_id)
    SELECT DISTINCT meetings, day, start, end, Location_id
    FROM IMPORT_CATALOG_COURSES, 
        JOIN COURSES USING (term) 
        JOIN LOCATIONS ON (LOCATIONS.location_id = IMPORT_CATALOG_COURSES.location);]
(Background on this error at: http://sqlalche.me/e/e3q8)
