# CourseData ETL

## STEP 1: Load SQL and create CourseData database

In [16]:
%load_ext sql
import pandas as pd
import sqlite3

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [17]:
%sql sqlite:///CourseData.db
conn = sqlite3.connect('CourseData.db')

## STEP 2: Create tables as designed in Lucidchart

In [None]:
%%sql

DROP TABLE IF EXISTS PROFESSORS;
CREATE TABLE PROFESSORS (
    Professor_id INTEGER PRIMARY KEY,
    Name TEXT NOT NULL
);

In [None]:
%%sql
DROP TABLE IF EXISTS PROGRAMS;
CREATE TABLE PROGRAMS (
    Program_id INTEGER PRIMARY KEY,
    program_code TEXT(2) NOT NULL,
    program_name TEXT NOT NULL
);

In [None]:
%%sql
DROP TABLE IF EXISTS LOCATIONS;
CREATE TABLE LOCATIONS (
    Location_id INTEGER PRIMARY KEY,
    location TEXT NOT NULL
);

In [None]:
%%sql
DROP TABLE IF EXISTS MEETINGS;
CREATE TABLE MEETINGS (
    Meetings_id INTEGER PRIMARY KEY, 
    Meetings TEXT,
    Day TEXT,
    Start TEXT NOT NULL,
    End TEXT NOT NULL,
    Location_id INTEGER NOT NULL,
    FOREIGN KEY (Location_id) REFERENCES LOCATIONS(Location_id)
);

In [None]:
%%sql
DROP TABLE IF EXISTS COURSES;
CREATE TABLE COURSES (
    Course_id INTEGER PRIMARY KEY, 
    Catalog_id TEXT NOT NULL,
    Term TEXT NOT NULL,
    Title TEXT NOT NULL,
    Credits TEXT NOT NULL,
    Attributes TEXT,
    Prereqs TEXT NOT NULL,
    Coreqs TEXT,
    Description TEXT,
    Fee TEXT,
    Program_id INTEGER NOT NULL,
    FOREIGN KEY (Program_id) REFERENCES PROGRAMS(Program_id)
);

^^^ check with group to determine what type of data "credits" applies to

In [None]:
%%sql
DROP TABLE IF EXISTS COURSE_OFFERINGS;
CREATE TABLE COURSE_OFFERINGS (
    Offering_id INTEGER PRIMARY KEY, 
    Cap INTEGER NOT NULL,
    Section TEXT NOT NULL,
    Crn INTEGER NOT NULL,
    Actual INTEGER NOT NULL,
    Remaining INTEGER NOT NULL,
    Timecodes TEXT,
    Course_id INTEGER NOT NULL,
    Professor_id INTEGER NOT NULL,
    Meetings_id INTEGER NOT NULL,
    FOREIGN KEY (Course_id) REFERENCES COURSES(Course_id),
    FOREIGN KEY (Professor_id) REFERENCES PROFESSORS(Professor_id),
    FOREIGN KEY (Meetings_id) REFERENCES MEETINGS(Meetings_id)
);

## STEP 3: Import CSV files

In [18]:
# importing course catalog csvs (taken from professsor's example )
course_catalog = ['2017_2018', '2018_2019']

# loop using pandas
for cat_year in course_catalog:
    path = 'SourceData/Catalogs/CourseCatalog'+cat_year+'.csv' # path to directory
    data = pd.read_csv(path) # read all csvs in path
    data['Term'] = cat_year # add new column 'Term' to differentiate between 2017-18, 2018-19
    data.to_sql('IMPORT_CATALOG_COURSES',conn,if_exists='append',index=False) # converting to sql & unique

In [29]:
# meetings df

semesters = ['Fall2014','Fall2015','Fall2016','Fall2017','Fall2018','Spring2015', 'Spring2016','Spring2017','Spring2018','Spring2019','SpringBreak2017',
                   'Summer2015', 'Summer2016','Summer2017','Summer2018','Winter2015','Winter2016', 'Winter2017','Winter2018']

for semester in semesters: 
    path = 'SourceData/'+semesters+'/course_meetings.csv'
    data = pd.read_csv(path)
    data.to_sql('IMPORT_COURSE_MEETINGS',conn,if_exists='append',index=False) 

TypeError: must be str, not list

In [28]:
# courses df
semesters = ['Fall2014','Fall2015','Fall2016','Fall2017','Fall2018','Spring2015', 'Spring2016','Spring2017','Spring2018','Spring2019','SpringBreak2017',
                   'Summer2015', 'Summer2016','Summer2017','Summer2018','Winter2015','Winter2016', 'Winter2017','Winter2018']

for semester in semesters:
    path = 'SourceData/'+semesters+'/courses.csv'
    data = pd.read_csv(path)
    data.to_sql('IMPORT_COURSE_OFFERINGS',conn,if_exists='append',index=False) 

TypeError: must be str, not list