In [1]:
import pandas as pd 
import numpy as np
import sqlite3
# Open the SQLite database file in the working directory (sqlite3 creates it if missing).
# This is a plain sqlite3 connection, independent of the one the %sql magic opens below.
conn = sqlite3.connect('CourseData2.db')
# Cursor for running statements from Python through the connection above.
c = conn.cursor()

In [2]:
%load_ext sql

In [3]:
%%sql
sqlite:///CourseData2.db

'Connected: @CourseData2.db'

In [4]:
%%sql
-- Reset step. Removes every table this notebook creates so the cells below
-- can be re-run against a clean CourseData database.
-- ERD tables are dropped child-first (COURSE_MEETING, then SECTION, then the
-- tables SECTION points at) so that no table is dropped while another table
-- still declares a foreign key to it. With foreign key enforcement switched
-- on, dropping a parent that still has referencing rows can fail.
DROP TABLE IF EXISTS COURSE_MEETING;
DROP TABLE IF EXISTS SECTION;
DROP TABLE IF EXISTS CATALOG;
DROP TABLE IF EXISTS INSTRUCTOR;

-- Import (staging) tables declare no foreign keys, so their order is free.
DROP TABLE IF EXISTS import_course;
DROP TABLE IF EXISTS import_course_meeting;
DROP TABLE IF EXISTS import_course_catalog;

 * sqlite:///CourseData2.db
Done.
Done.
Done.
Done.
Done.
Done.
Done.


[]

In [5]:
%%sql
-- Staging table import_course_catalog.
-- Every column has text affinity (TEXT or VARCHAR). The first five columns
-- are required, the remaining ones may be NULL.
-- NOTE(review) presumably filled from the course catalog CSV files - confirm
-- the column order against the cell that loads it.

CREATE TABLE 'import_course_catalog' (
    program_code TEXT NOT NULL,
    program_name VARCHAR NOT NULL,
    catalog_id VARCHAR NOT NULL,
    course_title VARCHAR NOT NULL, 
    credits VARCHAR NOT NULL,
    prereqs VARCHAR, 
    coreqs VARCHAR,
    fees VARCHAR,
    attributes VARCHAR,
    description VARCHAR 
);

 * sqlite:///CourseData2.db
Done.


[]

In [6]:
%%sql
-- Staging table import_course_meeting.
-- Column names and order match the course_meetings.csv preview shown later
-- in this notebook (term, crn, location, day, start, end). All are required.
-- In that preview start and end hold ISO-8601 timestamp strings, which is why
-- they are declared as VARCHAR here.
-- NOTE(review) end is an SQL keyword and may need quoting in later queries.

CREATE TABLE 'import_course_meeting' (
    term VARCHAR NOT NULL,
    crn INTEGER NOT NULL,
    location VARCHAR NOT NULL,
    day TEXT NOT NULL,
    start VARCHAR NOT NULL,
    end VARCHAR NOT NULL 
);

 * sqlite:///CourseData2.db
Done.


[]

In [7]:
%%sql
-- Staging table import_course.
-- Column names and order match the courses.csv previews shown later in this
-- notebook (term through rem). Only term, crn, catalog_id and section are
-- required, everything else may be NULL.
-- cap, act and rem are the enrollment counters. In the previewed rows rem
-- equals cap minus act - TODO confirm this holds for every term.

CREATE TABLE 'import_course' (
    term VARCHAR NOT NULL,
    crn INTEGER NOT NULL,
    catalog_id VARCHAR NOT NULL,
    section VARCHAR NOT NULL,
    credits VARCHAR,
    title VARCHAR, 
    meetings VARCHAR,
    timecodes VARCHAR, 
    primary_instructor TEXT,
    cap INTEGER, 
    act INTEGER, 
    rem INTEGER 
);

 * sqlite:///CourseData2.db
Done.


[]

In [8]:
%%sql
-- INSTRUCTOR. Instructor lookup table. InID is a surrogate key, and because it
-- is declared INTEGER PRIMARY KEY SQLite fills it in when an insert omits it.

CREATE TABLE 'INSTRUCTOR' (
    InID INTEGER NOT NULL PRIMARY KEY,
    Name TEXT NOT NULL
);

-- CATALOG. Course catalog entries keyed by CatID (a natural key, not a surrogate).
-- Title, Description, Credits and PName are required, the rest may be NULL.

CREATE TABLE 'CATALOG'(
    CatID VARCHAR PRIMARY KEY,
    Title VARCHAR NOT NULL,
    Description TEXT NOT NULL,
    Credits TEXT NOT NULL,
    PName TEXT NOT NULL,
    PCode TEXT,
    PREREQS TEXT,
    COREQS TEXT,
    FEES INTEGER,
    Attribute TEXT
);

-- SECTION. Course sections with surrogate primary key SID.
-- CatID references CATALOG and InID references INSTRUCTOR, so both parent
-- tables are created above this one.

CREATE TABLE 'SECTION'(
    SID INTEGER NOT NULL PRIMARY KEY,
    CRN INTEGER NOT NULL,
    Term TEXT NOT NULL,
    Section VARCHAR NOT NULL,
    Cap INTEGER,
    Act INTEGER,
    Rem INTEGER,
    CatID VARCHAR,
    InID INTEGER,
    FOREIGN KEY (CatID) REFERENCES CATALOG(CatID),
    FOREIGN KEY (InID) REFERENCES INSTRUCTOR(InID)
);

-- COURSE_MEETING. Individual meetings with surrogate primary key MID.
-- A meeting belongs to the section identified by the pair (CRN, Term), so the
-- link to SECTION is declared as one composite foreign key rather than two
-- independent single-column keys.
-- NOTE(review) SQLite enforces foreign keys only when PRAGMA foreign_keys is
-- ON, and it then requires a UNIQUE (CRN, Term) constraint on SECTION. Add
-- that constraint once the data is confirmed to hold one row per CRN and Term.

CREATE TABLE 'COURSE_MEETING' (
    MID INTEGER NOT NULL PRIMARY KEY,
    CRN INTEGER NOT NULL,
    Term TEXT NOT NULL,
    Start TEXT NOT NULL,
    Location VARCHAR NOT NULL,
    Day TEXT NOT NULL,
    End TEXT NOT NULL,
    FOREIGN KEY (CRN, Term) REFERENCES SECTION(CRN, Term)
);

 * sqlite:///CourseData2.db
Done.
Done.
Done.
Done.


[]

**Creating DataFrames for the `import_course` table**

In [9]:
## populating import_course table
# Fall-term course listings, one CSV per year, oldest first.
fall = [
    'SourceData/Fall2014/courses.csv',
    'SourceData/Fall2015/courses.csv',
    'SourceData/Fall2016/courses.csv',
    'SourceData/Fall2017/courses.csv',
    'SourceData/Fall2018/courses.csv',
]
# Files are read in list order; dffN is the frame loaded from fall[N].
dff0, dff1, dff2, dff3, dff4 = (pd.read_csv(csv_path) for csv_path in fall)

In [10]:
# Stack the five Fall frames row-wise into one frame.
# ignore_index is left at its default, so each source frame keeps its own
# row labels and labels repeat across terms in df_fall.
df_fall = pd.concat([dff0, dff1, dff2, dff3, dff4])
# Preview the first rows to check the column layout.
df_fall.head()

Unnamed: 0,term,crn,catalog_id,section,credits,title,meetings,timecodes,primary_instructor,cap,act,rem
0,Fall2014,70384,AC 0011,C01,3.0,Introduction to Financial Accounting,"[{'days': 'TF', 'times': '0800am-0915am', 'dat...",['TF 0800am-0915am 09/02-12/08 DSB 105'],Michael P. Coyne,0,31,-31
1,Fall2014,70385,AC 0011,C02,3.0,Introduction to Financial Accounting,"[{'days': 'TF', 'times': '0930am-1045am', 'dat...",['TF 0930am-1045am 09/02-12/08 DSB 105'],Michael P. Coyne,0,31,-31
2,Fall2014,70382,AC 0011,C03,3.0,Introduction to Financial Accounting,"[{'days': 'TF', 'times': '1230pm-0145pm', 'dat...",['TF 1230pm-0145pm 09/02-12/08 DSB 105'],Michael P. Coyne,0,31,-31
3,Fall2014,70291,AC 0011,C04,3.0,Introduction to Financial Accounting,"[{'days': 'MR', 'times': '1100am-1215pm', 'dat...",['MR 1100am-1215pm 09/02-12/08 DSB 111'],Rebecca I. Bloch,0,29,-29
4,Fall2014,70350,AC 0011,C05,3.0,Introduction to Financial Accounting,"[{'days': 'MR', 'times': '1230pm-0145pm', 'dat...",['MR 1230pm-0145pm 09/02-12/08 DSB 111'],Rebecca I. Bloch,0,30,-30


In [11]:
# Summary statistics for the numeric columns of the combined Fall data.
# NOTE(review): the saved output shows act with a minimum of -1 and rem with a
# maximum of 999, which look like placeholder values - confirm before loading.
df_fall.describe()

Unnamed: 0,crn,cap,act,rem
count,7486.0,7486.0,7486.0,7486.0
mean,74523.769169,18.39487,16.310446,2.219076
std,2512.545835,14.41373,9.863494,17.175757
min,70001.0,0.0,-1.0,-75.0
25%,72213.0,12.0,8.0,0.0
50%,74848.5,19.0,16.0,1.0
75%,76698.75,25.0,24.0,5.0
max,79215.0,500.0,90.0,999.0


In [12]:
# Spring-term course listings, oldest first; the final entry is the
# SpringBreak2017 file.
spring = [
    'SourceData/Spring2015/courses.csv',
    'SourceData/Spring2016/courses.csv',
    'SourceData/Spring2017/courses.csv',
    'SourceData/Spring2018/courses.csv',
    'SourceData/Spring2019/courses.csv',
    'SourceData/SpringBreak2017/courses.csv',
]
# Files are read in list order; dfsN is the frame loaded from spring[N].
dfs0, dfs1, dfs2, dfs3, dfs4, dfs5 = (pd.read_csv(csv_path) for csv_path in spring)

In [13]:
# Stack the six Spring frames (including SpringBreak2017) row-wise.
# ignore_index is left at its default, so row labels repeat across terms.
df_spring = pd.concat([dfs0,dfs1,dfs2,dfs3,dfs4,dfs5])
# Preview the first rows to check the column layout.
df_spring.head()

Unnamed: 0,term,crn,catalog_id,section,credits,title,meetings,timecodes,primary_instructor,cap,act,rem
0,Spring2015,32436,AC 0011,A,3.0,Introduction to Financial Accounting,"[{'days': 'MR', 'times': '0200pm-0315pm', 'dat...",['MR 0200pm-0315pm 01/20-04/30 DSB 108'],Dawn W Massey,28,28,0
1,Spring2015,33880,AC 0011,B,3.0,Introduction to Financial Accounting,"[{'days': 'MR', 'times': '0930am-1045am', 'dat...",['MR 0930am-1045am 01/20-04/30 DSB 106'],Dawn W Massey,24,26,-2
2,Spring2015,33012,AC 0012,A,3.0,Introduction to Management Accounting,"[{'days': 'MR', 'times': '1100am-1215pm', 'dat...",['MR 1100am-1215pm 01/20-04/30 DSB 110A'],Rebecca I. Bloch,28,30,-2
3,Spring2015,33013,AC 0012,B,3.0,Introduction to Management Accounting,"[{'days': 'MR', 'times': '1230pm-0145pm', 'dat...",['MR 1230pm-0145pm 01/20-04/30 DSB 104'],Rebecca I. Bloch,28,28,0
4,Spring2015,33014,AC 0012,C,3.0,Introduction to Management Accounting,"[{'days': 'MR', 'times': '0330pm-0445pm', 'dat...",['MR 0330pm-0445pm 01/20-04/30 DSB 104'],Rebecca I. Bloch,28,28,0


In [15]:
# Summary statistics for the numeric columns of the combined Spring data.
# NOTE(review): the saved output shows negative rem values (minimum -47);
# in the previewed rows rem equals cap - act, so these would be sections
# enrolled above their cap - confirm.
df_spring.describe()

Unnamed: 0,crn,cap,act,rem
count,7122.0,7122.0,7122.0,7122.0
mean,35212.001966,18.123982,15.658804,2.465178
std,2618.670948,10.87664,10.342885,6.4971
min,31001.0,0.0,0.0,-47.0
25%,32773.25,11.0,7.0,0.0
50%,34835.0,19.0,16.0,1.0
75%,37693.75,25.0,24.0,4.0
max,41702.0,100.0,90.0,89.0


In [16]:
# Summer-term course listings, one CSV per year, oldest first.
summer = [
    'SourceData/Summer2015/courses.csv',
    'SourceData/Summer2016/courses.csv',
    'SourceData/Summer2017/courses.csv',
    'SourceData/Summer2018/courses.csv',
]
# Files are read in list order; dfsmN is the frame loaded from summer[N].
dfsm0, dfsm1, dfsm2, dfsm3 = (pd.read_csv(csv_path) for csv_path in summer)

In [17]:
# Winter-term course listings, one CSV per year, oldest first.
winter = [
    'SourceData/Winter2015/courses.csv',
    'SourceData/Winter2016/courses.csv',
    'SourceData/Winter2017/courses.csv',
    'SourceData/Winter2018/courses.csv',
]
# Files are read in list order; dfwN is the frame loaded from winter[N].
dfw0, dfw1, dfw2, dfw3 = (pd.read_csv(csv_path) for csv_path in winter)

In [18]:
# Trial load of a single term's meetings file (Fall 2014 only) into df_test.
# The path is kept in a one-element list, mirroring the course file lists above.
meeting_test = ['SourceData/Fall2014/course_meetings.csv']
df_test = pd.read_csv(meeting_test[0])

In [19]:
# Preview the meetings sample; its columns (term, crn, location, day, start, end)
# are the same names declared on the import_course_meeting table.
df_test.head()

Unnamed: 0,term,crn,location,day,start,end
0,Fall2014,70384,DSB 105,T,2014-09-02T08:00:00,2014-09-02T09:15:00
1,Fall2014,70384,DSB 105,F,2014-09-05T08:00:00,2014-09-05T09:15:00
2,Fall2014,70384,DSB 105,T,2014-09-09T08:00:00,2014-09-09T09:15:00
3,Fall2014,70384,DSB 105,F,2014-09-12T08:00:00,2014-09-12T09:15:00
4,Fall2014,70384,DSB 105,T,2014-09-16T08:00:00,2014-09-16T09:15:00
