# Importing Necessary Packages

In [1]:
# Needed for SQL Alchemy  
from sqlalchemy import create_engine
engine=create_engine('sqlite:///.CourseData',echo=False)

# All query results are returned as Pandas DataFrames 
import pandas as pd

#Used for importing CSVs
import glob
import os

# Needed for %sql Magic
%load_ext sql

import sqlite3

# Creating the Databases

In [2]:
#Create CourseData and CourseDqtaWarehouse Databases
conn = sqlite3.connect("./CourseData.db")
conn = sqlite3.connect("./CourseDataWarehouse.db")

# Reading in the CSVs
- Combined each CSV into one DataFrame dropping necessary files 

In [3]:
CourseCatalog2017_2018=pd.read_csv("./SourceData/Catalogs/CourseCatalog2017_2018.csv")
CourseCatalog2018_2019=pd.read_csv("./SourceData/Catalogs/CourseCatalog2018_2019.csv")

In [4]:
#Reading and combining all course meetings CSVs

#Get list of folders/files in Source Data folder
folders=glob.glob('./SourceData/*')

#Delete folders/files from list that don't contain course meeting CSV files
del folders[5]
del folders[12]
del folders[16]

#Read CSVs from folders in list and combine into a single dataframe
all_files=[]
for path in folders:
    all_files += glob.glob(os.path.join(path, "course_meetings.csv"))     # advisable to use os.path.join as this makes concatenation OS independent
df_from_each_file = (pd.read_csv(f) for f in all_files)
course_meetings = pd.concat(df_from_each_file, ignore_index=True, sort=True)

course_meetings.term.unique()


array(['Fall2015', 'Fall2014', 'Winter2015', 'Spring2015', 'Summer2015',
       'Fall2016', 'Fall2018', 'Fall2017', 'Spring2019',
       'SpringBreak2017', 'Spring2017', 'Winter2018', 'Winter2016',
       'Spring2016', 'Spring2018', 'Winter2017', 'Summer2017',
       'Summer2018', 'Summer2016'], dtype=object)

In [5]:
#Reading and combining all courses CSVs

#Get list of folders/files in Source Data folder
folders=glob.glob('./SourceData/*')

#Delete folders/files from list that don't contain courses CSV files
del folders[5]
del folders[12]
del folders[16]

#Read CSVs from folders in list and combine into a single dataframe
all_files=[]
for path in folders:
    all_files += glob.glob(os.path.join(path, "courses.csv"))     # advisable to use os.path.join as this makes concatenation OS independent
df_from_each_file = (pd.read_csv(f) for f in all_files)
courses = pd.concat(df_from_each_file, ignore_index=True, sort=True)

#courses.term.unique()



#Split Meetings column in Courses DataFrame into separate columns
courses=pd.merge(courses, courses['meetings'].str.split(',',expand=True), left_index=True, right_index=True)

# Course Catalog to SQL

In [6]:
CourseCatalog2017_2018.to_sql('CourseCatalog2017_2018',con=engine,if_exists='replace')
CourseCatalog2018_2019.to_sql('CourseCatalog2018_2019',con=engine,if_exists='replace')

# Course Meetings to SQL
- Used separate tables because DataFrame is too large to import all at once
- Sliced every 35,000 records in order to import

In [8]:
#import course_meetings DataFrame to SQL as separate tables because DataFrame is too large to import at once.
#Will use a union later in SQL to combine them

course_meetings[:35000].to_sql('course_meetings1',con=engine,if_exists='replace')
course_meetings[35000:70000].to_sql('course_meetings2',con=engine,if_exists='replace')
course_meetings[70000:105000].to_sql('course_meetings3',con=engine,if_exists='replace')
course_meetings[105000:140000].to_sql('course_meetings4',con=engine,if_exists='replace')
course_meetings[140000:175000].to_sql('course_meetings5',con=engine,if_exists='replace')
course_meetings[175000:210000].to_sql('course_meetings6',con=engine,if_exists='replace')
course_meetings[210000:245000].to_sql('course_meetings7',con=engine,if_exists='replace')
course_meetings[245000:280000].to_sql('course_meetings8',con=engine,if_exists='replace')
course_meetings[280000:315000].to_sql('course_meetings9',con=engine,if_exists='replace')
course_meetings[315000:].to_sql('course_meetings10',con=engine,if_exists='replace')

# Courses DataFrame to SQL
- Able to do once because the DataFrame is much smaller

In [9]:
courses.to_sql('courses',con=engine,if_exists='replace')

# Initialize the connection to Course Data Database

In [10]:
# Initialize 
%sql sqlite:///.CourseData

'Connected: @.CourseData'

# Create Combined Course Catalog Table

In [11]:
%%sql

Create table CourseCatalogCombined(
Year text,
program_code text,
catalog_id text,
course_title text,
credits text,
prereqs text,
coreqs text,
fees text,
attributes text,
description text
);

insert into CourseCatalogCombined
select *
from (
select '2017_2018' Year,program_code,catalog_id,course_title,credits,prereqs,coreqs,fees,attributes,description from CourseCatalog2017_2018
union all
select '2018_2019' Year,program_code,catalog_id,course_title,credits,prereqs,coreqs,fees,attributes,description from CourseCatalog2018_2019) a

 * sqlite:///.CourseData
(sqlite3.OperationalError) table CourseCatalogCombined already exists [SQL: 'Create table CourseCatalogCombined(\nYear text,\nprogram_code text,\ncatalog_id text,\ncourse_title text,\ncredits text,\nprereqs text,\ncoreqs text,\nfees text,\nattributes text,\ndescription text\n);'] (Background on this error at: http://sqlalche.me/e/e3q8)


# Create Overall Course Meetings Table
- Specified column datatypes

In [12]:
%%sql

CREATE TABLE all_course_meetings (
term text,
crn int,
location datexttatype,
day text,
start text,
end text
);

 * sqlite:///.CourseData
(sqlite3.OperationalError) table all_course_meetings already exists [SQL: 'CREATE TABLE all_course_meetings (\nterm text,\ncrn int,\nlocation datexttatype,\nday text,\nstart text,\nend text\n);'] (Background on this error at: http://sqlalche.me/e/e3q8)


# Inserted data into table
- Used union to combine previously separated tables into one table

In [72]:
#%%sql

INSERT INTO all_course_meetings
    select term,crn,location,day,start,end from course_meetings1
union all
    select term,crn,location,day,start,end from course_meetings2
union all
    select term,crn,location,day,start,end from course_meetings3
union all
    select term,crn,location,day,start,end from course_meetings4
union all
    select term,crn,location,day,start,end from course_meetings5
union all
    select term,crn,location,day,start,end from course_meetings6
union all
    select term,crn,location,day,start,end from course_meetings7
union all
    select term,crn,location,day,start,end from course_meetings8
union all
    select term,crn,location,day,start,end from course_meetings9
union all
    select term,crn,location,day,start,end from course_meetings10
;

 * sqlite:///.CourseData
(sqlite3.OperationalError) table all_course_meetings has 9 columns but 6 values were supplied [SQL: 'INSERT INTO all_course_meetings\n    select term,crn,location,day,start,end from course_meetings1\nunion all\n    select term,crn,location,day,start,end from course_meetings2\nunion all\n    select term,crn,location,day,start,end from course_meetings3\nunion all\n    select term,crn,location,day,start,end from course_meetings4\nunion all\n    select term,crn,location,day,start,end from course_meetings5\nunion all\n    select term,crn,location,day,start,end from course_meetings6\nunion all\n    select term,crn,location,day,start,end from course_meetings7\nunion all\n    select term,crn,location,day,start,end from course_meetings8\nunion all\n    select term,crn,location,day,start,end from course_meetings9\nunion all\n    select term,crn,location,day,start,end from course_meetings10\n;'] (Background on this error at: http://sqlalche.me/e/e3q8)


# Alter table to break Start and End columns into separate Date and Time columns

In [74]:
%%sql

alter table all_course_meetings add MeetingDate date;
update all_course_meetings set MeetingDate = substr(start,1,10);

alter table all_course_meetings add StartTime time;
update all_course_meetings set StartTime=substr(start,12);
alter table all_course_meetings add EndTime time;
update all_course_meetings set EndTime=substr(end,12);


 * sqlite:///.CourseData
Done.
317321 rows affected.
Done.
317321 rows affected.
Done.
317321 rows affected.


[]

# Create Courses Table

In [19]:
%%sql

CREATE TABLE courses_clean(
act BIGINT,
cap BIGINT,
catalog_id TEXT,
credits TEXT,
crn BIGINT,
meetings TEXT,
primary_instructor TEXT,
rem BIGINT,
section TEXT,
term TEXT,
title TEXT,
days TEXT,
times TEXT,
dates TEXT,
location TEXT
);

 * sqlite:///.CourseData
Done.


[]

# Cleaned Courses Table
- Broke courses with multiple meetings into separate records 
- Will allow for easier queries

In [20]:
%%sql
delete from courses_clean;

insert into courses_clean
SELECT * from
(
    select act ,cap ,catalog_id ,credits ,crn ,meetings ,primary_instructor ,rem ,section ,term ,title ,[0] ,[1] ,[2] ,[3] 
    from courses
    where [4] is null 
union all
    select act ,cap ,catalog_id ,credits ,crn ,meetings ,primary_instructor ,rem ,section ,term ,title ,[4] ,[5] ,[6] ,[7]
    from courses
    where [4] is not null and [8] is null
union all
    select act ,cap ,catalog_id ,credits ,crn ,meetings ,primary_instructor ,rem ,section ,term ,title ,[8] ,[9] ,[10] ,[11]
    from courses
    where [8] is not null and [12] is null
union all
    select act ,cap ,catalog_id ,credits ,crn ,meetings ,primary_instructor ,rem ,section ,term ,title ,[12] ,[13] ,[14] ,[15]
    from courses
    where [12] is not null and [16] is null
union all
    select act ,cap ,catalog_id ,credits ,crn ,meetings ,primary_instructor ,rem ,section ,term ,title ,[16] ,[17] ,[18] ,[19]
    from courses
    where [16] is not null and [20] is null 
union all
    select act ,cap ,catalog_id ,credits ,crn ,meetings ,primary_instructor ,rem ,section ,term ,title ,[20] ,[21] ,[22] ,[23]
    from courses
    where [20] is not null and [24] is null 
union all
    select act ,cap ,catalog_id ,credits ,crn ,meetings ,primary_instructor ,rem ,section ,term ,title ,[24] ,[25] ,[26] ,[27]
    from courses
    where [24] is not null and [28] is null 
union all
    select act ,cap ,catalog_id ,credits ,crn ,meetings ,primary_instructor ,rem ,section ,term ,title ,[28] ,[29] ,[30] ,[31]
    from courses
    where [28] is not null and [32] is null 
union all
    select act ,cap ,catalog_id ,credits ,crn ,meetings ,primary_instructor ,rem ,section ,term ,title ,[32] ,[33] ,[34] ,[35]
    from courses
    where [32] is not null and [36] is null 
union all
    select act ,cap ,catalog_id ,credits ,crn ,meetings ,primary_instructor ,rem ,section ,term ,title ,[36] ,[37] ,[38] ,[39]
    from courses
    where [36] is not null and [40] is null 
union all
    select act ,cap ,catalog_id ,credits ,crn ,meetings ,primary_instructor ,rem ,section ,term ,title ,[40] ,[41] ,[42] ,[43]
    from courses
    where [40] is not null 
    ) 

 * sqlite:///.CourseData
0 rows affected.
15937 rows affected.


[]

# Created New DataFrame
- Wanted to clean up data with Python because SQL does not allow for easy character replacement

In [22]:
courses_clean=pd.read_sql_table('courses_clean',con=engine)


for i in ['[','{',':','days',"'",' ']:
    courses_clean['days']=courses_clean['days'].str.replace(i,'')

for i in ['[','{',':','times',"'",' ']:
    courses_clean['times']=courses_clean['times'].str.replace(i,'')

for i in ['[','{',':','dates',"'",' ']:
    courses_clean['dates']=courses_clean['dates'].str.replace(i,'')

for i in ['[','{',':','location',"'",' ','}',']']:
    courses_clean['location']=courses_clean['location'].str.replace(i,'')
    
courses_clean = courses_clean.drop('meetings', 1)

courses_clean.head(5)

courses_clean.to_sql('courses_final',con=engine,if_exists='replace')

Checking if Timecodes column in courses table has any data that isn't included in the columns that we broke out from the original meetings column:

In [33]:
%%sql


select * from CourseCatalogCombined limit 2;




 * sqlite:///.CourseData
Done.


Year,program_code,catalog_id,course_title,credits,prereqs,coreqs,fees,attributes,description
2017_2018,AN,AN 0301,Independent Study,1-3 Credits,,,,,Students undertake an individualized program of study in consultation with a director from the Asian studies faculty.
2017_2018,AN,AN 0310,Asian Studies Seminar,3 Credits,,,,,"This seminar examines selected topics concerning Asia. This course is taught in conjunction with another 100-300 level course from a rotation of course offerings. Consult the Asian Studies director to identify the conjoined course for a given semester. The seminar concentrates on topics within the parameters of the conjoined course syllabus but adds research emphasis. Students registered for this course must complete a research project, to include 300-level research, in addition to the regular research requirements of the conjoined course, and a 25-50 page term paper in substitution of some portion of the conjoined course requirements, as determined by the instructor. Open to juniors and seniors only."


In [44]:
%%sql

select count(*)
from (
select distinct * from CourseCatalogCombined
) a

 * sqlite:///.CourseData
Done.


count(*)
4440


In [128]:
%%sql

select distinct act,cap,crn,rem,section
from courses_final limit 10;

select * from courses_final where section='F'

 * sqlite:///.CourseData
Done.


index,act,cap,catalog_id,credits,crn,primary_instructor,rem,section,term,title,days,times,dates,location
5,32,29,AC 0011,3.0,75242,Paul Caster,-3,F,Fall2015,Introduction to Financial Accounting,MR,0930am-1045am,09/01-12/07,DSB111
65,25,25,AE 0291,3.0,76827,Norma Schmidt,0,F,Fall2015,Business Ethics,W,1100am-0130pm,09/01-12/07,DSB110B
108,22,20,BI 0107L,0.0,76060,Christine Earls,-2,F,Fall2015,Human Anatomy & Physiology Lab,R,0100pm-0350pm,09/01-12/07,BNW240
131,11,12,BI 0170P,0.0,75681,TBA,1,F,Fall2015,General Biology I PLG,W,0400pm-0450pm,09/01-12/07,BNW319
179,28,25,BU 0211,3.0,77040,Carlos E. Pena,-3,F,Fall2015,Legal Environment of Business,MW,0500pm-0615pm,09/01-12/07,DSB108
206,12,16,CH 0111L,1.0,75583,Kenneth J. Borowski,4,F,Fall2015,General Chemistry I Lab,W,0500pm-0750pm,09/01-12/07,BNW361
407,18,19,EN 0011,3.0,75015,Rebecca Louise Dimyan,1,F,Fall2015,Texts and Contexts I: Writing as Craft and Inquiry,MR,0800am-0915am,09/01-12/07,CNS10
549,29,30,FI 0101,3.0,75298,Laurie G. Richardson,1,F,Fall2015,Introduction to Finance,TF,0930am-1045am,09/01-12/07,DSB104
589,22,27,FE 0001,0.0,75083,Kamala Kiem,5,F,Fall2015,First Year Experience,T,1100am-1215pm,09/01-12/07,MCA206
645,23,25,HI 0010,3.0,75497,Kristen N Keegan,2,F,Fall2015,Origins of the Modern World Since 1500,MR,1230pm-0145pm,09/01-12/07,CNS15


In [125]:
%%sql

select count(*)
from (
select distinct act,cap,crn,rem,section from courses_final
) a

 * sqlite:///.CourseData
Done.


count(*)
15936


In [75]:
%%sql

select * from all_course_meetings limit 10;

 * sqlite:///.CourseData
Done.


term,crn,location,day,start,end,MeetingDate,StartTime,EndTime
Fall2015,75222,DSB 112,R,2015-09-03T14:00:00,2015-09-03T15:15:00,2015-09-03,14:00:00,15:15:00
Fall2015,75222,DSB 112,M,2015-09-07T14:00:00,2015-09-07T15:15:00,2015-09-07,14:00:00,15:15:00
Fall2015,75222,DSB 112,R,2015-09-10T14:00:00,2015-09-10T15:15:00,2015-09-10,14:00:00,15:15:00
Fall2015,75222,DSB 112,M,2015-09-14T14:00:00,2015-09-14T15:15:00,2015-09-14,14:00:00,15:15:00
Fall2015,75222,DSB 112,R,2015-09-17T14:00:00,2015-09-17T15:15:00,2015-09-17,14:00:00,15:15:00
Fall2015,75222,DSB 112,M,2015-09-21T14:00:00,2015-09-21T15:15:00,2015-09-21,14:00:00,15:15:00
Fall2015,75222,DSB 112,R,2015-09-24T14:00:00,2015-09-24T15:15:00,2015-09-24,14:00:00,15:15:00
Fall2015,75222,DSB 112,M,2015-09-28T14:00:00,2015-09-28T15:15:00,2015-09-28,14:00:00,15:15:00
Fall2015,75222,DSB 112,R,2015-10-01T14:00:00,2015-10-01T15:15:00,2015-10-01,14:00:00,15:15:00
Fall2015,75222,DSB 112,M,2015-10-05T14:00:00,2015-10-05T15:15:00,2015-10-05,14:00:00,15:15:00
