In [1]:
# Dependencies
import os
import re
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import types
from random import randrange, uniform


In [2]:
# connect to database
# SQLAlchemy engine for the SQLite file `gradebook.db` (the three-slash URL is a
# path relative to the working directory); echo=False turns off SQL statement logging.
engine = create_engine('sqlite:///gradebook.db', echo=False)

### Test process of preparing data for db import

In [3]:
# Preview the raw cohort CSV with pandas' default parsing (no date handling yet)
df = pd.read_csv(filepath_or_buffer="./gradebook_csv/cohort.csv")
df.head()

Unnamed: 0,cohort_id,cohort_name,location,start_date,end_date
0,1,DS_Spring2021,Minnesota,2/5/2021,8/5/2021
1,2,DS_Fall2021,Northwestern,8/20/2021,2/20/2022
2,3,DS_Spring2022,Northwestern,03/05/2022,09/05/2022


In [4]:
# check data types
# Inspect the dtype pandas inferred for each column of the cohort frame read above
df.dtypes

cohort_id       int64
cohort_name    object
location       object
start_date     object
end_date       object
dtype: object

In [5]:
# Re-read the cohort file, this time asking pandas to parse the two date columns
df = pd.read_csv(
    filepath_or_buffer="./gradebook_csv/cohort.csv",
    parse_dates=['start_date', 'end_date'],
)
df.head()

Unnamed: 0,cohort_id,cohort_name,location,start_date,end_date
0,1,DS_Spring2021,Minnesota,2021-02-05,2021-08-05
1,2,DS_Fall2021,Northwestern,2021-08-20,2022-02-20
2,3,DS_Spring2022,Northwestern,2022-03-05,2022-09-05


In [6]:
# check datatypes
# Re-inspect the column dtypes after re-reading the file with parse_dates
df.dtypes

cohort_id               int64
cohort_name            object
location               object
start_date     datetime64[ns]
end_date       datetime64[ns]
dtype: object

In [7]:
# detect data files in folder
# os.listdir returns every entry name in ./gradebook_csv, not only the CSV files
file_list = os.listdir("./gradebook_csv")
file_list

['.ipynb_checkpoints',
 'attendance.csv',
 'cohort.csv',
 'email.csv',
 'feedback.csv',
 'github.csv',
 'grade.csv',
 'gradebook - cohort.csv',
 'gradebook - email.csv',
 'gradebook - github.csv',
 'gradebook - person.csv',
 'gradebook - student.csv',
 'person.csv',
 'session.csv',
 'student.csv',
 'submission.csv',
 'submission2.csv',
 'survey.csv',
 'surveyset.csv',
 'unit.csv',
 'unit2.csv',
 'week.csv']

In [8]:
# can I automate the loading of the data?
# create search criteria for getting files without including the 'gradebook' files
# (the pattern is anchored with ^, so it matches names that start with "gradebook")
seek = re.compile(r'^gradebook')

In [9]:
# Keep only the entries whose name does not match the `seek` pattern at its start
files = [name for name in file_list if seek.match(name) is None]
files

['.ipynb_checkpoints',
 'attendance.csv',
 'cohort.csv',
 'email.csv',
 'feedback.csv',
 'github.csv',
 'grade.csv',
 'person.csv',
 'session.csv',
 'student.csv',
 'submission.csv',
 'submission2.csv',
 'survey.csv',
 'surveyset.csv',
 'unit.csv',
 'unit2.csv',
 'week.csv']

#### Not going to automate the loading, since running this file should be a very rare occasion

# Add data to tables and view output
Due to foreign keys, tables must be added in a specific order
(I) stands for independent table (no foreign keys)

In [10]:
# AttendanceStatus (I)
# create a table for binning in queries; used in attendance queries
# Each label is paired with a numeric bin value. "Unsubmitted" has no bin value:
# use None (a missing value) rather than "" so the column stays numeric; an
# empty string would turn the whole column into object dtype and it would be
# written to the database as text.
df = pd.DataFrame({
    "label": ["Unsubmitted", "Very Early", "Early", "On-time", "Late", "Very Late"],
    "bin": [None, -21, -4, -2, 2, 9],
})
df.to_sql('attendance_status', con=engine, if_exists='replace')
df.head()

Unnamed: 0,label,bin
0,Unsubmitted,
1,Very Early,-21.0
2,Early,-4.0
3,On-time,-2.0
4,Late,2.0


In [11]:
# Cohort (I)
# Load the cohort CSV with both date columns parsed, then replace the `cohort`
# table, writing those two columns with the SQL Date type.
df = pd.read_csv(
    "./gradebook_csv/cohort.csv",
    parse_dates=['start_date', 'end_date'],
)
df.to_sql(
    name='cohort',
    con=engine,
    if_exists='replace',
    dtype={
        'start_date': types.Date(),
        'end_date': types.Date(),
    },
)
df.head()

Unnamed: 0,cohort_id,cohort_name,location,start_date,end_date
0,1,DS_Spring2021,Minnesota,2021-02-05,2021-08-05
1,2,DS_Fall2021,Northwestern,2021-08-20,2022-02-20
2,3,DS_Spring2022,Northwestern,2022-03-05,2022-09-05


In [12]:
# Person (I)
# Load the person CSV as-is and replace the `person` table with its contents.
df = pd.read_csv(filepath_or_buffer="./gradebook_csv/person.csv")
df.to_sql(
    name='person',
    con=engine,
    if_exists='replace',
)
df.head()

Unnamed: 0,person_id,name,role
0,1,Pia Emery,student
1,2,Adriana Kent,student
2,3,Jareth Rios,student
3,4,Agnes Tomlinson,student
4,5,Ansh Melton,student


In [13]:
# Email
# Load the email CSV with `date_added` parsed, then replace the `email` table,
# writing `date_added` with the SQL Date type.
df = pd.read_csv(
    "./gradebook_csv/email.csv",
    parse_dates=['date_added'],
)
df.to_sql(
    name='email',
    con=engine,
    if_exists='replace',
    dtype={'date_added': types.Date()},
)
df.head()

Unnamed: 0,email_id,person_id,email,date_added
0,1,1,curt83@marquardt.com,2021-03-05
1,2,2,frunolfsson@hotmail.com,2021-03-05
2,3,3,hgrant@yahoo.com,2021-03-05
3,4,4,kihn.roslyn@hotmail.com,2021-03-05
4,5,5,gertrude.brekke@hotmail.com,2021-03-05


In [14]:
# Github
# Load the github CSV as-is and replace the `github` table with its contents.
df = pd.read_csv(filepath_or_buffer="./gradebook_csv/github.csv")
df.to_sql(
    name='github',
    con=engine,
    if_exists='replace',
)
df.head()

Unnamed: 0,person_id,github_username
0,1,curt8
1,2,fruno
2,3,hgran
3,4,kihn.
4,5,gertr


In [15]:
# Student
# Load the student CSV as-is and replace the `student` table with its contents.
df = pd.read_csv(filepath_or_buffer="./gradebook_csv/student.csv")
df.to_sql(
    name='student',
    con=engine,
    if_exists='replace',
)
df.head()

Unnamed: 0,student_id,person_id,cohort_id
0,1,1,1
1,2,2,1
2,3,3,1
3,4,4,1
4,5,5,1


In [16]:
# Unit
# Using Unit2 file after some corrections
# Both date columns are parsed on read, and both must be named in the dtype
# mapping to be written with the SQL Date type; a column left out of the
# mapping gets whatever type pandas infers for it.
# NOTE(review): the parsed values carry a time of day (see the preview below);
# confirm that storing only the calendar date is intended.
df2 = pd.read_csv("./gradebook_csv/unit2.csv", parse_dates=['unit_start', 'unit_due'])
df2.to_sql(
    'unit',
    con=engine,
    if_exists='replace',
    dtype={'unit_start': types.Date(), 'unit_due': types.Date()},
)
df2.head()

Unnamed: 0,unit_id,cohort_id,unit_number,unit_name,unit_start,unit_due,hw_submissions,students_enrolled,unit_required,context_code,career_assignment_bool,submitted_assignments,graded_assignments
0,211085,1,1,0: Data Prework,2021-02-03 06:00:00,2021-02-04 05:59:59,12.0,28.0,False,academic,False,hold,12.0
1,212070,1,2,Intro to Career Services: Employer-Ready vs Em...,2021-02-03 06:00:00,2021-02-11 05:59:59,0.0,28.0,False,career,True,hold,0.0
2,213030,1,3,1. Excel Homework,2021-02-02 06:00:00,2021-02-14 05:59:59,23.0,28.0,True,academic,False,hold,23.0
3,214096,1,4,2. Visual Basic Homework,2021-02-09 06:00:00,2021-02-21 04:59:59,23.0,26.0,True,academic,False,hold,23.0
4,215023,1,5,3. Python Homework,2021-02-16 06:00:00,2021-02-28 05:59:59,23.0,26.0,True,academic,False,hold,23.0


In [17]:
# Session
# Load the session CSV with `session_start` parsed, then replace the `session`
# table, writing `session_start` with the SQL Date type.
df = pd.read_csv(
    "./gradebook_csv/session.csv",
    parse_dates=['session_start'],
)
df.to_sql(
    name='session',
    con=engine,
    if_exists='replace',
    dtype={'session_start': types.Date()},
)
df.head()

Unnamed: 0,cohort_id,session_id,session_name,unit_id,session_number,session_chapter,session_start,zoom_url
0,3,2671305,Zen of Data,1,1,1.1,2022-03-02,https://zoom.us/rec/share/mu17gMzpQnEpq7WOl4CSe01
1,3,2671307,Data Fundamentals in Excel,1,2,1.2,2022-03-04,https://zoom.us/rec/share/mu17gMzpQnEpq7WOl4CSe02
2,3,2671301,Charting a New Course With Excel,1,3,1.3,2022-03-05,https://zoom.us/rec/share/mu17gMzpQnEpq7WOl4CSe03
3,3,2671303,Fundamentals of Programming Using Visual Basic...,2,1,2.1,2022-03-09,https://zoom.us/rec/share/mu17gMzpQnEpq7WOl4CSe04
4,3,2671297,Fundamentals of Programming Using Visual Basic...,2,2,2.2,2022-03-11,https://zoom.us/rec/share/mu17gMzpQnEpq7WOl4CSe05


In [18]:
# Survey (I)
# Load the survey questions with `date_added` parsed, then replace the `survey`
# table, writing `date_added` with the SQL Date type.
df = pd.read_csv(
    "./gradebook_csv/survey.csv",
    parse_dates=['date_added'],
)
df.to_sql(
    name='survey',
    con=engine,
    if_exists='replace',
    dtype={'date_added': types.Date()},
)
df.head()

Unnamed: 0,question_id,question_text,date_added
0,1,How would you rate your overall satisfaction w...,2020-03-03
1,2,How would you rate the pace of class? (1 - too...,2020-03-03
2,3,How satisfied were you with the level of acade...,2020-03-03
3,4,Do you think you are prepared to apply what yo...,2020-03-03
4,5,How engaging was your instructor this past wee...,2020-03-03


In [19]:
# SurveySet
# Load the surveyset CSV as-is and replace the `surveyset` table with its contents.
df = pd.read_csv(filepath_or_buffer="./gradebook_csv/surveyset.csv")
df.to_sql(
    name='surveyset',
    con=engine,
    if_exists='replace',
)
df.head()

Unnamed: 0,feedback_id,question_id,question_number
0,1,1,1
1,1,2,2
2,1,3,3
3,1,4,4
4,1,5,5


In [20]:
# Week
# Load the week CSV with `week_end_date` parsed, then replace the `week` table,
# writing `week_end_date` with the SQL Date type.
df = pd.read_csv(
    "./gradebook_csv/week.csv",
    parse_dates=['week_end_date'],
)
df.to_sql(
    name='week',
    con=engine,
    if_exists='replace',
    dtype={'week_end_date': types.Date()},
)
df.head()

Unnamed: 0,week_id,cohort_id,week_end_date,current_unit,previous_session
0,1,1,2021-02-11,1,3
1,2,1,2021-02-17,2,3
2,3,1,2021-02-23,3,3
3,4,1,2021-03-01,4,3
4,5,1,2021-03-07,5,3


In [21]:
# Feedback
# Load the feedback CSV with `submission_date` parsed, then replace the
# `feedback` table, writing `submission_date` with the SQL Date type.
df = pd.read_csv(
    "./gradebook_csv/feedback.csv",
    parse_dates=['submission_date'],
)
df.to_sql(
    name='feedback',
    con=engine,
    if_exists='replace',
    dtype={'submission_date': types.Date()},
)
df.head()

Unnamed: 0,feedback_id,student_id,week,submission_date,overall_satisfaction,academic_support,outside_class_productivity,pace,instructor_engagement,instructor_clarity,instructor_knowledge,homework_feedback,outside_class_time_spent,class_comments,instructional_support_comments
0,1,29,1,2021-08-22,5,3,5,5,5,5,5,0,1,,
1,1,30,1,2021-08-22,5,3,5,4,5,5,5,0,1,,
2,1,31,1,2021-08-23,4,2,5,5,5,5,5,0,1,,
3,1,32,1,2021-08-23,5,3,5,2,5,5,5,3,1,,
4,1,33,1,2021-08-23,4,5,4,4,5,5,4,3,2,,


In [22]:
# Attendance
# Read the attendance CSV, keep only the four columns written to the database,
# and replace the `attendance` table with them.
df = pd.read_csv("./gradebook_csv/attendance.csv")[
    ['session_id', 'student_id', 'present', 'pending']
]
df.to_sql(name='attendance', con=engine, if_exists='replace')
df.head()

Unnamed: 0,session_id,student_id,present,pending
0,2571297,30,early,
1,2571297,31,early,
2,2571297,32,early,
3,2571297,33,early,
4,2571297,34,intime,


In [23]:
# Grade
# Read the grade CSV, keep only the four columns written to the database,
# and replace the `grade` table with them.
df = pd.read_csv("./gradebook_csv/grade.csv")[
    ['unit_id', 'student_id', 'hw_submitted', 'hw_grade']
]
df.to_sql(name='grade', con=engine, if_exists='replace')
df.head()

Unnamed: 0,unit_id,student_id,hw_submitted,hw_grade
0,211085,1,True,A+
1,212070,1,True,A+
2,213030,1,True,A+
3,214096,1,True,A+
4,215023,1,True,A+


In [24]:
# # Arrival
# NOTE: the arrival import is disabled (the code below is commented out);
# arrival.csv does not appear in the folder listing earlier in this notebook.
# df = pd.read_csv("./gradebook_csv/arrival.csv")
# df.head()

In [25]:
# Submissions
# Read submission2.csv with `submission_date` parsed and the `email` column
# removed, then replace the `submission` table, writing `submission_date` with
# the SQL Date type.
df3 = pd.read_csv(
    "./gradebook_csv/submission2.csv",
    parse_dates=['submission_date'],
).drop(columns=['email'])
df3.to_sql(
    name='submission',
    con=engine,
    if_exists='replace',
    dtype={'submission_date': types.Date()},
)
df3.head()

Unnamed: 0,unit_id,student_id,first_name,last_name,submission_status,submission_date,submission_notes,grade,feedback,plagiarism
0,211085,1,Pia,Emery,True,2021-02-04 05:59:59,,A+,Good Job.,NO
1,212070,1,Pia,Emery,True,2021-02-11 05:59:59,,A+,Everything is complete,NO
2,213030,1,Pia,Emery,True,2021-02-14 05:59:59,,A+,Everything is complete,NO
3,214096,1,Pia,Emery,True,2021-02-21 04:59:59,,A+,Good Job.,NO
4,215023,1,Pia,Emery,True,2021-02-28 05:59:59,,A+,Everything is complete,NO


In [26]:
# Release the engine's pooled database connections once the table writes above are done
engine.dispose()

# Add DF to SQLite

In [30]:
# Note: the commands for updating the database were added to each cell above; below is the original test case.
# Data types:  https://stackoverflow.com/questions/34383000/pandas-to-sql-all-columns-as-nvarchar 

# df.to_sql('email', con=engine, if_exists='append', dtype={'date': types.Date()})
# engine.dispose()
# TODO: need to update date to datetime on some of the entries