## Data Extraction

###### Python Initialization

In [17]:
import os
import math

import pandas as pd
import numpy as np

from pyspark import SparkFiles
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Obtain a SparkContext via getOrCreate() so re-running this cell does not
# try to construct a second context in the same kernel.
sc = SparkContext.getOrCreate()
# Build the SparkSession on top of that context; `spark` is the entry point
# used below for reading the CSV into a DataFrame.
spark = SparkSession(sc)

###### Import CSV from URL into Spark Dataframe

In [18]:
# Fetch the study CSV by registering its URL with the SparkContext
url = 'https://github.com/dlhinkley/c772-capstone-project/raw/master/data/assessment_items.csv'
sc.addFile(url)

# Resolve the local copy registered under the URL's base name and load it,
# treating the first row as the header and letting Spark infer column types
filename = os.path.basename(url)
file = "file://{}".format(SparkFiles.get(filename))
rawDf = (
    spark.read
    .options(header='true', inferSchema='true')
    .csv(file)
)

###### Imported Schema

In [19]:
# Show the column names and the types inferred while reading the CSV
rawDf.printSchema()

root
 |-- ced_assignment_type_code: string (nullable = true)
 |-- is_affecting_grade: boolean (nullable = true)
 |-- number_of_learners: integer (nullable = true)
 |-- number_of_distinct_instance_items: integer (nullable = true)
 |-- assignment_max_attempts: integer (nullable = true)
 |-- assignment_late_submission: boolean (nullable = true)
 |-- assignment_final_submission_date: string (nullable = true)
 |-- assignment_start_date: string (nullable = true)
 |-- assignment_due_date: string (nullable = true)
 |-- min_student_start_datetime: string (nullable = true)
 |-- max_student_stop_datetime: string (nullable = true)
 |-- assignment_attempt_number: integer (nullable = true)
 |-- was_fully_scored_datetime: string (nullable = true)
 |-- was_submitted_datetime_actual: string (nullable = true)
 |-- was_in_progress_datetime: string (nullable = true)
 |-- is_force_scored: boolean (nullable = true)
 |-- is_manual_scoring_required: boolean (nullable = true)
 |-- student_start_datetime: strin

- Datetime variables are string type instead of timestamp

###### Convert datetime variable types to timestamp

In [20]:
# Names of the date/datetime columns that were imported as strings
intervalVars = [
    "assignment_due_date",
    "assignment_final_submission_date",
    "assignment_start_date",
    "item_attempt_end_datetime_utc",
    "item_attempt_start_datetime_utc",
    "max_student_stop_datetime",
    "min_student_start_datetime",
    "scored_datetime",
    "student_start_datetime",
    "student_stop_datetime",
    "was_fully_scored_datetime",
    "was_in_progress_datetime",
    "was_submitted_datetime_actual"
]

# Replace each string column with its timestamp-typed equivalent,
# keeping the column name unchanged
timestampType = T.TimestampType()
for dateVar in intervalVars:
    rawDf = rawDf.withColumn(dateVar, rawDf[dateVar].cast(timestampType))


###### Descriptive Statistics of Continuous and Nominal Variables

In [21]:
# Show the summary statistics as several narrow tables (at most four
# variables each) rather than one very wide table
desc = rawDf.describe()

# Variables to report: everything except the 'summary' label column and
# the id variables, in alphabetical order
cols = sorted(
    name for name in desc.columns
    if name != 'summary' and '_id' not in name
)

# Number of tables needed when each holds up to four variables
num = math.ceil(len(cols) / 4)

# Print one table per group of variables
for group in np.array_split(cols, num):
    desc.select("summary", *group).show(10, False)

+-------+--------------------+-------------------------+-----------------------+------------------------+
|summary|assigned_item_status|assignment_attempt_number|assignment_max_attempts|ced_assignment_type_code|
+-------+--------------------+-------------------------+-----------------------+------------------------+
|count  |149807              |149807                   |149807                 |149807                  |
|mean   |null                |0.9895465498941972       |0.9895465498941972     |null                    |
|stddev |null                |0.10170665925672466      |0.10170665925672466    |null                    |
|min    |assigned            |0                        |0                      |assessment              |
|max    |scored              |1                        |1                      |rubric                  |
+-------+--------------------+-------------------------+-----------------------+------------------------+

+-------+----------------------+-------------

- Data looks fine

###### Display Statistics for Interval Variables

In [22]:
# Display Pandas dataframe of datetime statistics

# Compute the earliest and latest calendar date of every datetime variable
# in one aggregation (a single pass over the data).
# The result is stored under its own name: binding it to `min` / `max`
# would replace the Python builtins for the rest of the kernel session.
dateStats = rawDf.agg(
    *[F.min(F.col(c).cast(T.DateType())).alias("min_" + c) for c in intervalVars],
    *[F.max(F.col(c).cast(T.DateType())).alias("max_" + c) for c in intervalVars]
).first()

# Display in pandas dataframe: one row per variable, columns "min" and "max"
pd.DataFrame({
    "min": [dateStats["min_" + c] for c in intervalVars],
    "max": [dateStats["max_" + c] for c in intervalVars],
}, index=intervalVars)

Unnamed: 0,min,max
assignment_due_date,2019-08-23,2999-01-01
assignment_final_submission_date,2019-08-23,2999-01-01
assignment_start_date,1900-01-01,2020-05-22
item_attempt_end_datetime_utc,2019-08-21,2999-01-01
item_attempt_start_datetime_utc,1900-01-01,2020-05-26
max_student_stop_datetime,2019-08-22,2999-01-01
min_student_start_datetime,1900-01-01,2020-05-22
scored_datetime,2019-08-21,2998-12-31
student_start_datetime,1900-01-01,2020-05-26
student_stop_datetime,2019-08-21,2999-01-01


- Datetime variables have years of 1900, 2999 and 2998
- These are defaults used when no date exists
- Change to NULL

###### Change Default Dates to Null

In [23]:
# Set default date values to null (placeholder years 1900, 2998 and 2999).
#
# A value is treated as a placeholder when it lies more than
# DATE_WINDOW_MONTHS months before or after a FIXED reference date.
# The window must not be anchored to the current time: the result would then
# depend on the day the notebook is run, and once the run date is more than
# 30 months past the data every genuine date would be nulled as well.
# NOTE(review): the reference date is set just after the latest genuine date
# shown in the statistics above (2020-05-26) - adjust if the data is refreshed.
DATE_REFERENCE = F.lit("2020-06-01 00:00:00").cast(T.TimestampType())
DATE_WINDOW_MONTHS = 30

for f in intervalVars:
    # Null the value when it falls outside the window, otherwise keep it
    rawDf = rawDf.withColumn(f,
         F.when(
             F.abs(F.months_between(F.col(f), DATE_REFERENCE)) > DATE_WINDOW_MONTHS,
             None
         ).otherwise(F.col(f))
      )

###### Keep Only Answered And Fully Scored
- Study is focused on fully scored completed questions
- Filter out other observations

In [24]:
# Remember the row count before filtering so the retained share can be reported
originalCnt = rawDf.count()

# Keep a row only when the question was answered
# (assessment_item_response_id is present) and the attempt is fully scored
# (learner_attempt_status = 'fully scored')
hasResponse = F.col('assessment_item_response_id').isNotNull()
isFullyScored = F.col('learner_attempt_status') == 'fully scored'
rawDf = rawDf.filter(hasResponse & isFullyScored)

###### Number of observations

In [25]:
# Rows left after filtering, and their share of the original rows
filterCnt = rawDf.count()
remainingPct = round(filterCnt / originalCnt * 100)

print("Original Observations:", originalCnt)
print("Remaining Observations:", filterCnt)
print("Remaining Percent:", remainingPct, "%")

Original Observations: 149807
Remaining Observations: 64838
Remaining Percent: 43 %


###### Add duration variables

In [26]:
def minutesBetween(endVar, startVar):
    """Return a Column holding (endVar - startVar) expressed in minutes.

    Both columns are cast to long before subtracting, and the difference
    is divided by 60.
    """
    elapsed = F.col(endVar).cast('long') - F.col(startVar).cast('long')
    return elapsed / 60


# Add three duration columns:
#  - time spent on each item attempt
#  - time between the student's start and stop
#  - time from the student's start until the assignment due date
rawDf = (
    rawDf
    .withColumn(
        'item_attempt_duration_mins',
        minutesBetween('item_attempt_end_datetime_utc', 'item_attempt_start_datetime_utc'))
    .withColumn(
        'student_duration_mins',
        minutesBetween('student_stop_datetime', 'student_start_datetime'))
    .withColumn(
        'timeliness_duration_mins',
        minutesBetween('assignment_due_date', 'student_start_datetime'))
)

###### Create dataset of only study variables

In [27]:
# Duration variables computed in the previous step
derivedVars = [
    'item_attempt_duration_mins',
    'student_duration_mins',
    'timeliness_duration_mins',
]
# Variables taken directly from the imported data
recordedVars = [
    'item_type_code_name',
    'number_of_distinct_instance_items',
    'raw_score',
]
# Datetime columns the durations were derived from; kept for investigation
sourceDateVars = [
    'assignment_due_date',
    'student_start_datetime',
    'student_stop_datetime',
    'item_attempt_start_datetime_utc',
    'item_attempt_end_datetime_utc',
]

studyVars = derivedVars + recordedVars + sourceDateVars
studyDf = rawDf.select(studyVars)

###### Number of complete observations

In [28]:
# Count the rows that have a value for every study variable.
# dropna() keeps duplicate rows, so this is a true per-row count.
# (DataFrame.subtract is EXCEPT DISTINCT: it collapses identical incomplete
# rows into one, which under-counts missing rows and overstates completeness.)
completeCnt = studyDf.dropna().count()
missingCnt = filterCnt - completeCnt

print ("Number Complete:", completeCnt)
print ("Percent Complete:", round(completeCnt / filterCnt * 100), "%")

Number Complete: 62827
Percent Complete: 97 %


In [29]:
# Persist the study dataset for reuse: collapse it to one partition
# and replace any previous copy at the target path.
# NOTE(review): the target directory is ".data" (leading dot) - confirm that
# "./data" was not intended before changing it, since readers of this file
# must use the same path.
studyWriter = studyDf.repartition(1).write.mode('overwrite')
studyWriter.parquet(".data/studyDf.parquet")