## Cleaning

In [23]:
%%capture

%run './lib/init.ipynb'
from lib import utilities as util
from pyspark.sql import functions as F
import pandas as pd

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Reuse the SparkContext that is already running, if any, instead of starting a second one.
sc = SparkContext.getOrCreate()
# builder.getOrCreate() is the documented entry point: it returns the active
# SparkSession when one exists and only builds a new one (on the existing
# context) otherwise, rather than wrapping sc in an additional session object.
spark = SparkSession.builder.getOrCreate()

In [24]:
# Columns carried into the training / validation datasets (together with the target).
studyVars = [
    'item_type_code_name',
    'number_of_distinct_instance_items',
    'item_attempt_duration_mins',
    'student_duration_mins',
    'timeliness_duration_mins',
]

# Extra columns kept on cleanDf alongside studyVars; they are not selected
# when the data is split into training and validation sets.
cleaningVars = [
    'raw_score',
    'item_attempt_end_datetime_utc',
    'item_attempt_start_datetime_utc',
    'assessment_item_response_id',
]

# Previously saved frame this notebook starts from.
filterDf = util.load_df('filterDf')

###### Create dataset of study and cleaning variables (records before Feb 1 2020)

In [25]:
# Only records before Feb 1 2020
startedBeforeCutoff = F.col('student_start_datetime') < F.lit('2020-02-01')

# Keep the study variables plus the helper columns the cleaning steps work with.
cleanDf = (
    filterDf
    .filter(startedBeforeCutoff)
    .select(*studyVars, *cleaningVars)
)

# Baseline row count; each cleaning step is logged as a difference against it.
beforeCnt = cleanDf.count()
beforeCnt

43219

###### Cleaning Tasks

In [26]:
# List the cleaning todos; the False argument presumably selects the ones that
# are not finished yet (the recorded output shows 'finished': False) — verify in lib/utilities.
util.list_todo(False)

{'todo': 'Remove null values in response_correctness', 'finished': False}


##### Cleaning Before the Training/Validation Split

In [27]:
# Empty log of cleaning steps; update_status adds one row per completed step.
stepsDf = pd.DataFrame(
    columns=[
        'description',
        'removed',
        'imputed',
        'data',
        'function',
    ]
)

###### Remove [unassigned] in response_correctness

In [28]:
# Clean: rebind cleanDf to the result of util.remove_unassigned_response_correctness
cleanDf = util.remove_unassigned_response_correctness(cleanDf)

# Row count after this step; beforeCnt - afterCnt is logged as the number of rows removed.
afterCnt = cleanDf.count()
afterCnt

42303

In [29]:
# Update status
def update_status(df, description, function, removed, imputed, data = 'all'):
    """Mark a cleaning todo as finished and log it as a new row of the steps table.

    Args:
        df: pandas DataFrame holding the steps logged so far. It is not modified.
        description: Description of the step; also passed to ``util.finish_todo``.
        function: Name of the function that performed the step.
        removed: Number of rows the step removed.
        imputed: Number of rows the step imputed.
        data: Label of the dataset the step was applied to (default ``'all'``).

    Returns:
        A new DataFrame containing the rows of ``df`` followed by one row for
        this step, with the index renumbered from 0.
    """
    util.finish_todo(description)

    row = {
        'description': description,
        'removed': removed,
        'imputed': imputed,
        'function': function,
        'data': data,
    }
    new_row = pd.DataFrame([row])

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    if len(df) == 0:
        # Nothing to concatenate with yet: return the single row laid out in
        # df's column order. This also avoids concatenating an empty frame,
        # which newer pandas versions warn about.
        extra = [col for col in new_row.columns if col not in df.columns]
        return new_row.reindex(columns=list(df.columns) + extra)

    return pd.concat([df, new_row], ignore_index=True)


# Log the step with the number of rows it dropped; nothing was imputed.
stepsDf = update_status(
    stepsDf,
    description="Remove [unassigned] in response_correctness",
    function='remove_unassigned_response_correctness',
    removed=beforeCnt - afterCnt,
    imputed=0,
)
# The current count becomes the baseline for the next step.
beforeCnt = afterCnt

Finished: Remove [unassigned] in response_correctness


###### Remove student start or stop null dates

In [30]:
# Clean: rebind cleanDf to the result of util.remove_null_student_dates
cleanDf = util.remove_null_student_dates(cleanDf)

# Row count after this step; beforeCnt - afterCnt is logged as the number of rows removed.
afterCnt = cleanDf.count()
afterCnt

42303

In [31]:
# Update status

# Log the step with the number of rows it dropped; nothing was imputed.
stepsDf = update_status(
    stepsDf,
    description="Remove student start or stop null dates",
    function='remove_null_student_dates',
    removed=beforeCnt - afterCnt,
    imputed=0,
)
# The current count becomes the baseline for the next step.
beforeCnt = afterCnt

Finished: Remove student start or stop null dates


###### Remove attempt stop dates before start date

In [32]:
# Clean: rebind cleanDf to the result of util.remove_attempt_stop_dates_before_start_dates
cleanDf = util.remove_attempt_stop_dates_before_start_dates(cleanDf)

# Row count after this step; beforeCnt - afterCnt is logged as the number of rows removed.
afterCnt = cleanDf.count()
afterCnt

42295

In [33]:
# Update status

# Log the step with the number of rows it dropped; nothing was imputed.
stepsDf = update_status(
    stepsDf,
    description="Remove attempt stop dates before start date",
    function='remove_attempt_stop_dates_before_start_dates',
    removed=beforeCnt - afterCnt,
    imputed=0,
)
# The current count becomes the baseline for the next step.
beforeCnt = afterCnt

Finished: Remove attempt stop dates before start date


###### Create target variable from raw_score

In [34]:
# Rebind cleanDf to the output of util.create_target_var_from_raw_score.
# The split below selects a 'target' column — presumably the one added here
# from raw_score; confirm in lib/utilities.
cleanDf = util.create_target_var_from_raw_score(cleanDf)

##### Create training and validation datasets

In [35]:
# Keep only the target and the study variables; the cleaning-only columns are dropped here.
modelDf = cleanDf.select('target', *studyVars)

# 75% / 25% random split into training and validation, using a fixed seed.
covidTrnDf, covidValDf = modelDf.randomSplit([0.75, 0.25], seed=11112020)

##### Training count

In [36]:
# Number of rows that landed in the training split.
covidTrnDf.count()

31847

##### Validation count

In [37]:
# Number of rows that landed in the validation split.
covidValDf.count()

10448

###### Imputation function

In [38]:
def post_split_imputation(daDf, stDf, data):
    """Apply the post-split transformations to one split and log each step.

    Runs ``util.add_swoe`` on ``item_type_code_name`` and then, in order, the
    student, timeliness and item-attempt duration imputations. After every
    transformation a row is added to the steps table through ``update_status``.

    NOTE(review): every transformation receives only ``daDf``, so whatever
    ``util`` fits (SWOE levels, outlier bounds) appears to be derived from the
    split passed in, including the validation split — confirm this is intended.

    Args:
        daDf: Spark DataFrame for one split; it must contain ``target`` and
            ``item_type_code_name``.
        stDf: Steps table that is passed to and returned from ``update_status``.
        data: Label stored in the ``data`` column of every logged row.

    Returns:
        Tuple ``(daDf, stDf)``: the transformed split and the extended steps table.
    """
    # Level reduction; logged with zero removed and zero imputed rows.
    # The meaning of the 24 is defined by util.add_swoe and is not visible here.
    daDf = util.add_swoe(daDf, 'target', 'item_type_code_name', 24)
    stDf = update_status(
        stDf,
        "Reduce number of levels in item_type_code_name",
        'add_swoe',
        0,
        0,
        data
    )

    # (util function name, logged description, boolean flag column it adds)
    imputations = (
        (
            'impute_student_duration',
            "Impute max durations outliers in student attempt times",
            'student_duration_mins_imputed',
        ),
        (
            'impute_timeliness_duration',
            "Impute timeliness durations min and max outliers",
            'timeliness_duration_mins_imputed',
        ),
        (
            'impute_item_attempt_duration',
            "Impute max durations outliers in item attempt times",
            'item_attempt_duration_mins_imputed',
        ),
    )

    for fnName, description, flagCol in imputations:
        # The logged function name is also the name of the util function to call.
        daDf = getattr(util, fnName)(daDf)
        # Rows whose flag column is true are the ones this step imputed.
        imputedCnt = daDf.filter( F.col(flagCol) ).count()
        stDf = update_status(stDf, description, fnName, 0, imputedCnt, data)

    return daDf, stDf

###### Impute Training Data

In [39]:
# Transform the training split and log the steps under the 'train' label.
covidTrnDf, stepsDf = post_split_imputation(covidTrnDf, stepsDf, 'train')

Finished: Reduce number of levels in item_type_code_name
Finished: Impute max durations outliers in student attempt times
Finished: Impute timeliness durations min and max outliers
Finished: Impute max durations outliers in item attempt times


###### Impute Validation Data

In [40]:
# Transform the validation split and log the steps under the 'valid' label.
# NOTE(review): only covidValDf is passed in, so the transformations appear to be
# derived from the validation data itself rather than from the training split — confirm.
covidValDf, stepsDf = post_split_imputation(covidValDf, stepsDf, 'valid')

Finished: Reduce number of levels in item_type_code_name
Finished: Impute max durations outliers in student attempt times
Finished: Impute timeliness durations min and max outliers
Finished: Impute max durations outliers in item attempt times


###### Imputation Counts

In [41]:
# Boolean flag columns added by the three duration imputations.
studentFlag = F.col('student_duration_mins_imputed')
timelinessFlag = F.col('timeliness_duration_mins_imputed')
attemptFlag = F.col('item_attempt_duration_mins_imputed')

# Rows touched by at least one imputation vs. rows touched by all three.
anyImputed = studentFlag | timelinessFlag | attemptFlag
allImputed = studentFlag & timelinessFlag & attemptFlag

for splitLabel, splitDf in (("Training:", covidTrnDf), ("Validation:", covidValDf)):
    print (splitLabel)
    total = splitDf.count()
    print ("  total:", total)
    for countLabel, condition in (("  observations:", anyImputed), ("  common:", allImputed)):
        cnt = splitDf.filter(condition).count()
        # Share of the split, rounded to a whole percent.
        print (countLabel, cnt, round(cnt/total*100), '%')

Training:
  total: 31847
  observations: 10187 32 %
  common: 192 1 %
Validation:
  total: 10448
  observations: 3305 32 %
  common: 11 0 %


###### Cleaning Steps

In [42]:
# Display the log of cleaning steps: one row per step and dataset label.
stepsDf

Unnamed: 0,description,removed,imputed,data,function
0,Remove [unassigned] in response_correctness,916,0,all,remove_unassigned_response_correctness
1,Remove student start or stop null dates,0,0,all,remove_null_student_dates
2,Remove attempt stop dates before start date,8,0,all,remove_attempt_stop_dates_before_start_dates
3,Reduce number of levels in item_type_code_name,0,0,train,add_swoe
4,Impute max durations outliers in student attem...,0,7335,train,impute_student_duration
5,Impute timeliness durations min and max outliers,0,2163,train,impute_timeliness_duration
6,Impute max durations outliers in item attempt ...,0,4582,train,impute_item_attempt_duration
7,Reduce number of levels in item_type_code_name,0,0,valid,add_swoe
8,Impute max durations outliers in student attem...,0,2472,valid,impute_student_duration
9,Impute timeliness durations min and max outliers,0,428,valid,impute_timeliness_duration


In [43]:
# Save both splits under their variable names.
util.save_df(covidValDf, 'covidValDf')
util.save_df(covidTrnDf, 'covidTrnDf')

# The steps log is a pandas frame; convert it to a Spark DataFrame before saving.
stepsSparkDf = spark.createDataFrame(stepsDf)
util.save_df(stepsSparkDf, 'stepsDf')