## Cleaning

In [1]:
# Execute the shared helper notebook in this kernel. Helpers used below (load_df, finish_todo, ...)
# are not defined in this notebook, so they presumably come from here — confirm.
%run './lib/libraries.ipynb'

dython              0.6.1
tinydb              4.2.0


In [2]:
# Starting frame for all cleaning steps. load_df is an external helper; presumably it loads a
# previously saved DataFrame by name ('dfFlt') — confirm.
dfClean  = load_df('dfFlt')

###### Remove variables assignment_late_submission, is_manual_scoring_required and is_deleted
Do this early so they don't get in the way

In [None]:
# Drop the three columns named in the section heading and rebind dfClean to the result.
dfClean = dfClean.drop('assignment_late_submission', 'is_manual_scoring_required', 'is_deleted')

In [None]:
# finish_todo is an external helper; presumably it marks the to-do entry with this exact text as done — confirm.
finish_todo('Remove variables assignment_late_submission, is_manual_scoring_required and is_deleted')

###### Remove unary learner_attempt_status
Do this early so they don't get in the way

In [None]:
# Drop the learner_attempt_status column (described as unary in the heading) and rebind dfClean.
dfClean = dfClean.drop('learner_attempt_status')

In [None]:
# Presumably records the matching to-do entry as done (external helper) — confirm.
finish_todo('Remove unary learner_attempt_status = "fully scored"')



###### Remove 1566 [unassigned] observations in response_correctness

In [None]:
# Make sure to preserve the nulls
# NOTE(review): the result is bound to dfFlt (not dfClean) and no frame is passed in, so unless the
# helper rebinds dfClean itself, dfClean still contains the [unassigned] rows after this cell —
# yet the next section relies on them being gone. Confirm whether
# `dfClean = remove_1566_unassigned(dfClean)` (or similar) was intended.
dfFlt = remove_1566_unassigned()

###### Remove unary variables assignment_attempt_number and assignment_max_attempts
After the 1566 [unassigned] observations in response_correctness are addressed, assignment_attempt_number and assignment_max_attempts are unary

In [None]:
# Drop both attempt-count columns from dfClean (described above as unary once the [unassigned] rows are removed).
dfClean = dfClean.drop("assignment_attempt_number", "assignment_max_attempts")


In [None]:
# Presumably records the two to-do entries covered by the previous two sections as done (external helper) — confirm.
finish_todo("Remove 1566 [unassigned] in response_correctness")
finish_todo('Remove assignment_attempt_number and assignment_max_attempts')


###### Remove response_correctness

In [None]:
# Drop the whole response_correctness column from dfClean.
dfClean = dfClean.drop('response_correctness')

In [None]:
# NOTE(review): the to-do text speaks of removing 3298 null values, while the cell above drops the
# entire response_correctness column — confirm this is the to-do entry that should be closed.
finish_todo('Remove 3298 null values in response_correctness')


### Reclassify categories in item_type_code_name

In [10]:
# Before Categories: list the distinct item_type_code_name values in sorted order
# (show(50, False) prints up to 50 rows without truncating the strings).
dfClean.select("item_type_code_name").distinct().orderBy("item_type_code_name").show(50, False)


Combine Suffix Levels
- Levels with the suffix Response (ex: FillinBlankResponse) are the same type of question as the level without the suffix (ex: fillInTheBlank)

In [12]:
# Fold each "...Response" level into its un-suffixed twin in a single pass:
#   FillinBlankResponse    -> fillInTheBlank
#   MultipleChoiceResponse -> multipleChoice
# Every other value (including null) is left as it is.
dfClean = dfClean.withColumn(
    "item_type_code_name",
    when(col("item_type_code_name") == "FillinBlankResponse", "fillInTheBlank")
    .when(col("item_type_code_name") == "MultipleChoiceResponse", "multipleChoice")
    .otherwise(col("item_type_code_name")),
)


In [13]:
# NOTE: this import rebinds the built-in round() to the Spark column function for the rest of the
# session; round is not used in this cell.
from pyspark.sql.functions import col, round

# Denominator for the percentages: rows whose item_type_code_name is populated.
tot = dfClean.filter(col("item_type_code_name").isNotNull()).count()

# Row count per level plus its share (in percent) of the non-null total, largest level first.
freqTable = (
    dfClean.groupBy("item_type_code_name")
    .count()
    .withColumnRenamed("count", "cnt_per_group")
    .withColumn("perc_of_count_total", col("cnt_per_group") / tot * 100)
    .orderBy("cnt_per_group", ascending=False)
)

freqTable.show(50, False)

We only want five levels, so convert everything below 6% to other

In [15]:
# Levels holding less than 6% of the non-null total are the ones to fold into "Other".
otherRows = freqTable.where("perc_of_count_total < 6")
otherLevels = [
    r["item_type_code_name"]
    for r in otherRows.select("item_type_code_name").collect()
]

# Null values and the small levels both become "Other"; the remaining levels keep their value.
dfClean = dfClean.withColumn(
    "item_type_code_name",
    when(
        col("item_type_code_name").isNull() | col("item_type_code_name").isin(otherLevels),
        "Other",
    ).otherwise(col("item_type_code_name")),
)

# Display new values
(
    dfClean.groupBy("item_type_code_name")
    .count()
    .orderBy("count", ascending=False)
    .show(50, False)
)


In [16]:
# Register (or replace) the temp view "clean_data" over the current dfClean so it can be queried with SQL.
dfClean.createOrReplaceTempView("clean_data")

In [17]:
# NOTE(review): dfDesc is not defined anywhere in this notebook — presumably it comes from the
# %run helper notebook; confirm this cell works on a fresh kernel.
# Prints up to 40 values of its 'field' column without truncation.
dfDesc.select('field').show(40, False)

### Reclassify categories in item_type_code_name

In [19]:
# Before Categories: list the distinct item_type_code_name values in sorted order
# (this section repeats the reclassification steps already run above).
dfClean.select("item_type_code_name").distinct().orderBy("item_type_code_name").show(50, False)


Combine Suffix Levels
- Levels with the suffix Response (ex: FillinBlankResponse) are the same type of question as the level without the suffix (ex: fillInTheBlank)

In [21]:
# Fold each "...Response" level into its un-suffixed twin in a single pass:
#   FillinBlankResponse    -> fillInTheBlank
#   MultipleChoiceResponse -> multipleChoice
# Every other value (including null) is left as it is.
dfClean = dfClean.withColumn(
    "item_type_code_name",
    when(col("item_type_code_name") == "FillinBlankResponse", "fillInTheBlank")
    .when(col("item_type_code_name") == "MultipleChoiceResponse", "multipleChoice")
    .otherwise(col("item_type_code_name")),
)


In [22]:
# NOTE: this import rebinds the built-in round() to the Spark column function for the rest of the
# session; round is not used in this cell.
from pyspark.sql.functions import col, round

# Denominator for the percentages: rows whose item_type_code_name is populated.
tot = dfClean.filter(col("item_type_code_name").isNotNull()).count()

# Row count per level plus its share (in percent) of the non-null total, largest level first.
freqTable = (
    dfClean.groupBy("item_type_code_name")
    .count()
    .withColumnRenamed("count", "cnt_per_group")
    .withColumn("perc_of_count_total", col("cnt_per_group") / tot * 100)
    .orderBy("cnt_per_group", ascending=False)
)

freqTable.show(50, False)

We only want five levels, so convert everything below 6% to other

In [24]:
# Levels holding less than 6% of the non-null total are the ones to fold into "Other".
# NOTE(review): this repeats the collapse already run above. If nulls were turned into "Other" by
# the earlier pass, the denominator is now larger, so a level just above 6% before can fall below
# it here and be collapsed as well — confirm the repeat is intended.
otherRows = freqTable.where("perc_of_count_total < 6")
otherLevels = [
    r["item_type_code_name"]
    for r in otherRows.select("item_type_code_name").collect()
]

# Null values and the small levels both become "Other"; the remaining levels keep their value.
dfClean = dfClean.withColumn(
    "item_type_code_name",
    when(
        col("item_type_code_name").isNull() | col("item_type_code_name").isin(otherLevels),
        "Other",
    ).otherwise(col("item_type_code_name")),
)

# Display new values
(
    dfClean.groupBy("item_type_code_name")
    .count()
    .orderBy("count", ascending=False)
    .show(50, False)
)


In [25]:
# Re-register the temp view "clean_data" so it reflects the current dfClean.
dfClean.createOrReplaceTempView("clean_data")

##### Impute 749 to 750 null dates

In [None]:
# NOTE(review): imputes from dfFlt, while the before/after comparison below samples dfClean and the
# later imputation cells start from dfClean — confirm which base frame is intended.
# The result lands in dfImpute only; dfClean is not updated by this cell.
dfImpute = impute_749_to_750_null_dates(dfFlt)

In [None]:
# Plot the imputed frame with the external date_boxplot helper; the string is presumably the chart title — confirm.
date_boxplot(dfImpute, "Null Dates Fixed")

In [None]:
# Before/after comparison: a random sample of dfClean against the imputed frame.
# NOTE(review): only the "before" frame is sampled here, whereas the last comparison in this notebook
# samples both frames — confirm whether dfImpute should be sampled too (or is sampled by its helper).
dual_mean_hours_assignment(
    get_random_sample(dfClean),
    dfImpute,
    title1 = 'Before',
    title2 = 'After',
    main = 'Before and After Imputation'
)

Before and After Values Match
- student_start_datetime & scored_datetime = 9
- student_stop_datetime  & scored_datetime = -12
- was_fully_scored_datetime & scored_datetime = -17

In [None]:
# Presumably records the matching to-do entry as done; the text (including "t0") is passed to the
# helper verbatim, so keep it in sync with the stored entry.
finish_todo('Impute 749 t0 750 null dates')

##### Impute 3422 null dates in scored_datetime

In [None]:

# NOTE(review): starts again from dfFlt and overwrites dfImpute, so the result of the previous
# imputation cell is not carried forward unless this helper applies it itself — confirm.
dfImpute = impute_3422_null_dates(dfFlt)

In [None]:
# Plot the imputed frame with the external date_boxplot helper; the string is presumably the chart title — confirm.
date_boxplot(dfImpute, "Null Dates Fixed")

In [None]:
# Before/after comparison: a random sample of dfClean against the (unsampled) imputed frame.
dual_mean_hours_assignment(
    get_random_sample(dfClean),
    dfImpute,
    title1 = 'Before',
    title2 = 'After',
    main = 'Before and After Imputation'
)


Before and After Values Match
- student_stop_datetime  & scored_datetime = -12

In [None]:
# Presumably records the matching to-do entry as done (external helper) — confirm.
finish_todo('Impute 3422 null dates in scored_datetime')

#### Impute 9965 null dates in was_in_progress_datetime


In [5]:

# Imputes from dfClean (the earlier imputation cells used dfFlt) and overwrites dfImpute.
# NOTE(review): dfClean was not updated by the earlier imputation cells, so their results are not
# part of this frame unless the helper applies them itself — confirm.
dfImpute = impute_9965_null_dates(dfClean)

In [None]:
# Plot the imputed frame with the external date_boxplot helper; the string is presumably the chart title — confirm.
date_boxplot(dfImpute, "Null Dates Fixed")

In [None]:
# Before/after comparison: a random sample of dfClean against the (unsampled) imputed frame.
dual_mean_hours_assignment(
    get_random_sample(dfClean),
    dfImpute,
    title1 = 'Before',
    title2 = 'After',
    main = 'Before and After Imputation'
)

Before and After Values Match
- student_start_datetime  & was_in_progress_datetime = -12

In [None]:
# Presumably records the matching to-do entry as done (external helper) — confirm.
finish_todo('Impute 9965 null dates in was_in_progress_datetime')


### Impute 4446 of 18469 null dates in was_submitted_datetime_actual


In [None]:
# Impute the 4446 null dates with the mean difference of scored_datetime and was_submitted_datetime_actual
def impute_4446_null_dates(df):
    """Fill missing was_submitted_datetime_actual values on rows that have a positive score.

    The fill value is scored_datetime minus the mean gap (in seconds) between
    scored_datetime and was_submitted_datetime_actual, measured over the rows
    where both timestamps are present.

    Args:
        df: Spark DataFrame containing scored_datetime,
            was_submitted_datetime_actual and final_score_unweighted.

    Returns:
        A new DataFrame in which was_submitted_datetime_actual is replaced only
        where it is null AND final_score_unweighted > 0; all other rows keep
        their value. If no row has both timestamps (the mean gap is undefined),
        df is returned unchanged.
    """
    # Pull the two timestamp columns to pandas (the full columns, not a sample)
    pdDf = df.select('scored_datetime', 'was_submitted_datetime_actual').toPandas()

    # Mean gap in seconds; rows missing either timestamp give NaN and are skipped by mean()
    gapSeconds = (pdDf['scored_datetime'] - pdDf['was_submitted_datetime_actual']) / np.timedelta64(1, 's')
    meanDiff = gapSeconds.mean()

    # Without a usable mean there is nothing to impute from; a NaN offset would
    # otherwise flow into the timestamp arithmetic below.
    if np.isnan(meanDiff):
        return df

    meanDiff = float(meanDiff)

    return df.withColumn(
                "was_submitted_datetime_actual",
                F.when(
                    (F.col('was_submitted_datetime_actual').isNull()) & (F.col('final_score_unweighted') > 0),
                    # Shift the scored time back by the mean gap and convert back to a timestamp
                    (F.unix_timestamp("scored_datetime") - meanDiff).cast('timestamp')
                ).otherwise( F.col("was_submitted_datetime_actual") )
    )


# Apply the imputation to dfClean; the result overwrites dfImpute and dfClean itself is not updated.
# NOTE(review): the earlier imputation results are not part of dfClean, so they are not included here — confirm intent.
dfImpute = impute_4446_null_dates(dfClean)

In [None]:
# Plot the imputed frame with the external date_boxplot helper; the string is presumably the chart title — confirm.
date_boxplot(dfImpute, "Null Dates Fixed")

In [None]:
# Before/after comparison; unlike the earlier comparison cells, both frames are passed through
# get_random_sample here.
dual_mean_hours_assignment(
    get_random_sample(dfClean),
    get_random_sample(dfImpute),
    title1 = 'Before',
    title2 = 'After',
    main = 'Before and After Imputation'
)

Values corrected
- The corrected values shifted the mean of was_submitted_datetime_actual up by ~1 to 3 hours relative to the related variables


In [None]:
# Presumably records the matching to-do entry as done (external helper) — confirm.
finish_todo('Impute 4446 of 18469 null dates in was_submitted_datetime_actual')
