##### Variable Descriptive Statistics

###### Categorical / Identifier Variables

In [None]:
# Per identifier column: number of distinct values and number of null rows.
for f in shared.identifierVars:
  print(f)
  uniqueExpr = F.countDistinct(f).alias('unique')
  # when() has no otherwise(), so non-null rows yield NULL and are skipped by count().
  nullExpr = F.count(F.when(F.col(f).isNull(), f)).alias('null')
  dfFlt.agg(uniqueExpr, nullExpr).show()

Totals
- org_id: 3
  - ID per organization
- section_id: 46
  - ID per section/school
- learner_id: 1126
  - ID per learner/student
- assessment_id: 329
  - ID per each assessment/quiz.
- assessment_instance_id: 615
  - ID of each time a section uses the assessment
- assessment_instance_attempt_id: 8483 null: 3264
  - ID of each time an assessment is attempted by a student
- assessment_item_response_id: 64368 null: 15710
  - ID of each response/answer to an assessment question
- learner_assignment_attempt_id: 8855
  - ID of each learner attempt of an assigned assessment
- learner_assigned_item_attempt_id: 79689
  - ID of each learner attempt of a item/question on an assigned assessment

###### Categorical / Nominal Variables

In [None]:
# Frequency table for each nominal variable, most frequent level first.
for f in shared.nominalVars:
  levelCounts = dfFlt.groupBy(f).count()
  # Show up to 50 levels without truncating long values.
  levelCounts.sort('count', ascending=False).show(n=50, truncate=False)


##### Null Values
- response_correctness
  - 3298 null values
    - investigate further
  - 1566 "[unassigned]"
    - investigate further for relation to the 1566 assignment_attempt_number and assignment_max_attempts values
  - Could be null because the question wasn't answered or a different method of scoring the question
- item_type_code_name
  - Investigate further
  - Could be related to unstarted or unanswered questions

##### Large number of categorical values
- item_type_code_name
  - Need to transform by reclassifying to reduce number of levels

##### learner_attempt_status
- Unary "fully scored"

In [None]:
# Record the follow-up items raised by the nominal-variable review, in order.
for todoText in (
  'Investigate 3298 null values in response_correctness',
  'Investigate 1566 [unassigned] in response_correctness',
  'Investigate null values in item_type_code_name',
  'Reduce number of levels in item_type_code_name',
  'Remove unary learner_attempt_status = "fully scored"',
):
  add_todo(todoText)

###### Categorical / Nominal Correlations Heatmap

In [None]:
# Collect the nominal columns into pandas, then compute/plot their pairwise associations.
nominalPd = dfFlt.select(*shared.nominalVars).toPandas()
# Missing values are passed through as the string 'null' (nan_replace_value).
corr, ax = associations(nominalPd, nan_replace_value='null')

- High correlation between item_type_code_name and scoring_type_code
- learner_attempt_status has a single value ('fully scored')

###### Display item_type_code_name and scoring_type_code Correlation

In [None]:
def item_to_code():
  """Cross-tabulate item_type_code_name (rows) against scoring_type_code (columns).

  Missing values on either axis are shown under the label 'null', and a
  'Total' margin row/column is included.
  """
  pairs = dfFlt.select('item_type_code_name','scoring_type_code').toPandas()
  itemTypes = pairs['item_type_code_name'].fillna('null')
  scoringTypes = pairs['scoring_type_code'].fillna('null')
  return pd.crosstab(itemTypes, scoringTypes, margins=True, margins_name="Total")

item_to_code()

- item_type_code_name indicates the scoring_type_code
  - scoring_type_code = '[unassigned]' for item_type_code_name is..
    - shortAnswer, null, essay (9 times), RubricResponse, MultipleChoiceResponse, FillInBlankResponse
  - scoring_type_code = external only for item_type_code_name = aheAlgo
  - scoring_type_code = 'manual' for item_type_code_name is..
    - essay, fileUpload, shortAnswer

###### Numerical / Continuous Variables

In [None]:
def num_cont_vars():
  """Print the describe() summary rows for each continuous variable, one table per variable."""
  # Numerical / Continuous Variables
  summaryDf = dfFlt.describe()
  for varName in shared.continousVars:
    # Show up to 5 summary rows, untruncated.
    summaryDf.select("summary", varName).show(n=5, truncate=False)

num_cont_vars()

###### Numerical / Continuous Histograms

In [None]:
# One histogram per continuous variable; the name is printed ahead of each plot.
for f in shared.continousVars:
  print(f)
  columnPd = dfFlt.select(f).toPandas()
  columnPd.hist()

Normal Distribution
- number_of_learners
- possible outliers greater than 40
- investigate further

Right Skewed
- final_score_unweighted
- number_of_distinct_instance_items
- points_possible_unweighted

Binary Values (0/1)
- assignment_max_attempts
- assignment_attempt_number
- Appears binary but the variable name indicates it could have any values. The data only contains 1 and 0
- Investigate further

In [None]:
# Record the follow-up items raised by the histograms, in order.
for todoText in (
  "Investigate number_of_learners > 40 outliers",
  "Investigate binary variables assignment_attempt_number and assignment_max_attempts",
):
  add_todo(todoText)

###### Numerical / Continuous Correlations Heatmap

In [None]:
# Collect the continuous columns into pandas, then compute/plot their pairwise associations.
continuousPd = dfFlt.select(*shared.continousVars).toPandas()
# Missing values are passed through as the string 'null' (nan_replace_value).
corr, ax = associations(continuousPd, nan_replace_value='null')

- assignment_attempt_number and assignment_max_attempts highly correlated

###### Display assignment_attempt_number and assignment_max_attempts Correlation

In [None]:
def attempt_to_max():
  """Cross-tabulate assignment_attempt_number (rows) against assignment_max_attempts (columns).

  Missing values on either axis are shown under the label 'null', and a
  'Total' margin row/column is included.
  """
  pairs = dfFlt.select('assignment_attempt_number', 'assignment_max_attempts').toPandas()
  attemptNumbers = pairs['assignment_attempt_number'].fillna('null')
  maxAttempts = pairs['assignment_max_attempts'].fillna('null')
  return pd.crosstab(attemptNumbers, maxAttempts, margins=True, margins_name="Total")

attempt_to_max()

- assignment_max_attempts 1 is always assignment_attempt_number = 1

###### Null and Zero Numerical / Continuous Variables

In [None]:
# Per continuous column: how many rows are null and how many are exactly zero.
for c in shared.continousVars:
  print(c)
  # when() has no otherwise(), so rows failing the condition yield NULL and are skipped by count().
  nullExpr = F.count(F.when(F.col(c).isNull(), c)).alias('null')
  zeroExpr = F.count(F.when(F.col(c) == 0, c)).alias("zero")
  dfFlt.agg(nullExpr, zeroExpr).show()

- possible correlation between assignment_attempt_number and assignment_max_attempts
  - both have 1566 zero values
  - Needs further investigation
- final_score_unweighted has 16,582 zero values
  - possibly because not yet scored
  - needs further investigation

In [None]:
# Record the follow-up items raised by the null/zero counts.
# The first message now says "zero values": the finding is that both columns
# contain 1566 zeros, which the previous wording ("1566 values") left out.
add_todo('Investigate assignment_attempt_number and assignment_max_attempts both have 1566 zero values')
add_todo('Investigate final_score_unweighted has 16,582 zero values')

###### Categorical / Interval Variables

In [None]:
# Per interval (datetime) column: distinct count, null count, and min/max value.
for f in shared.intervalVars:
  print (f)
  summaryExprs = [
    F.countDistinct(f).alias('unique'),
    # when() has no otherwise(), so non-null rows yield NULL and are skipped by count().
    F.count(F.when(F.col(f).isNull(), f)).alias('null'),
    F.min(f).alias('min'),
    F.max(f).alias('max'),
  ]
  dfFlt.agg(*summaryExprs).show(n=1, truncate=False)

Default Date Values
- Some dates have default values: '2999-01-01 00:00:00' as max and '1900-01-01 00:00:00' as min
- Almost all interval variables
  - except max_student_stop_datetime and max_student_stop_datetime
- These are substitutes for no value and will need to be replaced with nulls
- Further investigation is needed as what the nulls mean

In [None]:
# Record the follow-up items raised by the date review, in order.
for todoText in (
  'Replace default dates with nulls',
  'Investigate why some dates are null',
):
  add_todo(todoText)

###### Categorical / Interval Correlations Heatmap

In [None]:
# Convert every interval column to a unix timestamp (keeping its name) so the
# datetimes can be passed to associations() as numbers.
epochCols = [F.unix_timestamp(c).alias(c) for c in shared.intervalVars]
intervalPd = dfFlt.select(*epochCols).toPandas()
corr, ax = associations(intervalPd, figsize=[10,10])

- Correlated dates
  - assignment_due_date and assignment_final_submission_date
  - assignment_due_date and assignment_start_date
  - min_student_start_datetime and max_student_start_datetime
  - student_start_datetime and student_stop_datetime
  - student_start_datetime and was_fully_scored_datetime
  - student_stop_datetime and was_fully_scored_datetime
- Correlation appears to be because the events closely follow each other

###### Categorical / Interval Variables

In [None]:
# Categorical / Interval Variables
# Per interval column: distinct count, null count, and min/max over the non-null rows.
for f in shared.intervalVars:
  print (f)
  hasValue = F.col(f).isNotNull()
  summaryExprs = [
    F.countDistinct(f).alias('unique'),
    F.count(F.when(F.col(f).isNull(), f)).alias('null'),
    # Only rows that hold a value are fed to min/max.
    F.min(F.when(hasValue, F.col(f))).alias('min'),
    F.max(F.when(hasValue, F.col(f))).alias('max'),
  ]
  dfFlt.agg(*summaryExprs).show(n=1, truncate=False)

Dates fall in range of a school year of 8/2019 to 5/2020

###### Binary Variables

In [None]:
# Frequency table for each binary variable, most frequent value first.
for f in shared.binaryVars:
  valueCounts = dfFlt.groupBy(f).count()
  # Show up to 50 rows without truncating long values.
  valueCounts.sort('count', ascending=False).show(n=50, truncate=False)

Variables With Unary Values
- assignment_late_submission and is_deleted
- is_manual_scoring_required has only one true value
- Variables will be removed

###### Binary Correlations Heatmap

In [None]:
# Collect the binary columns into pandas, then compute/plot their pairwise associations.
binaryPd = dfFlt.select(*shared.binaryVars).toPandas()
# Missing values are passed through as the string 'null' (nan_replace_value).
corr, ax = associations(binaryPd, nan_replace_value='null')

- No correlation

In [None]:
# Record the follow-up item for the binary variables flagged as (near-)unary.
removeBinaryTodo = "Remove variables assignment_late_submission, is_manual_scoring_required and is_deleted"
add_todo(removeBinaryTodo)

###### Categorical / Nominal and Binary Correlations Heatmap

In [None]:
# Collect binary and nominal columns together so cross-group associations show up.
binaryNominalPd = dfFlt.select(*shared.binaryVars, *shared.nominalVars).toPandas()
# Missing values are passed through as the string 'null' (nan_replace_value).
corr, ax = associations(binaryNominalPd, nan_replace_value='null', figsize=[10,10])

- New correlations
  - is_affecting_grade and ced_assignment_type_code
  - item_is_offline_scored and assigned_item_status

###### Display is_affecting_grade and ced_assignment_type_code Correlation

In [None]:
def grade_to_code():
  """Cross-tabulate is_affecting_grade (rows) against ced_assignment_type_code (columns).

  Missing values on either axis are shown under the label 'null', and a
  'Total' margin row/column is included.
  """
  pairs = dfFlt.select('is_affecting_grade','ced_assignment_type_code').toPandas()
  affectsGrade = pairs['is_affecting_grade'].fillna('null')
  assignmentTypes = pairs['ced_assignment_type_code'].fillna('null')
  return pd.crosstab(affectsGrade, assignmentTypes, margins=True, margins_name="Total")

grade_to_code()

- is_affecting_grade = false when ced_assignment_type_code = practice

###### Display item_is_offline_scored and assigned_item_status Correlation

In [None]:
def offline_to_status():
  """Cross-tabulate item_is_offline_scored (rows) against assigned_item_status (columns).

  Missing values on either axis are shown under the label 'null', and a
  'Total' margin row/column is included.
  """
  # Select only the two columns needed before collecting to pandas; converting
  # the whole DataFrame pulls every column to the driver for no benefit.
  dfPd = dfFlt.select('item_is_offline_scored','assigned_item_status').toPandas()
  return pd.crosstab(dfPd.item_is_offline_scored.fillna('null'), dfPd.assigned_item_status.fillna('null'), margins=True, margins_name="Total")

offline_to_status()

- item_is_offline_scored = True is always assigned_item_status = offline_scored

###### Num Sections by Organization

In [None]:
def sections_by_org():
  """Plot the number of distinct sections per organization as a bar chart.

  A final bar labelled 'mean' shows the rounded average of the
  per-organization section counts.
  """
  sByO = dfFlt.groupBy('org_new').agg(F.countDistinct('section_id').alias('sections')).orderBy('org_new')
  pdDf = sByO.toPandas()

  # Rounded average of the per-organization counts, computed in Spark.
  meanSections = sByO.agg(F.round(F.avg(F.col('sections'))).alias('mean')).collect()[0][0]
  # DataFrame.append was removed in pandas 2.0; pd.concat adds the same trailing row.
  meanRow = pd.DataFrame([{'org_new': 'mean', 'sections': meanSections}])
  pdDf = pd.concat([pdDf, meanRow], ignore_index=True)

  ax = pdDf.plot.bar('org_new','sections', title='Num Sections by Organization')
  annotate_plot(ax)
  plt.show()

sections_by_org()

###### Num Learners by Organization

In [None]:
def learners_by_org():
  """Plot the number of distinct learners per organization as a bar chart.

  A final bar labelled 'mean' shows the rounded average of the
  per-organization learner counts.
  """
  lByO = dfFlt.groupBy('org_new').agg(F.countDistinct('learner_id').alias('learners')).orderBy('org_new')
  pdDf = lByO.toPandas()

  # Rounded average of the per-organization counts, computed in Spark.
  meanlearners = lByO.agg(F.round(F.avg(F.col('learners'))).alias('mean')).collect()[0][0]
  # DataFrame.append was removed in pandas 2.0; pd.concat adds the same trailing row.
  meanRow = pd.DataFrame([{'org_new': 'mean', 'learners': meanlearners}])
  pdDf = pd.concat([pdDf, meanRow], ignore_index=True)

  ax = pdDf.plot.bar('org_new','learners', title='Num Learners by Organization')
  annotate_plot(ax)
  plt.show()

learners_by_org()

###### Mean Section Learners by Organization

In [None]:
def sec_learners_by_org():
  """Plot the mean number of distinct learners per section, by organization.

  Learners are first counted per (organization, section), then averaged per
  organization. A final bar labelled 'mean' shows the rounded average of
  those per-organization averages.
  """
  lByS = dfFlt.groupBy('org_new','section_id').agg(F.countDistinct('learner_id').alias('learners'))
  # Average the section-level counts within each organization.
  lBySMean = lByS.groupBy('org_new').agg( F.avg('learners').alias('learners') ).orderBy('org_new')
  pdDf = lBySMean.toPandas()

  # Rounded average of the per-organization averages, computed in Spark.
  meanlearners = lBySMean.agg(F.round(F.avg(F.col('learners'))).alias('mean')).collect()[0][0]
  # DataFrame.append was removed in pandas 2.0; pd.concat adds the same trailing row.
  meanRow = pd.DataFrame([{'org_new': 'mean', 'learners': meanlearners}])
  pdDf = pd.concat([pdDf, meanRow], ignore_index=True)

  ax = pdDf.plot.bar('org_new','learners', title='Mean Section Learners by Organization')
  annotate_plot(ax)
  plt.show()

sec_learners_by_org()

###### Mean Section Assessments by Organization

In [None]:
def sec_assess_by_org():
  """Plot the mean number of distinct assessments per section, by organization.

  Assessments are first counted per (organization, section), then averaged
  per organization. A final bar labelled 'mean' shows the rounded average of
  those per-organization averages.
  """
  assessBySection = dfFlt.groupBy('org_new','section_id').agg(F.countDistinct('assessment_id').alias('assessments'))
  aByS = assessBySection.groupBy('org_new').agg( F.avg('assessments').alias('assessments') ).orderBy('org_new')

  pdDf = aByS.toPandas()

  # Rounded average of the per-organization averages, computed in Spark.
  meanAssess = aByS.agg(F.round(F.avg(F.col('assessments'))).alias('mean')).collect()[0][0]
  # DataFrame.append was removed in pandas 2.0; pd.concat adds the same trailing row.
  meanRow = pd.DataFrame([{'org_new': 'mean', 'assessments': meanAssess}])
  pdDf = pd.concat([pdDf, meanRow], ignore_index=True)

  ax = pdDf.plot.bar('org_new','assessments', title='Mean Section Assessments by Organization')
  annotate_plot(ax)
  plt.show()

sec_assess_by_org()

###### Mean Learners Assessments by Organization

In [None]:
def assess_by_org():
  """Plot the mean number of distinct assessments per learner, by organization.

  Assessments are first counted per (organization, learner), then averaged
  per organization. A final bar labelled 'mean' shows the rounded average of
  those per-organization averages.
  """
  assessByLearner = dfFlt.groupBy('org_new','learner_id').agg(F.countDistinct('assessment_id').alias('assessments'))
  aByL = assessByLearner.groupBy('org_new').agg( F.avg('assessments').alias('assessments') ).orderBy('org_new')

  pdDf = aByL.toPandas()

  # Rounded average of the per-organization averages, computed in Spark.
  meanAssess = aByL.agg(F.round(F.avg(F.col('assessments'))).alias('mean')).collect()[0][0]
  # DataFrame.append was removed in pandas 2.0; pd.concat adds the same trailing row.
  meanRow = pd.DataFrame([{'org_new': 'mean', 'assessments': meanAssess}])
  pdDf = pd.concat([pdDf, meanRow], ignore_index=True)

  ax = pdDf.plot.bar('org_new','assessments', title='Mean Learners Assessments by Organization')
  annotate_plot(ax)
  plt.show()

assess_by_org()

###### Mean Scores by Organization

In [None]:
def scores_by_orgs():
  """Plot the mean final_score_unweighted per organization as a bar chart.

  A final bar labelled 'mean' shows the rounded average of the
  per-organization means (each organization weighted equally).
  """
  sByO = dfFlt.groupBy('org_new').agg( F.avg('final_score_unweighted').alias('scores') ).orderBy('org_new')

  pdDf = sByO.toPandas()

  # Rounded average of the per-organization means, computed in Spark.
  meanScore = sByO.agg(F.round(F.avg(F.col('scores'))).alias('mean')).collect()[0][0]
  # DataFrame.append was removed in pandas 2.0; pd.concat adds the same trailing row.
  meanRow = pd.DataFrame([{'org_new': 'mean', 'scores': meanScore}])
  pdDf = pd.concat([pdDf, meanRow], ignore_index=True)

  ax = pdDf.plot.bar('org_new','scores', title='Mean Scores by Organization')
  annotate_plot(ax)
  plt.show()

scores_by_orgs()
