# Moodle Module - Refine

This notebook demonstrates the utility of the OEA_py class notebook and how it speeds up the process of refining/pseudonymizing the Moodle data.

The steps outlined below describe how this notebook is used to refine tables originally from the Moodle data source:

- Set the workspace for where the tables are located. 
- 3 functions are defined and used:
   1. **refine_moodle**: almost identical to the ```oea.refine()``` function, except that it removes the assumption that primary keys are always hashed.
   2. **refine_moodle_dataset**: a simple function that iterates over the Moodle tables currently contained in ```stage2/Ingested/moodle``` of the data lake and refines each one.
   3. **oea.add_to_lake_db**: the OEA_py function used to add to the lake db the refined Moodle tables that are not added automatically.

In [None]:
# Notebook parameters.
# `workspace` is passed to oea.set_workspace() in a later cell.
workspace = "dev"
# `version` is interpolated into the data lake paths as 'moodle/v<version>'.
version = "4.1"

In [None]:
%run OEA_py

In [None]:
# 1) Set the workspace (this determines where in the data lake you'll be writing to and reading from).
# You can work in 'dev', 'prod', or a sandbox with any name you choose.
# For example, Sam the developer can create a 'sam' workspace and expect to find his datasets
# in the data lake under oea/sandboxes/sam.
# `oea` is expected to be provided by the `%run OEA_py` cell; `workspace` by the parameters cell.
oea.set_workspace(workspace)

In [None]:
def refine_moodle(entity_path, metadata=None, primary_key='id'):
    """Refine one ingested Moodle entity into stage2/Refined.

    Reads the latest changes for the entity under ``stage2/Ingested``,
    applies the metadata-derived schema, pseudonymizes the rows and upserts
    the results into the ``general`` table and the ``sensitive`` ``_lookup``
    table, then registers both tables in the lake db.

    Unlike a refine step that always keys on the hashed column, the upsert
    key is supplied by the caller through ``primary_key``.

    Args:
        entity_path: Path of the entity relative to ``stage2/Ingested``
            (for example ``'moodle/v4.1/user'``).
        metadata: Metadata for this entity. When falsy, it is looked up from
            the entity's parent path via ``oea.get_metadata_from_path``.
        primary_key: Column used as the upsert key for both the general and
            the lookup table. Defaults to ``'id'``.

    Returns:
        The number of changed rows found in the source, counted once before
        any data is written.

    Note:
        Exceptions raised by the ``oea`` helpers propagate to the caller. If
        the lookup upsert fails, the general table has already been upserted
        but neither table is added to the lake db by this call.
    """
    source_path = f'stage2/Ingested/{entity_path}'
    # Fix the column name, in case it has a space in it or some other invalid character.
    primary_key = oea.fix_column_name(primary_key)
    path_dict = oea.parse_path(source_path)
    refined_parent_path = path_dict['entity_parent_path'].replace('Ingested', 'Refined')
    sink_general_path = refined_parent_path + '/general/' + path_dict['entity']
    sink_sensitive_path = refined_parent_path + '/sensitive/' + path_dict['entity'] + '_lookup'
    if not metadata:
        all_metadata = oea.get_metadata_from_path(path_dict['entity_parent_path'])
        metadata = all_metadata[path_dict['entity']]

    df_changes = oea.get_latest_changes(source_path, sink_general_path)
    spark_schema = oea.to_spark_schema(metadata)
    df_changes = oea.modify_schema(df_changes, spark_schema)

    # count() is a Spark action: evaluate it once, before the upserts modify the
    # sink, so the logged and returned figures describe the rows actually processed.
    changed_row_count = df_changes.count()

    if changed_row_count > 0:
        df_pseudo, df_lookup = oea.pseudonymize(df_changes, metadata)
        # The key is caller-supplied, so it need not be a hashed column.
        oea.upsert(df_pseudo, sink_general_path, primary_key)
        oea.upsert(df_lookup, sink_sensitive_path, primary_key)
        oea.add_to_lake_db(sink_general_path)
        oea.add_to_lake_db(sink_sensitive_path)
        logger.info(f'Processed {changed_row_count} updated rows from {source_path} into stage2/Refined')
    else:
        logger.info(f'No updated rows in {source_path} to process.')
    return changed_row_count

In [None]:
# 2) this step refines the data through the use of metadata (this is where the pseudonymization of the data occurs).
def refine_moodle_dataset(tables_source):
    """Refine every Moodle table found under ``tables_source``.

    Lists the entries of ``tables_source`` with ``oea.get_folders`` and calls
    ``refine_moodle`` for each one except ``metadata.csv``. Relies on the
    notebook-level ``metadata`` and ``version`` variables being defined
    before it is called.

    Args:
        tables_source: Data lake path whose entries are the ingested Moodle
            tables (for example ``'stage2/Ingested/moodle/v4.1'``).

    An ``AnalysisException`` raised while refining a table is logged as a
    warning and processing continues with the next table; other exceptions
    propagate.
    """
    # Tables that are upserted on 'id_pseudonym' instead of 'id'
    # (presumably because their 'id' column is hashed by the metadata - confirm).
    pseudonymized_key_tables = ('assign', 'user')

    items = oea.get_folders(tables_source)
    for item in items:
        table_path = tables_source + '/' + item
        if item == 'metadata.csv':
            logger.info('ignore metadata processing, since this is not a table to be ingested')
            continue

        primary_key = 'id_pseudonym' if item in pseudonymized_key_tables else 'id'
        try:
            refine_moodle(f'moodle/v{version}/{item}', metadata[item], primary_key)
        except AnalysisException as e:
            # This means the table may have not been properly refined due to errors with the
            # primary key not aligning with columns expected in the lookup table.
            # Best effort: report it and keep going instead of silently ignoring it.
            logger.warning(f'Table {item} from {table_path} may not have been fully refined: {e}')
        else:
            logger.info('Refined table: ' + item + ' from: ' + table_path)
    logger.info('Finished refining Moodle tables')

In [None]:
# Load the Moodle metadata, then refine every table ingested for this Moodle version.
metadata = oea.get_metadata_from_url('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/modules/module_catalog/Moodle/test_data/metadata.csv')
# Note: the f-string must close the placeholder with a single '}' - a stray extra '}' is a SyntaxError.
refine_moodle_dataset(f'stage2/Ingested/moodle/v{version}')

In [None]:
# Add the refined tables to the lake db manually (code currently does not handle
# auto-adding tables to the lake db, when the lookup table is not created).
# The tables are registered in the order listed.
general_tables = (
    'assign_grades',
    'assign_submission',
    'assign_user_mapping',
    'assignsubmission_file',
    'cohort',
    'course',
    'course_categories',
    'enrol',
    'forum',
    'forum_discussions',
    'forum_grades',
    'forum_posts',
    'lesson',
    'lesson_answers',
    'lesson_attempts',
    'lesson_grades',
    'lesson_pages',
    'lesson_timer',
    'page',
    'quiz',
    'quiz_attempts',
    'quiz_grades',
    'role',
    'role_assignments',
    'user_enrolments',
)
for table_name in general_tables:
    oea.add_to_lake_db(f'stage2/Refined/moodle/v{version}/general/{table_name}')