# Insights Module - Refine

This notebook demonstrates how the OEA_py class notebook can be used to speed up the process of refining/pseudonymizing the Insights data.

The steps outlined below describe how this notebook is used to refine the Microsoft Education Insights module tables:

- Set the workspace for where the tables are located. 
- 2 functions are defined and used:
   1. **refine_corrected**: almost identical to the ```oea.refine()``` function, except reads from ```stage2/Ingested_Corrected``` rather than ```stage2/Ingested```.
   2. **refine_insights_dataset**: uses the Insights metadata.csv to pseudonymize each table according to whether it is to be hashed, masked, or left as-is (no-operation).

In [None]:
# Name of the OEA workspace to use; it is passed to oea.set_workspace() further down in this notebook.
workspace = 'dev'

In [None]:
%run OEA_py

In [None]:
# 1) Set the workspace (this determines where in the data lake you'll be writing to and reading from).
# You can work in 'dev', 'prod', or a sandbox with any name you choose.
# For example, Sam the developer can create a 'sam' workspace and expect to find his datasets in the data lake under oea/sandboxes/sam.
# `workspace` is the value assigned at the top of this notebook.
oea.set_workspace(workspace)

In [None]:
# 2) this step refines the data through the use of metadata (this is where the pseudonymization of the data occurs).
def refine_corrected(entity_path, metadata=None, primary_key='id'):
    """Refine (pseudonymize) one entity from stage2/Ingested_Corrected into stage2/Refined.

    Mirrors oea.refine(), except that the source is
    ``stage2/Ingested_Corrected/<entity_path>`` rather than ``stage2/Ingested``.
    The pseudonymized rows are upserted under ``.../general/<entity>`` and the
    lookup rows under ``.../sensitive/<entity>_lookup``; both sinks are then
    passed to oea.add_to_lake_db().

    Args:
        entity_path: Path of the entity below stage2/Ingested_Corrected,
            e.g. 'M365/v1.14/Person'.
        metadata: Metadata for this entity, handed to oea.to_spark_schema() and
            oea.pseudonymize(). When falsy, it is looked up with
            oea.get_metadata_from_path() using the entity's parent path.
        primary_key: Column used as the upsert key for both sinks. It is passed
            through oea.fix_column_name() first.

    Returns:
        The number of changed rows found in the source (0 when there was
        nothing to process).
    """
    source_path = f'stage2/Ingested_Corrected/{entity_path}'
    primary_key = oea.fix_column_name(primary_key) # fix the column name, in case it has a space in it or some other invalid character
    path_dict = oea.parse_path(source_path)
    refined_parent_path = path_dict['entity_parent_path'].replace('Ingested_Corrected', 'Refined')
    sink_general_path = refined_parent_path + '/general/' + path_dict['entity']
    sink_sensitive_path = refined_parent_path + '/sensitive/' + path_dict['entity'] + '_lookup'
    if not metadata:
        all_metadata = oea.get_metadata_from_path(path_dict['entity_parent_path'])
        metadata = all_metadata[path_dict['entity']]

    df_changes = oea.get_latest_changes(source_path, sink_general_path)
    spark_schema = oea.to_spark_schema(metadata)
    df_changes = oea.modify_schema(df_changes, spark_schema)

    # count() is a Spark action that re-executes the plan on every call, so
    # evaluate it once (before the upserts) and reuse the result below.
    changed_row_count = df_changes.count()

    if changed_row_count > 0:
        df_pseudo, df_lookup = oea.pseudonymize(df_changes, metadata)
        oea.upsert(df_pseudo, sink_general_path, primary_key) # todo: remove this assumption that the primary key will always be hashed during pseudonymization
        oea.upsert(df_lookup, sink_sensitive_path, primary_key)
        oea.add_to_lake_db(sink_general_path)
        oea.add_to_lake_db(sink_sensitive_path)
        logger.info(f'Processed {changed_row_count} updated rows from {source_path} into stage2/Refined')
    else:
        logger.info(f'No updated rows in {source_path} to process.')
    return changed_row_count

def refine_insights_dataset(tables_source):
    """Refine every Insights table listed under ``tables_source``.

    The folder names returned by oea.get_folders(tables_source) are treated as
    table names. Each table is refined with refine_corrected() from
    ``M365/v1.14/<table>``, using that table's entry in the notebook-level
    ``metadata`` mapping, which must be defined before this function is called.
    The entry named 'metadata.csv' is skipped.

    A table whose refinement raises AnalysisException is logged as a warning
    and skipped, so the remaining tables are still processed.

    Args:
        tables_source: Folder whose sub-folders name the tables to refine. It
            is used for enumeration and log messages only; the data itself is
            read by refine_corrected() from stage2/Ingested_Corrected.
    """
    # Primary key column per table; any table not listed here uses 'Id'.
    primary_keys = {
        'activity': 'SignalId',
        'AadGroup': 'ObjectId_pseudonym',
        'AadGroupMembership': 'MembershipId',
        'AadUser': 'ObjectId_pseudonym',
        'AadUserPersonMapping': 'ObjectId_pseudonym',
        'Person': 'Id_pseudonym',
        'PersonDemographic': 'PersonId_pseudonym',
    }

    for item in oea.get_folders(tables_source):
        table_path = tables_source + '/' + item
        if item == 'metadata.csv':
            logger.info('ignore metadata processing, since this is not a table to be ingested')
            continue

        try:
            # Every table is refined from its own folder. The previous version
            # pointed AadGroupMembership at the AadGroup folder by mistake.
            refine_corrected('M365/v1.14/' + item, metadata[item], primary_keys.get(item, 'Id'))
        except AnalysisException as e:
            # This means the table may have not been properly refined due to errors with the
            # primary key not aligning with columns expected in the lookup table.
            logger.warning(f'Could not refine table: {item} from: {table_path} ({e})')
        else:
            logger.info('Refined table: ' + item + ' from: ' + table_path)

In [None]:
# 3) Fetch the Insights metadata.csv via oea.get_metadata_from_url() and refine every table.
# `metadata` must stay a notebook-level name: refine_insights_dataset() reads it as a global.
metadata = oea.get_metadata_from_url('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/modules/module_catalog/Microsoft_Education_Insights/test_data/metadata.csv')
# The list of tables is taken from stage2/Ingested, while refine_corrected() reads each table's data from stage2/Ingested_Corrected.
refine_insights_dataset('stage2/Ingested/M365/v1.14')