# Graph Module - Refine

This notebook demonstrates the utility of the OEA_py class notebook and how it speeds up the process of refining/pseudonymizing the Graph data.

The steps outlined below describe how this notebook is used to refine the Microsoft Graph module tables:

- Set the workspace for where the tables are located. 
- 2 functions are defined and used:
   1. **refine_corrected**: almost identical to the ```oea.refine()``` function, except reads from ```stage2/Ingested_Corrected``` rather than ```stage2/Ingested```.
   2. **refine_graph_dataset**: uses the Graph metadata_beta.csv and metadata_v1p0.csv to pseudonymize each table according to whether it is to be hashed, masked, or has no-operation.

In [1]:
# Name of the OEA workspace to operate in; handed to oea.set_workspace() further down.
workspace = "dev"
# Which Graph test dataset to refine; the driver cell recognizes 'k12' and 'hed'.
testdataSet = "hed"

StatementMeta(spark3p2med, 67, 1, Finished, Available)

In [2]:
%run OEA_py

StatementMeta(, 67, -1, Finished, Available)

2023-01-12 18:28:27,631 - OEA - INFO - Now using workspace: dev
2023-01-12 18:28:27,633 - OEA - INFO - OEA initialized.


In [3]:
# 1) Set the workspace (this determines where in the data lake you'll be writing to and reading from).
# You can work in 'dev', 'prod', or a sandbox with any name you choose.
# For example, Sam the developer can create a 'sam' workspace and expect to find his datasets in the data lake under oea/sandboxes/sam
# The workspace name is taken from the `workspace` variable assigned in the first cell of this notebook.
oea.set_workspace(workspace)

StatementMeta(spark3p2med, 67, 3, Finished, Available)

2023-01-12 18:28:28,547 - OEA - INFO - Now using workspace: dev


In [8]:
# 2) this step refines the data through the use of metadata (this is where the pseudonymization of the data occurs).
def refine_corrected(entity_path, metadata=None, primary_key='id'):
    """Refine (pseudonymize) one entity from stage2/Ingested_Corrected into stage2/Refined.

    Reads the latest changes for the entity, applies the schema derived from
    the metadata, pseudonymizes the rows and upserts the result into the
    ``general`` table and the ``<entity>_lookup`` table under ``sensitive``.

    Args:
        entity_path: Path of the entity relative to ``stage2/Ingested_Corrected``
            (for example ``'graph_api/beta/users'``).
        metadata: Metadata for the entity, passed to ``oea.to_spark_schema`` and
            ``oea.pseudonymize``. When falsy, it is looked up with
            ``oea.get_metadata_from_path`` using the entity's parent path.
        primary_key: Column used as the upsert key for both sink tables. It is
            normalized with ``oea.fix_column_name`` before use.

    Returns:
        The number of changed rows found for the entity (0 when there was
        nothing to process).
    """
    source_path = f'stage2/Ingested_Corrected/{entity_path}'
    primary_key = oea.fix_column_name(primary_key) # fix the column name, in case it has a space in it or some other invalid character
    path_dict = oea.parse_path(source_path)
    refined_parent_path = path_dict['entity_parent_path'].replace('Ingested_Corrected', 'Refined')
    sink_general_path = refined_parent_path + '/general/' + path_dict['entity']
    sink_sensitive_path = refined_parent_path + '/sensitive/' + path_dict['entity'] + '_lookup'
    if not metadata:
        all_metadata = oea.get_metadata_from_path(path_dict['entity_parent_path'])
        metadata = all_metadata[path_dict['entity']]

    df_changes = oea.get_latest_changes(source_path, sink_general_path)
    spark_schema = oea.to_spark_schema(metadata)
    df_changes = oea.modify_schema(df_changes, spark_schema)

    # Count once, before any write happens: df_changes is presumably a lazily evaluated
    # Spark DataFrame, so every count() call re-runs the query, and re-counting after the
    # upserts below could observe the already-updated sink (TODO confirm against OEA_py).
    changed_row_count = df_changes.count()

    if changed_row_count > 0:
        df_pseudo, df_lookup = oea.pseudonymize(df_changes, metadata)
        oea.upsert(df_pseudo, sink_general_path, primary_key) # todo: remove this assumption that the primary key will always be hashed during pseudonymization
        oea.upsert(df_lookup, sink_sensitive_path, primary_key)
        oea.add_to_lake_db(sink_general_path)
        oea.add_to_lake_db(sink_sensitive_path)
        logger.info(f'Processed {changed_row_count} updated rows from {source_path} into stage2/Refined')
    else:
        logger.info(f'No updated rows in {source_path} to process.')
    return changed_row_count

def refine_graph_dataset(tables_source):
    """Refine every supported Graph table found under ``tables_source``.

    Each folder returned by ``oea.get_folders(tables_source)`` is matched
    against the known Graph tables and, when supported, refined with
    ``refine_corrected`` using the notebook-level ``metadata_beta`` or
    ``metadata_v1p0`` and the table's pseudonymized primary key column.

    A table whose refinement raises ``AnalysisException`` is reported with a
    warning and skipped, so the remaining tables are still processed. The
    "Refined table" message is only logged for tables that were actually
    refined.

    Args:
        tables_source: Data lake path whose sub-folders are the Graph tables
            (for example ``'stage2/Ingested_Corrected/graph_api/beta'``).
    """
    # table name -> (Graph API version folder, primary key column used for the upsert)
    table_config = {
        'users': ('beta', 'userPrincipalName_pseudonym'),
        'm365_app_user_detail': ('beta', 'm365Activity_pk_pseudonym'),
        'teams_activity_user_detail': ('beta', 'teamsActivity_pk_pseudonym'),
        'meeting_attendance_report': ('v1.0', 'meetingUserId_pk_pseudonym'),
    }
    for item in oea.get_folders(tables_source):
        table_path = tables_source + '/' + item
        if item == 'metadata.csv':
            logger.info('ignore metadata processing, since this is not a table to be ingested')
            continue
        if item not in table_config:
            logger.info('No defined function for processing this Graph table.')
            continue

        api_version, primary_key = table_config[item]
        # The metadata globals are resolved here, at call time, so they only need to exist
        # once a matching table is actually processed.
        metadata = metadata_beta if api_version == 'beta' else metadata_v1p0
        try:
            refine_corrected('graph_api/' + api_version + '/' + item, metadata[item], primary_key)
        except AnalysisException as e:
            # This means the table may have not been properly refined due to errors with the
            # primary key not aligning with columns expected in the lookup table.
            # Report it instead of silently claiming success, then carry on with the next table.
            logger.warning('Failed to refine table: ' + item + ' from: ' + table_path + ' - ' + str(e))
            continue

        logger.info('Refined table: ' + item + ' from: ' + table_path)

StatementMeta(spark3p2med, 67, 8, Finished, Available)

In [9]:
# Load the pseudonymization metadata for both Graph API versions; refine_graph_dataset reads these globals.
metadata_beta = oea.get_metadata_from_url(
    'https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/modules/module_catalog/Microsoft_Graph/test_data/metadata_beta.csv'
)
metadata_v1p0 = oea.get_metadata_from_url(
    'https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/modules/module_catalog/Microsoft_Graph/test_data/metadata_v1p0.csv'
)

if testdataSet not in ('k12', 'hed'):
    logger.info('Unrecognized testdataSet - please choose either k12 or hed.')
else:
    # Both datasets refine the beta tables; only the HEd dataset also refines the v1.0 tables.
    refine_graph_dataset('stage2/Ingested_Corrected/graph_api/beta')
    if testdataSet == 'hed':
        refine_graph_dataset('stage2/Ingested_Corrected/graph_api/v1.0')
        logger.info('Finished refining Graph HEd dataset')
    else:
        logger.info('Finished refining Graph K-12 dataset')

StatementMeta(spark3p2med, 67, 9, Finished, Available)

2023-01-12 18:38:35,455 - OEA - INFO - Processed 512 updated rows from stage2/Ingested_Corrected/graph_api/beta/m365_app_user_detail into stage2/Refined
2023-01-12 18:38:36,474 - OEA - INFO - Refined table: m365_app_user_detail from: stage2/Ingested_Corrected/graph_api/beta/m365_app_user_detail
2023-01-12 18:38:45,103 - OEA - INFO - Processed 497 updated rows from stage2/Ingested_Corrected/graph_api/beta/teams_activity_user_detail into stage2/Refined
2023-01-12 18:38:45,607 - OEA - INFO - Refined table: teams_activity_user_detail from: stage2/Ingested_Corrected/graph_api/beta/teams_activity_user_detail
2023-01-12 18:38:54,621 - OEA - INFO - Processed 600 updated rows from stage2/Ingested_Corrected/graph_api/beta/users into stage2/Refined
2023-01-12 18:38:55,025 - OEA - INFO - Refined table: users from: stage2/Ingested_Corrected/graph_api/beta/users
2023-01-12 18:39:04,064 - OEA - INFO - Processed 10170 updated rows from stage2/Ingested_Corrected/graph_api/v1.0/meeting_attendance_report