# Reviewer_Tool_dev

JupyterReviewer is a package that integrates manual review processes into Jupyter notebooks and computational analysis workflows.

# ReviewData object

The `ReviewData` object stores all relevant information about the data you need to review. It is designed so that you can add or edit annotations for each item (row) over time. Features include:

- Organized subtables for data you want to edit, supplementary information to view, and history of changes
- Stores subtables automatically
- Prevents overwriting
- Easy to share or pass review to other users

Instantiating a `ReviewData` object requires a dataframe in which each row corresponds to an item you want to review (for example, a mutation or a sample's purity). Each row must have a unique index value.

In [456]:

import pandas as pd
import pathlib
import os
from IPython.display import display
from datetime import datetime, timedelta
import time

import plotly.express as px
from plotly.subplots import make_subplots
from jupyter_dash import JupyterDash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate
from dash import Dash, dash_table
import dash
import dash_bootstrap_components as dbc

In [457]:
import pandas as pd
from datetime import datetime
import os
import numpy as np
import warnings

from enum import Enum
class AnnotationType(Enum):
    """Kinds of input an annotation can take.

    Each member's value is the plain-string form of the type; the annotation
    panel in this notebook compares annotation types against these values to
    choose which input widget to render.
    """
    TEXT = 'text'  # single-line text input
    TEXTAREA = 'textarea'  # multi-line text area
    NUMBER = 'number'  # numeric input
    CHECKLIST = 'checklist'  # checklist built from the annotation's options
    RADIOITEM = 'radioitem'  # radio buttons built from the annotation's options

class ReviewDataAnnotation:
    """Description of one annotation column that a reviewer fills in.

    Attributes:
        name: column name of the annotation.
        annot_type: kind of input; an ``AnnotationType`` member or its string
            value (e.g. ``'number'``). Stored exactly as given.
        options: allowed values; empty means "no restriction".
        validate_input: optional callable taking the input and returning a
            boolean (True when the input is acceptable).
        default: initial value for the input.
    """

    def __init__(self, name,
                 annot_type: 'AnnotationType | str',
                 options=None,
                 validate_input=None,
                 default=None
                ):
        '''
        validate_input: a custom function to verify input. Returns a boolean
        '''
        self.name = name
        self.annot_type = annot_type
        # A fresh list per instance: a literal [] default would be shared by
        # every annotation created without explicit options.
        self.options = [] if options is None else options
        self.validate_input = validate_input
        self.default = default

    def validate(self, x):
        """Check ``x`` against ``options`` and ``validate_input``.

        ``x`` may be a single value or a (possibly nested) collection of
        values; every contained value must be one of ``options`` when options
        were given.

        Raises:
            ValueError: if a value is not among the options, or if
                ``validate_input(x)`` returns a falsy result.
        """
        if len(self.options) > 0:
            # dtype=object keeps every item as the original Python object;
            # without it numpy coerces mixed inputs such as [1, 'a'] to
            # strings and valid selections would be rejected.
            for item in np.array([x], dtype=object).flatten():
                if item not in self.options:
                    raise ValueError(f'Input {item} is not in the specified options {self.options} for annotation named {self.name}')

        if self.validate_input is not None and not self.validate_input(x):
            raise ValueError(f'Input {x} is invalid for annotation {self.name}. Check validate_input method')
        

class ReviewData:
    """Reviewable data plus its annotations and edit history, persisted to disk.

    Three tab-separated tables live in ``review_dir``:

    - ``data.tsv``: the items under review (one row per item).
    - ``annot.tsv``: one column per annotation, indexed like the data.
    - ``history.tsv``: one row per submitted change, with the item index and
      a timestamp.
    """

    # String values of the annotation types whose cells hold text or Python
    # objects (lists of checked options, option labels). An all-empty column
    # is read back from disk as float, so these columns are cast to object
    # before anything is written into them. The values mirror the
    # corresponding AnnotationType members.
    _OBJECT_ANNOT_TYPES = ('text', 'textarea', 'checklist', 'radioitem')

    def __init__(self,
                 review_dir: str, # path to directory to save info
                 df: pd.DataFrame, # may be None if directory above already exists.
                 annotate_data: 'list[ReviewDataAnnotation]', # annotations (name, type, options) to collect
                ):
        """Create a new review session, or reopen the one in ``review_dir``.

        Args:
            review_dir: directory holding the session files. Created
                (including missing parents) when it does not exist.
            df: items to review, one row per item with a unique index.
                Required for a new session; for an existing session it may be
                None, or a frame whose new columns are added to the data.
            annotate_data: annotation definitions; each needs ``name`` and
                ``annot_type`` attributes.

        Raises:
            ValueError: if ``df`` is None and ``review_dir`` does not exist.
        """
        annotate_cols = [ann.name for ann in annotate_data]
        self.annotate_data = annotate_data

        self.review_dir = review_dir
        self.data_fn = f'{review_dir}/data.tsv'
        self.annot_fn = f'{review_dir}/annot.tsv'
        self.history_fn = f'{review_dir}/history.tsv'

        if not os.path.isdir(self.review_dir):
            if df is None:
                raise ValueError(f'df is required to start a new review session: {review_dir} does not exist')
            os.makedirs(self.review_dir)
            self.data = df
            self.data.to_csv(self.data_fn, sep='\t')
            self.annot = pd.DataFrame(index=df.index, columns=annotate_cols) # Add more columns. If updating an existing column, will make a new one
            self.annot.to_csv(self.annot_fn, sep='\t')
            self.history = pd.DataFrame(columns=annotate_cols + ['index', 'timestamp']) # track all the manual changes, including time stamp
            self.history.to_csv(self.history_fn, sep='\t')
        else:
            self.data = pd.read_csv(self.data_fn, sep='\t', index_col=0)
            self.annot = pd.read_csv(self.annot_fn, sep='\t', index_col=0)
            self.history = pd.read_csv(self.history_fn, sep='\t', index_col=0)

        # Add annotation columns that were not part of the stored session
        for col in annotate_cols:
            if col not in self.annot.columns:
                self.annot[col] = np.nan

        for annot in self.annotate_data:
            # annot_type may be an enum member or its plain string value
            annot_type = getattr(annot.annot_type, 'value', annot.annot_type)
            if annot_type in self._OBJECT_ANNOT_TYPES:
                self.annot[annot.name] = self.annot[annot.name].astype(object)

        # Add additional columns to table; existing columns are never overwritten
        if df is not None and not df.equals(self.data):
            new_data_cols = [c for c in df.columns if c not in self.data.columns]
            not_new_data_cols = [c for c in df.columns if c in self.data.columns]
            for col in new_data_cols:
                self.data[col] = df[col]

            if not self.data[not_new_data_cols].equals(df[not_new_data_cols]):
                warnings.warn(f'Input data dataframe shares columns with existing data, but are not equal.\n'
                              f'Only adding columns {new_data_cols} to the ReviewData.data dataframe\n'
                              f'Remaining columns are not going to be updated.'
                              f'If you intend to change the ReviewData.data attribute, make a new session directory and prefill the annotation data')

    def pre_fill_annot(self, df: pd.DataFrame):
        """Copy existing annotation values from ``df`` into the annotation table.

        Only columns of ``df`` that are annotation columns are used; rows are
        matched on the index. The change is kept in memory and is written to
        disk by the next ``_update`` call.
        """
        shared_cols = [c for c in df.columns if c in self.annot.columns]
        self.annot.loc[df.index, shared_cols] = df[shared_cols]

    def _update(self, data_idx, series):
        """Record one submitted set of annotations for item ``data_idx``.

        Args:
            data_idx: index value of the item being annotated.
            series: mapping of annotation name -> value. Not modified.

        NOTE(review): values are stored as given; the annotations'
        ``validate`` method is not invoked here.
        """
        record = dict(series)  # work on a copy so the caller's mapping stays untouched
        for col, value in record.items():
            # cell-wise assignment so a list value (checklist) lands in one cell
            self.annot.at[data_idx, col] = value

        record['timestamp'] = datetime.today()
        record['index'] = data_idx
        new_row = pd.DataFrame([record])
        if self.history.empty:
            # keep the declared column order for the very first entry
            columns = list(self.history.columns) + [c for c in new_row.columns if c not in self.history.columns]
            self.history = new_row.reindex(columns=columns)
        else:
            # DataFrame.append was removed in pandas 2.0; concat is the replacement
            self.history = pd.concat([self.history, new_row], ignore_index=True)

        # write to file
        self.data.to_csv(self.data_fn, sep='\t')
        self.annot.to_csv(self.annot_fn, sep='\t')
        self.history.to_csv(self.history_fn, sep='\t')
        
        

In [458]:
# Cloud-storage (gs://) directory holding an earlier manual purity review session.
# NOTE(review): reading gs:// paths with pandas presumably needs gcsfs installed and credentials — confirm.
bucket_0c1_cchu_manual_purity_review_session_dir = 'gs://taml_vm_analysis/data/Full-Analysis/1_Full-Analysis-2022-02-22_pran3/0c1_Manual_Purity_Review_cchu'
# Load the review table; the first column of the TSV becomes the index.
cchu_purities_df = pd.read_csv(f'{bucket_0c1_cchu_manual_purity_review_session_dir}/manual_purity_review_table.tsv', sep='\t', index_col=0)
# Last expression of the cell: displays the dataframe in the notebook.
cchu_purities_df



Unnamed: 0_level_0,BETA_FLAG_not_enough_drivers,BETA_annot_maf_fn,BETA_clonal_muts,BETA_clonal_muts_genes,BETA_half_purity,BETA_has_beta_solution,BETA_num_clonal_drivers,BETA_ploidy,BETA_purity,BETA_purity_lower,...,manual_purity,manual_purity_lower,manual_purity_upper,manual_ploidy,manual_confidence,manual_flags,last_manual_update,manual_method,MAFLITE,VCF
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000725_ZS_2668,True,/home/cchu/cgaprojects_ibm_tAML_analysis/data/...,,,,False,,2.0,0.000,0.000,...,0.630,0.570,0.690,2.01,"No purity called, unsure",Post_Allo,2022-02-23 21:43:37.104717,Manual_Other,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...
005982_GD_1875,False,/home/cchu/cgaprojects_ibm_tAML_analysis/data/...,[0],['RTEL1:p.A1062T'],0.488,True,1.0,2.0,0.976,0.860,...,0.910,0.860,0.960,1.95,Confident,,2022-02-23 21:44:00.961518,Keep_auto_call,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...
012413_AT_1634,False,/home/cchu/cgaprojects_ibm_tAML_analysis/data/...,[0],['BRCA1:p.V772A'],0.512,True,2.0,2.0,1.024,0.800,...,1.024,0.800,1.244,2.00,"Purity called, unsure",,2022-02-23 21:44:51.038330,Manual_BETA,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...
016198_VX_1736,False,/home/cchu/cgaprojects_ibm_tAML_analysis/data/...,[0],['TERT:p.R756H'],0.430,True,1.0,2.0,0.860,0.760,...,0.860,0.760,0.964,2.00,"Purity called, unsure",No CNA,2022-02-23 21:51:14.522862,Manual_BETA,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...
022613_PU_3426,True,/home/cchu/cgaprojects_ibm_tAML_analysis/data/...,,,,False,,2.0,0.000,0.000,...,0.460,0.390,0.530,1.88,Confident,No_AML_drivers,2022-02-23 21:53:02.173752,Keep_auto_call,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PQ9867BM,False,/home/cchu/cgaprojects_ibm_tAML_analysis/data/...,"[0, 1]","['TP53:p.W53*', 'ZNF318:p.R1936S']",0.400,True,2.0,2.0,0.800,0.688,...,0.864,0.708,1.024,2.00,Confident,Used DFCI flags to change Beta solution,2022-03-08 21:01:12.668613,Manual_BETA,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...
SA04142016,True,/home/cchu/cgaprojects_ibm_tAML_analysis/data/...,,,,False,,2.0,0.000,0.000,...,0.920,0.870,0.970,1.83,Confident,,2022-03-08 21:01:38.015167,Keep_auto_call,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...
SM120519BM-H,True,/home/cchu/cgaprojects_ibm_tAML_analysis/data/...,,,,False,,2.0,0.000,0.000,...,0.000,0.000,0.000,0.00,"No purity called, unsure","No CNA,No AML drivers",2022-03-08 21:02:25.093167,Manual_Other,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...
WD10052017BM,False,/home/cchu/cgaprojects_ibm_tAML_analysis/data/...,[0],['CTC1:p.R731W'],0.700,True,2.0,2.0,1.400,0.732,...,0.528,0.464,0.596,2.00,"Purity called, unsure",No CNA,2022-03-08 21:04:18.622309,Manual_BETA,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...,gs://fc-fed5ee4d-4de5-429a-b88e-681cde1f0558/a...


In [459]:
# Local directory where the tutorial review session (data, annotations, history) is stored.
test_rd_dir = '/home/cchu/cgaprojects_ibm_tAML_analysis/data/test_getzlab-JupyterReviewer/Reviewer_Tutorial'
# Create the session, or reopen it if the directory already exists, with four annotations:
#   purity      - number, custom check that the value is below 0.5
#   rating      - number restricted to 0..9
#   description - free text
#   class       - radio item with choices 'Option 0' .. 'Option 3'
test_rd = ReviewData(review_dir=test_rd_dir,
                     df = cchu_purities_df, # optional if directory above already exists. 
                     annotate_data = [ReviewDataAnnotation('purity', 'number', validate_input=lambda x: x < 0.5),
                                      ReviewDataAnnotation('rating', 'number', options=range(10)),
                                      ReviewDataAnnotation('description', 'text'),
                                      ReviewDataAnnotation('class', 'radioitem', options=[f'Option {n}' for n in range(4)]),])
                     
# Earlier dict-based way of declaring annotations, kept for reference only:
#                      {'purity': 'number', 
#                                       'class': 'text', 
#                                       'rating': 'number', 
#                                       'description': 'text', 
#                                       'another_annot_col': 'checklist'})
# Show the first rows of the annotation table.
test_rd.annot.head()

Unnamed: 0_level_0,purity,class,rating,description
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
000725_ZS_2668,2.0,Option 3,2.0,asdf
005982_GD_1875,0.1,Option 0,12.0,asdfasdf
012413_AT_1634,0.4,Option 2,10.0,asdfsadf
016198_VX_1736,,,,
022613_PU_3426,,,,


# Simple widgets notebook reviewer

You can use IPython widgets to add interactivity.

# ReviewDataApp object

Use Plotly Dash to create advanced dashboards for visualizing and interacting with your data. The app wraps around any `ReviewData` object, so the dashboard is easy to edit and change as needed without undoing the underlying annotations stored in the `ReviewData`.

In [460]:
class AppComponent:
    """A named group of Dash elements plus the callback wiring that updates it.

    Attributes:
        name: identifier of the component; used as a key when the app groups
            callback inputs and outputs per component.
        component: the given elements wrapped in a single ``html.Div``.
        callback: function computing the component's new output(s), or None.
        callback_output / callback_input / callback_state: Dash ``Output`` /
            ``Input`` / ``State`` objects (a list, or a dict keyed by name).
    """

    def __init__(self, name, components, callback=None, callback_output=None, callback_input=None, callback_state=None):
        self.name = name
        self.component = html.Div(components)
        self.callback = callback
        # None means "nothing supplied"; each instance gets its own empty
        # list instead of sharing one mutable default between all components.
        self.callback_output = [] if callback_output is None else callback_output
        self.callback_input = [] if callback_input is None else callback_input
        self.callback_state = [] if callback_state is None else callback_state

        # TODO: option to update anotations
        # TODO: reset function (switching samples) and a page function
    
class TestApp:
    """Dash dashboard for reviewing the items of a ReviewData object.

    The layout has a dropdown to pick the item, a panel with one input per
    annotation and a submit button, a table with the item's annotation
    history, and any extra components supplied by the user.
    """

    def __init__(self, review_data: 'ReviewData', components: 'list[AppComponent]' = None, host='0.0.0.0', port=8051):
        """
        review_data: the ReviewData object to display and annotate
        components: extra AppComponent objects shown below the annotation panel
        host, port: stored on the instance.
            NOTE(review): run_app takes its own host/port arguments and does
            not read these attributes.
        """
        self.prop = None
        # Copy the list: add_table() appends to it, which must not leak into
        # the caller's list nor into other TestApp instances.
        self.more_components = [] if components is None else list(components)
        self.review_data = review_data
        self.host = host
        self.port = port

        # TODO: check component ids are not duplicated

    @staticmethod
    def _annot_type_value(annot):
        """Return the annotation type as a plain string (enum members give their value)."""
        return getattr(annot.annot_type, 'value', annot.annot_type)

    @classmethod
    def _annot_input_id(cls, annot):
        """Return the Dash component id of the input widget for ``annot``."""
        return f"APP-{annot.name}-{cls._annot_type_value(annot)}-input-state"

    def _history_table(self, data_idx):
        """Return a table with the history rows recorded for item ``data_idx``."""
        history = self.review_data.history
        return dbc.Table.from_dataframe(history.loc[history['index'] == data_idx])

    def run_app(self, mode, host='0.0.0.0', port=8050):
        """Build the layout, register the single app callback and start the server.

        mode: passed through to JupyterDash.run_server (e.g. 'external')
        host, port: address the server listens on
        """
        app = JupyterDash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
        app.layout = self.gen_layout()

        # The annotation-panel outputs are keyed by annotation name, so the
        # values returned by the callback must use exactly these keys.
        annot_names = [annot.name for annot in self.review_data.annotate_data]

        @app.callback(output=dict(data_id_header=Output('APP-submit-button-result', 'children'),
                                  history_table=Output('APP-history-table', 'children'),
                                  annot_panel=self.annotation_panel_component.callback_output,
                                  more_component_outputs={c.name: c.callback_output for c in self.more_components}
                                 ),
                      inputs=dict(dropdown_value=Input('APP-dropdown-data-state', 'value'),
                                  submit_annot_button=Input('APP-submit-button-state', 'n_clicks'),
                                  annot_input_state=self.annotation_panel_component.callback_state,
                                  more_component_inputs={c.name: c.callback_input for c in self.more_components}
                                 )
                     ) # TODO: add back more components
        def component_callback(dropdown_value, submit_annot_button,
                               annot_input_state,
                               more_component_inputs):

            ctx = dash.callback_context
            if not ctx.triggered:
                raise PreventUpdate
            # id of the component whose property change fired the callback
            prop_id = ctx.triggered[0]['prop_id'].split('.')[0]

            print(f'prop_id: {prop_id}')
            print(submit_annot_button)

            # Start from "change nothing" and fill in only what this trigger affects
            output_dict = {'data_id_header': dash.no_update,
                           'history_table': dash.no_update,
                           'annot_panel': {name: dash.no_update for name in annot_names},
                           'more_component_outputs': {c.name: [dash.no_update for _ in range(len(c.callback_output))]
                                                      for c in self.more_components}}

            if prop_id == 'APP-dropdown-data-state':
                # A new item was selected: refresh every extra component,
                # show the item's history and clear the annotation inputs.
                for component in self.more_components:
                    # reset vs row dependent
                    component_output = component.callback(*more_component_inputs[component.name])
                    output_dict['more_component_outputs'][component.name] = [component_output] # force this? specify names in the callback outputs?

                output_dict['data_id_header'] = dropdown_value
                output_dict['history_table'] = self._history_table(dropdown_value)
                output_dict['annot_panel'] = {name: '' for name in annot_names} # TODO set defaults?

            elif prop_id == 'APP-submit-button-state' and (submit_annot_button or 0) > 0:
                # Submit pressed: store the annotations and refresh the history
                self.review_data._update(dropdown_value, annot_input_state)
                output_dict['history_table'] = self._history_table(dropdown_value)

            else:
                # identify component that changed and which outputs are changed
                for component in self.more_components:
                    if any(c.component_id == prop_id for c in component.callback_input):
                        component_output = component.callback(*more_component_inputs[component.name])
                        output_dict['more_component_outputs'][component.name] = [component_output] # force this? specify names in the callback outputs?

            return output_dict

        app.run_server(mode=mode, host=host, port=port, debug=True)

    def gen_annotation_panel_component(self):
        """Build the annotation panel: one labelled input per annotation plus the submit button.

        Returns an AppComponent whose callback_output / callback_state map each
        annotation name to the 'value' property of its input widget.

        Raises ValueError for an annotation type without a matching widget.
        """
        annotation_data = self.review_data.annotate_data

        submit_annot_button = html.Button(id='APP-submit-button-state', n_clicks=0, children='Submit')
        submit_annot_result = html.H1('Data', id='APP-submit-button-result')

        # history panel

        def annotation_input(annot: 'ReviewDataAnnotation'):
            """Return a row holding the label and the input widget for one annotation."""
            input_component_id = self._annot_input_id(annot)
            annot_type = self._annot_type_value(annot)

            if annot_type == AnnotationType.TEXTAREA.value:
                input_component = dbc.Textarea(size="lg",
                                               id=input_component_id,
                                               value=annot.default,
                                              )
            elif annot_type == AnnotationType.TEXT.value:
                input_component = dbc.Input(type="text",
                                            id=input_component_id,
                                            placeholder=f"Enter {annot.name}",
                                            value=annot.default,
                                           )
            elif annot_type == AnnotationType.NUMBER.value:
                input_component = dbc.Input(type="number",
                                            id=input_component_id,
                                            placeholder=f"Enter {annot.name}",
                                            value=annot.default,
                                           )
            elif annot_type == AnnotationType.CHECKLIST.value:
                input_component = dbc.Checklist(options=[{"label": f, "value": f} for f in annot.options],
                                                id=input_component_id,
                                                value=annot.default)
            elif annot_type == AnnotationType.RADIOITEM.value:
                # TODO: how to add in options
                input_component = dbc.RadioItems(options=[{"label": f, "value": f} for f in annot.options],
                                                 value=annot.default,
                                                 id=input_component_id,
                                                )
            else:
                raise ValueError(f'Invalid annotation type "{annot.annot_type}"')

            return dbc.Row([dbc.Label(annot.name, html_for=input_component_id, width=2), dbc.Col(input_component)])

        return AppComponent(name='APP-Panel',
                            components=[annotation_input(annot) for annot in annotation_data] +
                                       [submit_annot_button, submit_annot_result],
                            callback_output={annot.name: Output(self._annot_input_id(annot), "value") for annot in annotation_data},
                            callback_input=[Input('APP-submit-button-state', 'n_clicks')],
                            callback_state={annot.name: State(self._annot_input_id(annot), "value") for annot in annotation_data}
                           )

    def gen_layout(self):
        """Assemble the page layout and store the dropdown, history and annotation components on self."""
        dropdown = html.Div(dcc.Dropdown(options=self.review_data.data.index,
                                         value=self.review_data.data.index[0],
                                         id='APP-dropdown-data-state'))

        self.dropdown_component = AppComponent(name='APP-dropdown-component',
                                               components=[dropdown])

        # Starts as an empty table with the history columns; filled by the callback
        history_table = html.Div([dbc.Table.from_dataframe(pd.DataFrame(columns=self.review_data.history.columns))], id='APP-history-table')
        self.history_component = AppComponent(name='APP-history-component',
                                              components=[history_table])

        self.annotation_panel_component = self.gen_annotation_panel_component()

        layout = html.Div([dbc.Row(self.dropdown_component.component, justify='end'),
                           dbc.Row([dbc.Col(self.annotation_panel_component.component),
                                    dbc.Col(self.history_component.component)
                                   ]),
                           dbc.Row([dbc.Row(c.component) for c in self.more_components])
                          ])

        return layout

    def add_table(self, component_name, col, table_cols):
        """Add a component showing a table read from a per-item file.

        component_name: id of the table container and name of the component
        col: column of review_data.data holding the path of a tab-separated file
        table_cols: columns of that file to display

        The table initially shows the first item's file and is re-read
        whenever the item dropdown changes.
        """
        table = html.Div(dbc.Table.from_dataframe(pd.read_csv(self.review_data.data.iloc[0][col], sep='\t')[table_cols]),
                         id=component_name)
        table_component = AppComponent(component_name, [table],
                                       lambda x: dbc.Table.from_dataframe(pd.read_csv(self.review_data.data.loc[x, col], sep='\t')[table_cols]),
                                       callback_output=[Output(component_name, 'children')],
                                       callback_input=[Input('APP-dropdown-data-state', 'value')]
                                      )
        self.more_components.append(table_component)
    
    

In [461]:

# Demo component 1: radio buttons for choosing the manual purity method.
# The initially selected value is 'Keep_auto_call'.
radio_component = html.Div(
                            [
                                dbc.Label("Manual purity method"),
                                dbc.RadioItems(
                                    options=[
                                        {"label": "Keep auto call", "value": 'Keep_auto_call'},
                                        {"label": "Manual ABSOLUTE", "value": 'Manual_ABSOLUTE'},
                                        {"label": "Manual BETA", "value": 'Manual_BETA'},
                                        {"label": "Manual Other", "value": 'Manual_Other'},
                                    ],
                                    value='Keep_auto_call',
                                    id="purity-manual-method-radioitems",
                                ),
                            ]
                        )

# Header used below as the output target of the radio buttons above.
output_component = html.H1('Data', id='header-sample-id')


# Demo component 2: same radio buttons and header with distinct ids.
# Neither radio_component_2 nor output_component_2 is used further in this cell.
radio_component_2 = html.Div(
                            [
                                dbc.Label("Manual purity method 2"),
                                dbc.RadioItems(
                                    options=[
                                        {"label": "Keep auto call", "value": 'Keep_auto_call'},
                                        {"label": "Manual ABSOLUTE", "value": 'Manual_ABSOLUTE'},
                                        {"label": "Manual BETA", "value": 'Manual_BETA'},
                                        {"label": "Manual Other", "value": 'Manual_Other'},
                                    ],
                                    value='Keep_auto_call',
                                    id="purity-manual-method-radioitems-2",
                                ),
                            ]
                        )

output_component_2 = html.H1('Data 2', id='header-sample-id-2')

def print_select(v):
    """Return ``v`` unchanged; used as a component callback that echoes the selected value."""
    return v

# Wire the radio buttons to the header: the selected value is written to the
# header's children through print_select.
a_component = AppComponent('a_component',
                           [radio_component, output_component], 
                           print_select, 
                           callback_output=[Output('header-sample-id', 'children')], 
                           callback_input=[Input('purity-manual-method-radioitems', 'value')])


test_app = TestApp(test_rd, [a_component])


# Add tables built from the file path stored in the 'DFCI_local_sample_dfci_maf_fn' column.
# NOTE(review): both calls use the component name 'maf', so two components share one
# id/name — this looks unintended; confirm and give the second table its own name.
test_app.add_table('maf', 'DFCI_local_sample_dfci_maf_fn', ['Hugo_Symbol', 'Chromosome', 't_alt_count', 't_ref_count', 'Tumor_Sample_Barcode'])
test_app.add_table('maf', 'DFCI_local_sample_dfci_maf_fn', ['Hugo_Symbol', 'Chromosome', 't_alt_count', 't_ref_count', 'Tumor_Sample_Barcode', 'Start_Position'])

# Start the dashboard server on port 8056.
test_app.run_app(mode='external', port=8056)


Exception in thread Thread-6873:
Traceback (most recent call last):
  File "/usr/lib/python3.7/threading.py", line 917, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.7/threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "/home/cchu/.local/lib/python3.7/site-packages/retrying.py", line 49, in wrapped_f
    return Retrying(*dargs, **dkw).call(f, *args, **kw)
  File "/home/cchu/.local/lib/python3.7/site-packages/retrying.py", line 212, in call
    raise attempt.get()
  File "/home/cchu/.local/lib/python3.7/site-packages/retrying.py", line 247, in get
    six.reraise(self.value[0], self.value[1], self.value[2])
  File "/usr/lib/python3/dist-packages/six.py", line 693, in reraise
    raise value
  File "/home/cchu/.local/lib/python3.7/site-packages/retrying.py", line 200, in call
    attempt = Attempt(fn(*args, **kwargs), attempt_number, False)
  File "/home/cchu/.local/lib/python3.7/site-packages/jupyter_dash/jupyter_app.py", line 292, in run


OSError: Address 'http://0.0.0.0:8056' already in use.
    Try passing a different port to run_server.

In [None]:
# Sketch of an intended workflow (this cell has no execution count).
# NOTE(review): the TestApp class in this notebook defines no add_annot_input or
# add_more_components method, and list_of_components_predefined is not defined
# in the visible cells — confirm before running.
test_app = TestApp(test_rd)
test_app.add_annot_input()
test_app.add_more_components([list_of_components_predefined])
test_app.run_app(mode='external')