<a target="_blank" href="https://colab.research.google.com/github/giordamaug/HELP/blob/main/HELPpy/notebooks/prediction.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
<a target="_blank" href="https://www.kaggle.com/notebooks/welcome?src=https://github.com/giordamaug/HELP/blob/main/HELPpy/notebooks/prediction.ipynb">
  <img src="https://kaggle.com/static/images/open-in-kaggle.svg" alt="Open In Kaggle"/>
</a>

### 1. Install HELP from GitHub
Skip this cell if you have already installed HELP.

In [None]:
!pip install git+https://github.com/giordamaug/HELP.git

### 2. Download the input files
For a chosen tissue (here `Kidney`), download from GitHub the label file (here `Kidney_HELP.csv`, computed as in Example 1) and the attribute files (here BIO `Kidney_BIO.csv`, CCcfs `Kidney_CCcfs_0.csv`, ..., `Kidney_CCcfs_4.csv`, and N2V `Kidney_EmbN2V_128.csv`).  

Skip this step if you already have these input files locally.

In [None]:
# Tissue whose input files are fetched; its name is interpolated into every URL below.
tissue='Kidney'
# Label file for the tissue.
!wget https://raw.githubusercontent.com/giordamaug/HELP/main/data/{tissue}_HELP.csv
# BIO attribute file.
!wget https://raw.githubusercontent.com/giordamaug/HELP/main/data/{tissue}_BIO.csv
# CCcfs attribute files: one request per index produced by range(5), i.e. suffixes _0 to _4.
for i in range(5):
  !wget https://raw.githubusercontent.com/giordamaug/HELP/main/data/{tissue}_CCcfs_{i}.csv
# N2V embedding attribute file.
!wget https://raw.githubusercontent.com/giordamaug/HELP/main/data/{tissue}_EmbN2V_128.csv

In [20]:
import ipywidgets as wid
from typing import List
import matplotlib.pyplot as plt
from HELPpy.models.labelling import labelling
from HELPpy.utility.selection import select_cell_lines, delrows_with_nan_percentage
from HELPpy.preprocess.loaders import feature_assemble
import pandas as pd
import numpy as np
import os
from ipyfilechooser import FileChooser
from IPython.display import HTML as html_print
from IPython.display import display

def cstr(s, color='black'):
    """Wrap ``s`` in a ``<text>`` tag whose inline style sets the given color."""
    return f"<text style=color:{color}>{s}</text>"

def print_color(t):
    """Display an iterable of (text, color) pairs as one line of colored HTML.

    Each pair is turned into a colored fragment by ``cstr``; the fragments are
    joined with single spaces and rendered through IPython's HTML display.
    """
    fragments = []
    for text, color in t:
        fragments.append(cstr(text, color=color))
    display(html_print(' '.join(fragments)))
    
def pipeline(path: str=os.getcwd(), filename: str='', modelname:str='', rows: int=5, minlines=10, percent = 100.0, line_group='OncotreeLineage', line_col='ModelID', verbose=False):
    """
    Build and display a tabbed widget that loads CRISPR data, filters it and labels genes.

    The widget has five tabs: file input, NaN removal, line filtering, labelling
    and saving. All data produced along the way is published through the
    returned ``ValueWidget``.

    Parameters
    ----------
    path : str
        path for input file loading.
    filename : str
        name of CRISPR effect input file. When non-empty the file is loaded
        immediately; otherwise it must be picked with the file chooser.
    modelname : str
        name of Model input file. When non-empty the file is loaded
        immediately; otherwise it must be picked with the file chooser.
    rows : int, optional
        The number of rows to display in the widget for selecting tissues (default is 5).
    minlines : int, optional
        Initial value of the "Min lines" slider: minimum number of cell lines a
        tissue/lineage must have to be listed (default is 10). Must lie in 1..99.
    percent : float, optional
        Initial value of the "Nan %" slider, passed as ``perc`` to
        ``delrows_with_nan_percentage`` (default is 100.0). Must lie in 0..100.
    line_group : str, optional
        The column of the Model file preselected for tissue selection
        (default is 'OncotreeLineage'). If the Model file has no such column,
        its first column is preselected instead.
    line_col : str, optional
        The column of the Model file holding the line identifiers (default is 'ModelID').
    verbose : bool, optional
        Forwarded to ``select_cell_lines`` and ``labelling`` (default is False).

    Returns
    -------
    ipywidgets.ValueWidget
        Widget whose ``value`` is the tuple
        ``(labels, filtered effect table, unfiltered effect table, model table)``.
        ``labels`` is an empty DataFrame until "Apply ..." has been clicked; the
        other three entries are ``None`` until the corresponding file is loaded.
    """
    # Shared state read and replaced by every callback below.
    val = wid.ValueWidget()
    val.value = pd.DataFrame(), None, None, None

    out1 = wid.Output()   # labelling status / current selection
    out2 = wid.Output()   # whatever is printed while labelling runs
    out3 = wid.Output()   # NaN-removal report
    out4 = wid.Output()   # save status

    # Labelling choice -> (mode, n_classes, labelnames) handed to `labelling`.
    label_modes = {
        "E|NE": ('flat-multi', 2, {0: 'E', 1: 'NE'}),
        "E|aE|sNE": ('flat-multi', 3, {0: 'E', 1: 'aE', 2: 'sNE'}),
        "E|(aE|sNE)": ('two-by-two', 3, {0: 'E', 1: 'aE', 2: 'sNE'}),
    }

    def show_status(out, message, color):
        """Replace the content of an output area with one colored message."""
        with out:
            out.clear_output()
            print_color(((message, color),))

    def report_nan_removal(df_orig, df):
        """Tell how many rows the NaN filter removed."""
        show_status(out3, f'Removed {len(df_orig)-len(df)}/{len(df_orig)} rows (with at least {nanrem_set.value}% NaN)', 'green')

    def refresh_groups():
        """Repopulate the group list from the loaded tables (no-op until both are loaded).

        A group is listed when at least `minline_set.value` of its lines are
        columns of the effect table. Raises if the model table cannot be
        grouped by the column chosen in the Selector dropdown.
        """
        df, df_map = val.value[1], val.value[3]
        if df is None or df_map is None:
            return
        column = selselector.value
        groups = [group for group in np.unique(df_map[column].dropna().values)
                  if len(np.intersect1d(df.columns, df_map[df_map[column] == group][line_col].values)) >= minline_set.value]
        seltissue.description = column
        seltissue.options = ['__all__'] + groups
        seltissue.value = ['__all__']

    def load_effect_file(chooser):
        """Read the chosen CRISPR effect file, apply the NaN filter and publish both tables."""
        try:
            df_orig = pd.read_csv(chooser.selected).rename(columns={'Unnamed: 0': 'gene'}).rename(columns=lambda x: x.split(' ')[0]).set_index('gene').T
            df = delrows_with_nan_percentage(df_orig, perc=float(nanrem_set.value))
            val.value = val.value[0], df, df_orig, val.value[3]
            report_nan_removal(df_orig, df)
        except Exception:
            chooser._label.value = chooser._LBL_TEMPLATE.format(f'Problem loading {chooser.selected} file ...', 'red')
            return
        try:
            # Uses the model table stored in the shared state, so the list is
            # filled in whichever order the two files are chosen.
            refresh_groups()
        except Exception as exc:
            # The effect file itself is fine; only the group list could not be built.
            show_status(out1, f'Problem listing the groups of lines: {exc}', 'red')
        chooser._label.value = chooser._LBL_TEMPLATE.format(f'{chooser.selected}', 'green')

    def load_model_file(chooser):
        """Read the chosen Model file, publish it and offer its columns as selectors."""
        try:
            df_map = pd.read_csv(chooser.selected)
            val.value = val.value[0], val.value[1], val.value[2], df_map
        except Exception:
            chooser._label.value = chooser._LBL_TEMPLATE.format(f'Problem loading {chooser.selected} file ...', 'red')
            return
        try:
            selselector.options = list(df_map.columns)
            selselector.value = line_group if line_group in selselector.options else selselector.options[0]
            # Explicit refresh: the dropdown observer does not fire when the
            # same options/value are assigned again (e.g. file reloaded).
            refresh_groups()
        except Exception:
            chooser._label.value = chooser._LBL_TEMPLATE.format(f'Problem reading {chooser.selected} file ...', 'red')
            return  # keep the error visible instead of overwriting it with the success label
        chooser._label.value = chooser._LBL_TEMPLATE.format(f'{chooser.selected}', 'green')

    # ---- NaN removal -------------------------------------------------------
    nanrem_set = wid.SelectionSlider(
        options=range(0, 101),
        value=int(percent),
        description='Nan %:',
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        tooltip='set percentage of nan allowed in genes',
    )
    def nanrem_set_changed(b):
        df_orig = val.value[2]
        if df_orig is None:
            return  # no effect file loaded yet
        try:
            df = delrows_with_nan_percentage(df_orig, perc=float(nanrem_set.value))
        except Exception as exc:
            show_status(out3, f'Problem removing NaN rows: {exc}', 'red')
            return
        val.value = val.value[0], df, df_orig, val.value[3]
        report_nan_removal(df_orig, df)
    nanrem_set.observe(nanrem_set_changed, names='value')

    # ---- Line filtering ----------------------------------------------------
    minline_set = wid.SelectionSlider(
        options=range(1, 100),
        value=minlines,
        description='Min lines:',
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        tooltip='set minimum number of lines for the tissue',
    )
    def minline_set_changed(b):
        try:
            refresh_groups()
        except Exception as exc:
            show_status(out1, f'Problem listing the groups of lines: {exc}', 'red')
    minline_set.observe(minline_set_changed, names='value')

    selselector = wid.Dropdown(
        options=['__all__'],
        value='__all__',
        description='Selector:',
        tooltip = 'select the group type of lines',
        disabled=False,
    )
    def selselector_changed(b):
        try:
            refresh_groups()
        except Exception as exc:
            show_status(out1, f'Problem listing the groups of lines: {exc}', 'red')
            return
        with out1:
            out1.clear_output()
            display(selselector.value)
    selselector.observe(selselector_changed, names='value')

    # Starts empty; it is filled once both input files are loaded.
    seltissue = wid.SelectMultiple(
        options=[],
        value=[],
        rows=rows,
        description=line_group if line_group in selselector.options else '',
        tooltip = 'select lines by the chosen group',
        disabled=False
    )
    def seltissue_changed(b):
        if seltissue.value != ():
            # Propose a file name that mirrors the current selection.
            if seltissue.value == ('__all__',):
                save_textbox.value = "PanTissue.csv"
            else:
                name_parts = [str(s).replace(' ', '-').replace('/', '-') for s in seltissue.value if str(s) != '__all__']
                save_textbox.value = '_'.join(name_parts) + '.csv'
            with out1:
                out1.clear_output()
                display(seltissue.value)
    seltissue.observe(seltissue_changed, names='value')

    selmode_button = wid.Checkbox(
        value=False,
        description='Nested',
        disabled=False,
        indent=False
    )

    # ---- Saving ------------------------------------------------------------
    save_textbox = wid.Text(
        value="",
        description='',
    )
    save_textbox.layout = wid.Layout(visibility = 'visible')
    saveto_but = wid.Button(description="Save ...", button_style='primary')
    def on_savebutton_clicked(b):
        if save_textbox.value == '':
            show_status(out4, 'Set a non empty filename!', 'red')
            return
        try:
            val.value[0].to_csv(save_textbox.value, index=True)
        except Exception:
            show_status(out4, 'Problem saving label file (maybe empty)!', 'red')
            return
        show_status(out4, f'Saved dataset to file: {save_textbox.value}.', 'green')
    saveto_but.on_click(on_savebutton_clicked)

    # ---- Labelling ---------------------------------------------------------
    mode_buttons = wid.ToggleButtons(
        options=["E|NE", "E|aE|sNE", "E|(aE|sNE)"],
        button_style='success',
        description='',
        tooltips=['2 classes (one division)', '3 classes (one division)', '3 classes (two-times subdivision)'],
    )
    button = wid.Button(description="Apply ...", button_style='primary')
    def on_button_clicked(b):
        df, df_map = val.value[1], val.value[3]
        if df is None or df_map is None:
            show_status(out1, 'Load both the CRISPR effect file and the Model file first!', 'red')
            return
        show_status(out1, f'Labelling {len(df)} genes of {seltissue.value} ...', 'orange')
        # '__all__' alone means every listed group; otherwise use the picked ones.
        candidates = seltissue.options if seltissue.value == ('__all__',) else seltissue.value
        selector = [x for x in candidates if x != '__all__']
        mode, nclasses, labelnames = label_modes[mode_buttons.value]
        failure = None
        labels = None
        with out2:
            out2.clear_output()
            try:
                # Group by the column the listed groups were taken from, so the
                # selection and the grouping column always agree.
                cell_lines = select_cell_lines(df, df_map, selector, line_group=selselector.value, line_col=line_col,
                                               nested = selmode_button.value, verbose=verbose)
                labels = labelling(df, columns=cell_lines, mode=mode, n_classes=nclasses, labelnames=labelnames, verbose=verbose)
            except Exception as exc:
                failure = exc
        if failure is not None:
            # Do not report success, and keep the previously published labels.
            show_status(out1, f'Labelling failed: {failure}', 'red')
            return
        val.value = labels, df, val.value[2], val.value[3]
        show_status(out1, 'DONE', 'green')
    button.on_click(on_button_clicked)

    # ---- File input --------------------------------------------------------
    # Every widget exists at this point, so files named by the caller can be
    # loaded right away with the same code the choosers use.
    if filename != '':
        fc1 = FileChooser(path, title='Choose CRISPR effect file', filter='*.csv', filename=filename, select_default=True)
        load_effect_file(fc1)
    else:
        fc1 = FileChooser(path, title='Choose CRISPR effect file', filter='*.csv')
    fc1.register_callback(load_effect_file)

    if modelname != '':
        fc2 = FileChooser(path, title='Choose Model file', filter='*.csv', filename=modelname, select_default=True)
        load_model_file(fc2)
    else:
        fc2 = FileChooser(path, title='Choose Model file', filter='*.csv')
    fc2.register_callback(load_model_file)

    # ---- Layout ------------------------------------------------------------
    Vb1 = wid.VBox([nanrem_set, out3])
    Vb2 = wid.VBox([fc1, fc2])
    Vb3 = wid.VBox([minline_set, selselector, wid.HBox([seltissue, selmode_button])])
    Vb4 = wid.VBox([wid.VBox([mode_buttons, wid.HBox([button, out1])]),  out2])
    Vb5 = wid.VBox([wid.HBox([saveto_but, save_textbox, out4])])
    tabs = wid.Tab((Vb2, Vb1, Vb3 ,Vb4, Vb5))
    tabs.set_title(0, 'File input')
    tabs.set_title(1, 'NaN removal')
    tabs.set_title(2, 'Line filtering')
    tabs.set_title(3, 'Labelling')
    tabs.set_title(4, 'Saving')
    display(tabs)
    return val

Note that the CCcfs file has been split into 5 separate files because of storage limitations on GitHub. 

### 3. Load the input files
Load the CRISPR data and show the content.

In [21]:
from HELPpy.visualization.ui import Help_Dashboard

# Open the labelling widget on the CRISPR effect and Model files found in the
# data folder; `w.value` carries the tables and labels produced through the widget.
w = pipeline(
    path='../../data',
    filename='CRISPRGeneEffect.csv',
    modelname='Model.csv',
    line_group='OncotreeLineage',
    percent=100,
    verbose=False,
)

Tab(children=(VBox(children=(FileChooser(path='/Users/maurizio/HELP/data', filename='CRISPRGeneEffect.csv', ti…

In [4]:
# w.value[1] is the NaN-filtered effect table (its length is what the widget reports as
# the number of genes) and w.value[0] holds the labels, which stay empty until
# "Apply ..." has been clicked in the Labelling tab.
len(w.value[1]), w.value[0].value_counts()

(17931,
 label
 NE       16666
 E         1265
 dtype: int64)