<a target="_blank" href="https://colab.research.google.com/github/giordamaug/HELP/blob/main/HELPpy/notebooks/gui.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
<a target="_blank" href="https://www.kaggle.com/notebooks/welcome?src=https://github.com/giordamaug/HELP/blob/main/HELPpy/notebooks/gui.ipynb">
  <img src="https://kaggle.com/static/images/open-in-kaggle.svg" alt="Open In Kaggle"/>
</a>

### 1. Install HELP from GitHub
Skip this cell if you have already installed HELP.

In [None]:
!pip install git+https://github.com/giordamaug/HELP.git

### 2. Download the input files
For a chosen tissue (here `Kidney`), download from GitHub the label file (here `Kidney_HELP.csv`, computed as in Example 1) and the attribute files (here BIO `Kidney_BIO.csv`, CCcfs `Kidney_CCcfs_0.csv`, ..., `Kidney_CCcfs_4.csv`, and N2V `Kidney_EmbN2V_128.csv`).  

Skip this step if you already have these input files locally.

In [None]:
# Tissue whose files are fetched; its value is interpolated into the URLs below through {tissue}.
tissue='Kidney'
# Label file of the tissue.
!wget https://raw.githubusercontent.com/giordamaug/HELP/main/data/{tissue}_HELP.csv
# BIO attribute file.
!wget https://raw.githubusercontent.com/giordamaug/HELP/main/data/{tissue}_BIO.csv
# CCcfs attribute files: range(5) requests the files numbered 0 to 4.
for i in range(5):
  !wget https://raw.githubusercontent.com/giordamaug/HELP/main/data/{tissue}_CCcfs_{i}.csv
# N2V embedding attribute file.
!wget https://raw.githubusercontent.com/giordamaug/HELP/main/data/{tissue}_EmbN2V_128.csv

In [1]:
import ipywidgets as wid
from typing import List
import matplotlib.pyplot as plt
from HELPpy.models.labelling import labelling
from HELPpy.utility.selection import select_cell_lines, delrows_with_nan_percentage
from HELPpy.utility.utils import in_notebook, pandas_readcsv, pandas_writecsv
from HELPpy.preprocess.loaders import load_features
from HELPpy.models.prediction import VotingSplitClassifier, k_fold_cv
import pandas as pd
import numpy as np
import os, glob
from ipyfilechooser import FileChooser
from HELPpy.visualization.filecollector import FileCollector
from IPython.display import HTML as html_print
from IPython.display import display
from HELPpy.visualization.plot import svenn_intesect
from typing import List, Sequence, Iterable, Optional
import fnmatch
import traceback

# Text labels used by the GUI built in mypipeline (button captions, accordion/tab
# titles and status messages). Keeping them here avoids repeating literals in the callbacks.

# Generic captions/status.
_LB_APPLY = 'Apply on'
_LB_DONE = 'DONE'
# Section titles.
_LB_NANREM = 'Nan Removal'
_LB_FILTER = "Line Filtering"
_LB_LABEL = "Labelling"
_LB_SAVE = "Saving"
_LB_INPUT = "File input"
# File-selection titles: the "Select ..." form is shown when nothing is chosen,
# the "Change ..." form once a file/directory has been selected.
_LB_SELGENE = "Select genes files"
_LB_CNGGENE = "Change genes files"
_LB_SELATTR = "Select attribute files"
_LB_CNGATTR = "Change attribute files"
_LB_SELGENE_SUB = "Select genes to subtract"
_LB_CNGGENE_SUB = "Change genes to subtract"
_LB_SEL_LAB = "Select labelling file"
_LB_CNG_LAB = "Change labelling file"
# Presumably the titles of the main tabs (their use is outside the visible code) -- TODO confirm.
_LB_PREPROC = "Preprocessing"
_LB_PREDICT = "Prediction"
_LB_INTERSET = "Intersection"
_LB_IDENTIFY = "Identification"
# Titles of the two input file choosers (CRISPR effect file and Model file).
_LB_CNG_FILE1 = "Change CRISPR file"
_LB_SEL_FILE1 = "Select CRISPR file"
_LB_CNG_FILE2 = "Change Model file"
_LB_SEL_FILE2 = "Select Model file"

def file_with_ext(path, extension='.csv'):
    """
    Collect the names of the files with a given extension found under a directory tree.

    :param path: Root directory that is walked recursively.
    :param extension: Suffix a file name must end with (default is '.csv').

    :return: List of matching file names (base names only, without their directory).
    """
    return [
        name
        for _root, _subdirs, names in os.walk(path)
        for name in names
        if name.endswith(extension)
    ]

def match_item(item: str, filter_pattern: Sequence[str]) -> bool:
    """
    Tell whether a string matches at least one fnmatch pattern (case-insensitively).

    :param item: String to test.
    :param filter_pattern: A single pattern or a sequence of patterns.

    :return: True if any pattern matches, False otherwise (also for an empty sequence).
    """
    patterns = [filter_pattern] if isinstance(filter_pattern, str) else filter_pattern
    # any() stops at the first matching pattern.
    return any(fnmatch.fnmatch(item.lower(), pattern.lower()) for pattern in patterns)

def cstr(s, color='black'):
    """
    Wrap a value in a ``<text>`` tag carrying an inline colour style.

    :param s: Value to render.
    :param color: Colour placed in the style attribute (default is 'black').

    :return: The HTML fragment as a string.
    """
    return f"<text style=color:{color}>{s}</text>"

def print_color(t):
    """
    Display a sequence of coloured text fragments as a single HTML line.

    :param t: Iterable of (text, color) pairs; the rendered fragments are joined by a space.
    """
    fragments = []
    for text, colour in t:
        fragments.append(cstr(text, color=colour))
    display(html_print(' '.join(fragments)))
    
def mypipeline(path: str=os.getcwd(), savepath: str=os.getcwd(), labelpath: str=os.getcwd(), attributepath: str=os.getcwd(),
               filename: str='', modelname:str='', labelname:str='', commonlabelname:str = '',
               rows: int=5, minlines=10, percent = 100.0, 
               line_group='OncotreeLineage', line_col='ModelID', 
               verbose=False, show_progress=True):
    """
    Create a data processing pipeline.

    This function creates a data processing pipeline for handling input data, model files,
    label files, and various parameters involved in data processing. It initializes a GUI
    with widgets for user interaction and displays the processed data frames.

    :param path: Path for input file loading.
    :param savepath: Path for saving files.
    :param labelpath: Path for label files.
    :param attributepath: Path for attribute files (used by the prediction tab).
    :param filename: Name of the CRISPR effect input file.
    :param modelname: Pathname of the Model input file.
    :param labelname: Name of the label input file.
    :param commonlabelname: Name of the label file whose genes are subtracted in the intersection tab.
    :param rows: The number of rows to display in the widget for selecting tissues (default is 5).
    :param minlines: Minimum number of cell lines for tissue/lineage to be considered (default is 10).
    :param percent: Percentage of NaN allowed in genes (default is 100.0).
    :param line_group: The column in 'df_map' to use for tissue selection (default is 'OncotreeLineage').
    :param line_col: The column in 'df_map' to use for line selection (default is 'ModelID').
    :param verbose: Whether to print detailed messages (default is False).
    :param show_progress: Whether to show progress bars (default is True).

    :return: Widget containing the labeled cell lines.
    """
    tabs = wid.Tab()
    df_map = None
    df = None
    df_orig = None
    val = wid.ValueWidget()
    val.value = None, df, df_orig, df_map 
    tissue_list = []
    selector_list = []
    out01 = wid.Output()
    out02 = wid.Output()
    out1 = wid.Output()
    out2 = wid.Output()
    out3 = wid.Output()
    out4 = wid.Output()
    out6 = wid.Output()
    out70 = wid.Output()
    out71 = wid.Output()
    out72 = wid.Output()
    acd2 = wid.Accordion()
    acd6 = wid.Accordion()
    acd7 = wid.Accordion()
    # PRE-PROCESSING TAB
    nanrem_set = wid.SelectionSlider(
        options=range(0, 101),
        value=int(percent),
        description='Nan %:',
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        tooltip='set percentage of nan allowed in genes',
    )
    def nanrem_set_changed(b):
        """
        React to a change of the 'Nan %' slider.

        Recomputes the NaN-filtered dataframe from the original CRISPR dataframe stored in
        ``val.value[2]``. When the map dataframe in ``val.value[3]`` is usable, the list of
        selectable groups is refreshed and the columns are restricted to the lines of those
        groups; otherwise the NaN-filtered dataframe is stored as is.

        :param b: Change notification passed by the widget (unused).
        """
        df_orig = val.value[2]
        if df_orig is None:
            # No CRISPR file loaded yet: nothing to filter (kept silent, as before).
            return
        try:
            df = delrows_with_nan_percentage(df_orig, perc=float(nanrem_set.value))
            try:
                df_map = val.value[3]
                tissue_list = [tissue for tissue in np.unique(df_map[selselector.value].dropna().values) 
                               if len(np.intersect1d(df.columns, df_map[df_map[selselector.value] == tissue][line_col].values)) >= minline_set.value]
                seltissue.options = ['__all__'] +  tissue_list
                seltissue.value=['__all__']
                val.value = val.value[0], df[np.intersect1d(df.columns,df_map[df_map[line_group].isin(tissue_list)][line_col].values)], val.value[2], val.value[3]
            except Exception:
                # Map not loaded or selector column unusable: keep every line.
                val.value = val.value[0], df, val.value[2], val.value[3]
            with out3:
                out3.clear_output()
                print_color(((f'Removed {len(df_orig)-len(df)}/{len(df_orig)} rows (with at least {nanrem_set.value}% NaN)', 'green'),))
            Vb1.set_title(0, f"{_LB_NANREM} ({nanrem_set.value})")
        except Exception as e:
            # Report the failure instead of swallowing it silently.
            with out3:
                out3.clear_output()
                print_color(((f'Problem removing NaN rows ... {e}', 'red'),))
    nanrem_set.observe(nanrem_set_changed, names='value')

    minline_set = wid.SelectionSlider(
        options=range(1, 100),
        value=minlines,
        description='Min lines:',
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        tooltip='set minimum number of lines for the tissue',
    )
    def minline_set_changed(b):
        """
        React to a change of the 'Min lines' slider.

        The dataframes are read from the shared ``val`` state (not from the variables of the
        enclosing function, which are not updated by the file-chooser callbacks). The
        NaN-filtered dataframe is rebuilt from the original one so that lowering the threshold
        can bring back groups excluded by a previous, higher threshold.

        :param b: Change notification passed by the widget (unused).
        """
        df_orig = val.value[2]
        df_map = val.value[3]
        df = val.value[1]
        try:
            if df_orig is None or df_map is None:
                raise ValueError('CRISPR and Model files must be loaded first')
            df = delrows_with_nan_percentage(df_orig, perc=float(nanrem_set.value))
            tissue_list = [tissue for tissue in np.unique(df_map[selselector.value].dropna().values) if len(np.intersect1d(df.columns, df_map[df_map[selselector.value] == tissue][line_col].values)) >= minline_set.value]
            seltissue.options = ['__all__'] +  tissue_list
            seltissue.value=['__all__']
            val.value = val.value[0], df[np.intersect1d(df.columns, df_map[df_map[line_group].isin(tissue_list)][line_col].values)], val.value[2], val.value[3]
            Vb1.set_title(1, f"{_LB_FILTER} (Lines: {minline_set.value})")
        except Exception:
            # Keep the most recent dataframe available and tell the user.
            val.value = val.value[0], df, val.value[2], val.value[3]
            with out1:
                out1.clear_output()
                print_color(((f'Problem processing map file ...', 'red'),)) 
    minline_set.observe(minline_set_changed, names='value')
    selselector = wid.Dropdown(
        options=['__all__'] + selector_list,
        value='__all__',
        description='Selector:',
        tooltip = 'select the group type of lines',
        disabled=False,
    )
    def selselector_changed(b):
        """
        React to a change of the 'Selector' dropdown (the map column used to group lines).

        Rebuilds the list of selectable groups for the chosen column and restricts the
        dataframe columns accordingly. On success the chosen column is shown in ``out1``;
        on failure the error message is shown there and is no longer overwritten.

        :param b: Change notification passed by the widget (unused).
        """
        df = val.value[1]
        df_map = val.value[3]
        if selselector.value != ():
            try:
                tissue_list = [tissue for tissue in np.unique(df_map[selselector.value].dropna().values) if len(np.intersect1d(df.columns, df_map[df_map[selselector.value] == tissue][line_col].values)) >= minline_set.value]
                seltissue.options = ['__all__'] +  tissue_list
                seltissue.value=['__all__']
                val.value = val.value[0], df[np.intersect1d(df.columns, df_map[df_map[line_group].isin(tissue_list)][line_col].values)], val.value[2], val.value[3]
            except Exception:
                val.value = val.value[0], df, val.value[2], val.value[3]
                with out1:
                    out1.clear_output()
                    print_color(((f'Problem processing map file ...', 'red'),)) 
            else:
                # Only echo the selection when processing succeeded, so the error stays visible.
                with out1:
                    out1.clear_output()
                    display(selselector.value)
          
    selselector.observe(selselector_changed, names='value')
    seltissue = wid.SelectMultiple(
        options=['__all__'] + tissue_list if tissue_list != [] else [],
        value=['__all__'] if tissue_list != [] else [],
        rows=rows,
        description=line_group if line_group in selselector.options else '',
        tooltip = 'select lines by the chosen group',
        disabled=False
    )
    def seltissue_changed(b):
        """
        React to a change of the group multi-selection.

        Proposes a file name for saving (built from the selector column and the chosen
        groups) in the save file chooser, then echoes the selection in ``out1``.

        :param b: Change notification passed by the widget (unused).
        """
        if seltissue.value != ():
            if seltissue.value == ('__all__',):
                fname = f"{selselector.value}_all{'_mom' if selmode_button.value else ''}.csv"
            else:
                fname = f"{selselector.value}_{'_'.join([str(s).replace(' ','-').replace('/','-') for s in seltissue.value if str(s) != '__all__'])}.csv"
            fc3._filename.value = fname
            fc3._apply_selection()
        # Group values may be non-string (e.g. numeric map columns): convert before joining.
        shown = ','.join(str(s) for s in seltissue.value)
        with out1:
            out1.clear_output()
            print_color(((f"{shown}", "orange"),))
    seltissue.observe(seltissue_changed, names='value')
    # IDENTIFICATION TAB
    pbars = wid.IntProgress(
        value=0,
        min=0,
        description='Saving:',
        bar_style='', # 'success', 'info', 'warning', 'danger' or ''
        style={'bar_color': 'violet'},
        orientation='horizontal'
    )
    pbars.layout.display = 'none'
    fc3 = FileChooser(savepath, title='Choose file', filter_pattern='*.csv', layout=wid.Layout(width='auto'))
    def fc3_change_title(fc3):
        """
        Callback of the save file chooser: update its label and the 'Saving' section title.

        The check is made on the path selected in this chooser (the original code tested the
        Model file chooser by mistake, and failed when that one had no selection).

        :param fc3: The file chooser that triggered the callback.
        """
        selected = fc3.selected
        if selected is not None and os.path.isfile(selected):
            fc3._label.value = fc3._LBL_TEMPLATE.format(f'{fc3.selected}', 'green')
            acd4.set_title(1, f"{_LB_SAVE} ({fc3.selected_filename})")
        else:
            fc3._label.value = fc3._LBL_TEMPLATE.format(f'{fc3.selected} not a file', 'red')
            acd4.set_title(1, f"{_LB_SAVE}")
    fc3.register_callback(fc3_change_title)
    saveto_but = wid.Button(description="Save ...", button_style='primary')
    def on_savebutton_clicked(b):
        """
        Handler of the 'Save ...' button: write the label dataframe to the chosen file.

        The chooser label is orange while saving, green on success and red on failure.
        If no label dataframe is available (``val.value[0]`` is not a DataFrame) a message
        is printed in ``out4``.

        :param b: The clicked button (unused).
        """
        if isinstance(val.value[0], pd.DataFrame):
            try:
                fc3._label.value = fc3._LBL_TEMPLATE.format(f'{fc3.selected}', 'orange')
                with out4:
                    out4.clear_output()
                    pandas_writecsv(fc3.selected, val.value[0], index=True)
                fc3._label.value = fc3._LBL_TEMPLATE.format(f'{fc3.selected}', 'green')
            except Exception as e:
                # A failure must not look like a success: use red, not green.
                fc3._label.value = fc3._LBL_TEMPLATE.format(f'Problem saving {fc3.selected}... {e}!', 'red')
        else:
            with out4:
                out4.clear_output()
                print_color(((f'Label dataframe is null (apply labelling before saving)!', 'red'),))
    saveto_but.on_click(on_savebutton_clicked)

    mode_buttons = wid.RadioButtons(
        options=["E|NE", "E|aE|sNE", "E|(aE|sNE)"],
        value='E|NE',
        description='',
        tooltips=['2 classes (one division)', '3 classes (one division)', '3 classes (two-times subdivision)'],
    )
    selmode_button = wid.Checkbox(
        value=False,
        description='Nested',
        disabled=False,
        indent=False
    )
    button = wid.Button(description=_LB_APPLY, button_style='primary')
    def on_button_clicked(b):
        """
        Handler of the 'Apply on' button: label the genes on the selected cell lines.

        The result of ``labelling`` is stored in ``val.value[0]``. 'DONE' is reported only
        when labelling actually succeeded; failures are reported in red instead of being
        hidden by the output widgets (which capture exceptions raised inside their context).

        :param b: The clicked button (unused).
        """
        df = val.value[1]
        df_map = val.value[3]
        if df is None or df_map is None:
            with out1:
                out1.clear_output()
                print_color((('Load the CRISPR and Model files before labelling!', 'red'),))
            return
        # Group values may be non-string: convert before joining.
        shown = ",".join(str(s) for s in seltissue.value)
        with out1:
            out1.clear_output()
            print_color(((f'Labelling {len(df)} genes of {shown} ...', 'orange'),))
        succeeded = False
        with out2:
            out2.clear_output()
            try:
                # '__all__' stands for every listed group.
                if seltissue.value == ('__all__',):
                    selector = [x for x in seltissue.options if x != '__all__']
                else:
                    selector = [x for x in seltissue.value if x != '__all__']
                cell_lines = select_cell_lines(df, df_map, selector, line_group=line_group, line_col=line_col, 
                                                nested = selmode_button.value, verbose=verbose)
                if mode_buttons.value == "E|(aE|sNE)":
                    mode = 'two-by-two' 
                    nclasses = 3
                    labelnames = {0: 'E', 1: 'aE', 2: 'sNE'}
                else:
                    mode = 'flat-multi' 
                    if mode_buttons.value == "E|NE":
                        nclasses = 2
                        labelnames = {0: 'E', 1: 'NE'}
                    else:
                        nclasses = 3
                        labelnames = {0: 'E', 1: 'aE', 2: 'sNE'}
                val.value = labelling(df, columns=cell_lines, mode=mode, n_classes=nclasses, labelnames=labelnames, verbose=verbose, show_progress=show_progress), df, val.value[2], val.value[3]
                succeeded = True
            except Exception as e:
                print_color(((f'Problem during labelling ... {e}', 'red'),))
        with out1:
            out1.clear_output()
            if succeeded:
                print_color(((_LB_DONE, 'green'),))
            else:
                print_color((('Labelling failed!', 'red'),))

    button.on_click(on_button_clicked)
    # INPUT TAB
    pbar1 = wid.IntProgress(
        value=0,
        min=0,
        description='Loading:',
        bar_style='', # 'success', 'info', 'warning', 'danger' or ''
        style={'bar_color': 'maroon'},
        orientation='horizontal'
    )
    pbar2 = wid.IntProgress(
        value=0,
        min=0,
        description='Loading:',
        bar_style='', # 'success', 'info', 'warning', 'danger' or ''
        style={'bar_color': 'purple'},
        orientation='horizontal',
    )
    pbar1.layout.display = 'none'
    pbar2.layout.display = 'none'
    if os.path.isfile(filename):
        fc1 = FileChooser(os.path.dirname(os.path.abspath(filename)), filter_pattern='*.csv', filename=os.path.basename(filename), select_default=True, layout=wid.Layout(width='auto'))
        acd2.children = (wid.HBox([fc1,out01]),)
        try:
            df_orig = pd.read_csv(fc1.selected).rename(columns={'Unnamed: 0': 'gene'}).rename(columns=lambda x: x.split(' ')[0]).set_index('gene').T
            df = delrows_with_nan_percentage(df_orig, perc=float(nanrem_set.value))
            df_map = val.value[3]
            val.value = val.value[0],df, df_orig, val.value[3]
            with out3:
                out3.clear_output()
                print_color(((f'Removed {len(df_orig)-len(df)}/{len(df_orig)} rows (with at least {nanrem_set.value}% NaN)', 'green'),))
            if df_map is not None and len(np.unique(df_map[line_group].dropna().values)) > 0:
                try:
                    tissue_list = [tissue for tissue in np.unique(df_map[selselector.value].dropna().values) if len(np.intersect1d(df.columns, df_map[df_map[selselector.value] == tissue][line_col].values)) >= minline_set.value]
                    seltissue.options = ['__all__'] + tissue_list
                    seltissue.value=['__all__']
                    val.value = val.value[0], df[np.intersect1d(df.columns, df_map[df_map[line_group].isin(tissue_list)][line_col].values)], val.value[2], val.value[3]
                except:
                    val.value = val.value[0], df, val.value[2], val.value[3]
                    fc1._label.value = fc1._LBL_TEMPLATE.format(f'Problem reading {fc1.selected} file ...', 'red') 
            fc1._label.value = fc1._LBL_TEMPLATE.format(f'{fc1.selected}', 'green')
            acd2.set_title(0, f"{_LB_CNG_FILE1} ({fc1.selected_filename})")
        except Exception as e:
            fc1._label.value = fc1._LBL_TEMPLATE.format(f'Problem loading {fc1.selected} file ...{e}', 'red') 
            acd2.set_title(0, f"{_LB_SEL_FILE1}")
    else:
        if os.path.isdir(path):
            fc1 = FileChooser(path, filter_pattern='*.csv', layout=wid.Layout(width='auto'))
        else:
            fc1 = FileChooser(filter_pattern='*.csv', layout=wid.Layout(width='auto'))
        acd2.children = (wid.HBox([fc1,out01]),)
        acd2.set_title(0, f"{_LB_SEL_FILE1}")
    
    def fc1_change_title(fc1):
        """
        Callback of the CRISPR file chooser: load the selected file and refresh the state.

        The file is read, transposed with genes as index source, NaN-filtered with the current
        slider value and stored in ``val.value``. If a map dataframe is already loaded, the
        group list is rebuilt and the columns restricted to the lines of the listed groups.
        The chooser label is green on success and stays red when a step failed.

        :param fc1: The file chooser that triggered the callback.
        """
        try:
            with out01:
                out01.clear_output()
                df_orig = pandas_readcsv(fc1.selected)
            df_orig = df_orig.rename(columns={'Unnamed: 0': 'gene'}).rename(columns=lambda x: x.split(' ')[0]).set_index('gene').T
            df = delrows_with_nan_percentage(df_orig, perc=float(nanrem_set.value))
            df_map = val.value[3]
            val.value = val.value[0],df ,df_orig, val.value[3]
            with out3:
                out3.clear_output()
                print_color(((f'Removed {len(df_orig)-len(df)}/{len(df_orig)} rows (with at least {nanrem_set.value}% NaN)', 'green'),))
            filter_failed = False
            if df_map is not None and len(np.unique(df_map[line_group].dropna().values)) > 0:
                try:
                    tissue_list = [tissue for tissue in np.unique(df_map[selselector.value].dropna().values) if len(np.intersect1d(df.columns, df_map[df_map[selselector.value] == tissue][line_col].values)) >= minline_set.value]
                    seltissue.options = ['__all__'] + tissue_list
                    seltissue.value=['__all__']
                    val.value = val.value[0], df[np.intersect1d(df.columns, df_map[df_map[line_group].isin(tissue_list)][line_col].values)], val.value[2], val.value[3]
                except Exception:
                    val.value = val.value[0], df, val.value[2], val.value[3]
                    filter_failed = True
            # Set the label once, so that an error is not overwritten by the success label.
            if filter_failed:
                fc1._label.value = fc1._LBL_TEMPLATE.format(f'Problem reading {fc1.selected} file ...', 'red') 
            else:
                fc1._label.value = fc1._LBL_TEMPLATE.format(f'{fc1.selected}', 'green')
            acd2.set_title(0, f"{_LB_CNG_FILE1} ({fc1.selected_filename})")
        except Exception as e:
            fc1._label.value = fc1._LBL_TEMPLATE.format(f'Problem loading {fc1.selected} file ... {e}', 'red')
            acd2.set_title(0, f"{_LB_SEL_FILE1}")

    fc1.register_callback(fc1_change_title)
    # Model file chooser: when a valid model file name is given, it is pre-selected and
    # loaded immediately; otherwise an empty chooser is created.
    if os.path.isfile(modelname):
        fc2 = FileChooser(os.path.dirname(os.path.abspath(modelname)), filter_pattern='*.csv', filename=os.path.basename(modelname), select_default=True, layout=wid.Layout(width='auto'))
        acd2.children += (wid.HBox([fc2,out02]),)
        try:
            df_map = pd.read_csv(fc2.selected)
            df = val.value[1]
            val.value = val.value[0], val.value[1] ,val.value[2], df_map
            selselector.options = list(df_map.columns)
            selselector.value = line_group if line_group in selselector.options else selselector.options[0]
            fc2_problem = None
            # Without a CRISPR dataframe there are no columns to filter yet.
            if df is not None and len(np.unique(df_map[selselector.value].dropna().values)) > 0:
                try:
                    tissue_list = [tissue for tissue in np.unique(df_map[selselector.value].dropna().values) if len(np.intersect1d(df.columns, df_map[df_map[selselector.value] == tissue][line_col].values)) >= minline_set.value]
                    seltissue.options = ['__all__'] + tissue_list
                    seltissue.value=['__all__']
                    val.value = val.value[0], df[np.intersect1d(df.columns, df_map[df_map[line_group].isin(tissue_list)][line_col].values)], val.value[2], val.value[3]
                except Exception as e:
                    val.value = val.value[0], df, val.value[2], val.value[3]
                    fc2_problem = e
            # Set the label once, so that an error is not overwritten by the success label.
            if fc2_problem is None:
                fc2._label.value = fc2._LBL_TEMPLATE.format(f'{fc2.selected}', 'green')
            else:
                fc2._label.value = fc2._LBL_TEMPLATE.format(f'Problem reading {fc2.selected} file ...{fc2_problem}', 'red') 
            acd2.set_title(1, f"{_LB_CNG_FILE2} ({fc2.selected_filename})")
        except Exception as e:
            fc2._label.value = fc2._LBL_TEMPLATE.format(f'Problem loading {fc2.selected} file ...{e}', 'red') 
            acd2.set_title(1, f"{_LB_SEL_FILE2}")
    else:
        if os.path.isdir(path):
            fc2 = FileChooser(path, filter_pattern='*.csv', layout=wid.Layout(width='auto'))
        else:
            fc2 = FileChooser(filter_pattern='*.csv', layout=wid.Layout(width='auto'))
        acd2.children += (wid.HBox([fc2,out02]),)
        acd2.set_title(1, f"{_LB_SEL_FILE2}")
        
    def fc2_change_title(fc2):
        """
        Callback of the Model file chooser: load the selected map file and refresh the state.

        The map dataframe is stored in ``val.value[3]`` and its columns become the options of
        the selector dropdown. If a CRISPR dataframe is already loaded, the group list is
        rebuilt and the columns restricted to the lines of the listed groups. The chooser
        label is green on success and stays red when a step failed.

        :param fc2: The file chooser that triggered the callback.
        """
        try:
            with out02:
                out02.clear_output()
                df_map = pandas_readcsv(fc2.selected)
            df = val.value[1]
            val.value = val.value[0], val.value[1] ,val.value[2], df_map
            selselector.options = list(df_map.columns)
            selselector.value = line_group if line_group in selselector.options else selselector.options[0]
            problem = None
            # Without a CRISPR dataframe there are no columns to filter yet.
            if df is not None and len(np.unique(df_map[selselector.value].dropna().values)) > 0:
                try:
                    tissue_list = [tissue for tissue in np.unique(df_map[selselector.value].dropna().values) if len(np.intersect1d(df.columns, df_map[df_map[selselector.value] == tissue][line_col].values)) >= minline_set.value]
                    seltissue.options = ['__all__'] + tissue_list
                    seltissue.value=['__all__']
                    val.value = val.value[0], df[np.intersect1d(df.columns, df_map[df_map[line_group].isin(tissue_list)][line_col].values)], val.value[2], val.value[3]
                except Exception as e:
                    val.value = val.value[0], df, val.value[2], val.value[3]
                    problem = e
            # Set the label once, so that an error is not overwritten by the success label.
            if problem is None:
                fc2._label.value = fc2._LBL_TEMPLATE.format(f'{fc2.selected}', 'green')
            else:
                fc2._label.value = fc2._LBL_TEMPLATE.format(f'Problem reading {fc2.selected} file ...{problem}', 'red') 
            acd2.set_title(1, f"{_LB_CNG_FILE2} ({fc2.selected_filename})")
        except Exception as e:
            fc2._label.value = fc2._LBL_TEMPLATE.format(f'Problem loading {fc2.selected} file ...{e}', 'red') 
            acd2.set_title(1, f"{_LB_SEL_FILE2}")
    fc2.register_callback(fc2_change_title)
    # INTERSECTION TAB
    if os.path.isdir(labelpath):
        fc4 = FileCollector(labelpath, default_path=labelpath, filter_pattern='*.csv')
        acd6.children = (fc4,)
        acd6.set_title(0, f"{_LB_CNGGENE} ({os.path.basename(labelpath)})")
    else:
        fc4 = FileCollector(filter_pattern='*.csv')
        acd6.children = (fc4,)
        acd6.set_title(0, f"{_LB_SELGENE}")
    def fc4_change_title(fc4):
        """
        Callback of the gene-file collector: update the title of its accordion section.

        :param fc4: The file collector that triggered the callback.
        """
        if fc4.selected == ():
            # Nothing selected: fall back to the "select" caption.
            title = f"{_LB_SELGENE}"
        else:
            title = f"{_LB_CNGGENE} ({os.path.basename(fc4.selected_path)})"
        acd6.set_title(0, title)
    fc4.register_callback(fc4_change_title)

    if os.path.isfile(commonlabelname):
        fc5 = FileChooser(os.path.dirname(os.path.abspath(commonlabelname)), filter_pattern='*.csv', 
                          filename=os.path.basename(commonlabelname), default_path=os.path.dirname(os.path.abspath(commonlabelname)), layout=wid.Layout(width='auto'))
        fc5._filename.value = os.path.basename(commonlabelname)
        fc5._apply_selection()
        acd6.children += (fc5,)
        acd6.set_title(1, f"{_LB_CNGGENE_SUB} ({os.path.basename(fc5.selected)})")
    else:
        fc5 = FileChooser(filter_pattern='*.csv', layout=wid.Layout(width='auto'))
        acd6.children += (fc5,)
        acd6.set_title(1, f"{_LB_SELGENE_SUB}")

    def fc5_change_title(fc5):
        """
        Callback of the chooser of the genes to subtract: update its label and section title.

        The selection is accepted (green) only when it is an existing file matching '*.csv'.

        :param fc5: The file chooser that triggered the callback.
        """
        accepted = os.path.isfile(fc5.selected) and match_item(fc5.selected, '*.csv')
        if accepted:
            colour = 'green'
            title = f"{_LB_CNGGENE_SUB} ({os.path.basename(fc5.selected)})"
        else:
            colour = 'red'
            title = f"{_LB_SELGENE_SUB}"
        fc5._label.value = fc5._LBL_TEMPLATE.format(f'{fc5.selected}', colour)
        acd6.set_title(1, title)
    fc5.register_callback(fc5_change_title)

    setbut = wid.Button(description="Intersect ...", button_style='primary')
    def on_setbut_clicked(b):
        """
        Handler of the 'Intersect ...' button: draw the intersection diagram of the EG sets.

        For every selected label file the genes labelled 'E' are collected; when a valid
        file of genes to subtract is selected, its 'E' genes are removed from each set.
        The sets are then passed to ``svenn_intesect`` and the figure is shown in ``out6``.

        :param b: The clicked button (unused).
        """
        if fc4.selected == ():
            with out6:
                out6.clear_output()
                print_color(((f'No file selected!', 'orange'),))
        else:
            try:
                # Read the genes to subtract once, from the same path that is validated.
                cEG = None
                if fc5.selected is not None and os.path.isfile(fc5.selected):
                    df_common = pd.read_csv(fc5.selected, index_col=0)
                    cEG = df_common[df_common['label']=='E'].index.values
                csEGs = []
                for f in fc4.selected:
                    dfl = pd.read_csv(f, index_col=0)
                    csEG = dfl[dfl['label'] == 'E'].index.values
                    if cEG is not None:
                        csEG = np.setdiff1d(csEG, cEG)
                    csEGs += [set(csEG)]
                with out6:
                    out6.clear_output()
                    fig1, axes1 = svenn_intesect(csEGs, labels=[os.path.basename(x).split('.')[0] for x in fc4.selected], ylabel='EGs', figsize=(10,4))
                    plt.show(fig1)
            except Exception as e:
                with out6:
                    out6.clear_output()
                    print_color(((f'Problem processing label files!', 'red'),))
                    print_color(((f'{e}', 'black'),))

    setbut.on_click(on_setbut_clicked)
    # PREDICTION TAB
    if os.path.isdir(attributepath):
        fc6 = FileCollector(attributepath, default_path=attributepath, filter_pattern='*.csv', layout=wid.Layout(width='auto'))
        acd7.children += (fc6,)
        acd7.set_title(0, f"{_LB_CNGATTR} ({os.path.basename(attributepath)})")
    else:
        fc6 = FileCollector(filter_pattern='*.csv', layout=wid.Layout(width='auto'))
        acd7.children += (fc6,)
        acd7.set_title(0, f"{_LB_SELATTR}")

    def fc6_change_title(fc4):
        """
        Callback of the attribute-file collector: update the title of its accordion section.

        :param fc4: Object passed by the collector when the callback fires; it is not used,
                    the body reads the enclosing ``fc6`` collector directly.
        """
        if fc6.selected == ():
            # Nothing selected: fall back to the "select" caption.
            title = f"{_LB_SELATTR}"
        else:
            title = f"{_LB_CNGATTR} ({os.path.basename(fc6.selected_path)})"
        acd7.set_title(0, title)
    fc6.register_callback(fc6_change_title)

    if os.path.isfile(labelname):
        fc7 = FileChooser(os.path.dirname(os.path.abspath(labelname)), filename=os.path.basename(labelname), select_default=True, layout=wid.Layout(width='auto'))
        acd7.children += (wid.HBox([fc7,out71]),)
        try:
            fc7._label.value = fc7._LBL_TEMPLATE.format(f'{fc7.selected}', 'green')
            acd7.set_title(1, f"{_LB_CNG_LAB} ({os.path.basename(fc7.selected)})")
        except:
            fc7._label.value = fc7._LBL_TEMPLATE.format(f'Problem loading {fc7.selected} file ...', 'red') 
            acd7.set_title(1, f"{_LB_SEL_LAB}")
    else:
        if os.path.isdir(labelpath):
            fc7 = FileChooser(labelpath, filter_pattern='*.csv', layout=wid.Layout(width='auto'))
        else:
            fc7 = FileChooser(filter_pattern='*.csv', layout=wid.Layout(width='auto'))
        acd7.children += (wid.HBox([fc7,out71]),)
        acd7.set_title(1, f"{_LB_SEL_LAB}")

    def fc7_change_title(fc7):
        """
        Callback of the labelling-file chooser: update its label and section title.

        The label is green when the selection is an existing file, red otherwise.

        :param fc7: The file chooser that triggered the callback.
        """
        if os.path.isfile(fc7.selected):
            colour = 'green'
            title = f"{_LB_CNG_LAB} ({os.path.basename(fc7.selected)})"
        else:
            colour = 'red'
            title = f"{_LB_SEL_LAB}"
        fc7._label.value = fc7._LBL_TEMPLATE.format(f'{fc7.selected}', colour)
        acd7.set_title(1, title)

    fc7.register_callback(fc7_change_title)
    pbarlv = wid.IntProgress(
        value=0,
        min=0,
        description='Loading:',
        bar_style='success', # 'success', 'info', 'warning', 'danger' or ''
        style={'bar_color': 'yellow'},
        orientation='horizontal',
    )
    pbarv = wid.IntProgress(
        value=0,
        min=0,
        description='Validating:',
        bar_style='success', # 'success', 'info', 'warning', 'danger' or ''
        style={'bar_color': 'green'},
        orientation='horizontal'
    )
    valbut = wid.Button(description="Validate ...", button_style='primary')
    def on_valbut_clicked(b):
        """
        Handler of the 'Validate ...' button: load labels and attributes, then validate a model.

        Steps (each one reports its status in ``out70``):
          1. read the label file selected in ``fc7`` and merge labels 'aE' and 'sNE' into 'NE';
          2. load the attribute files selected in ``fc6`` with ``load_features``;
          3. keep the rows common to labels and attributes and run ``k_fold_cv`` with a
             ``VotingSplitClassifier``; the resulting score table is displayed.
        If no attribute file is selected only a warning is printed.

        :param b: The clicked button (unused).
        """
        if fc6.selected == ():
            with out70:
                out70.clear_output()
                print_color(((f'No attribute file!', 'orange'),))
        else:
            # Outer level: problems with the label file.
            try:
                with out70:
                    out70.clear_output()
                    print_color(((f'Loading label file ...', 'orange'),))
                with out71:
                    out71.clear_output()
                    df_y = pandas_readcsv(fc7.selected, index_col=0, descr=os.path.basename(fc7.selected))
                    # Collapse the labels to two classes (E vs NE).
                    df_y = df_y.replace({'aE': 'NE', 'sNE': 'NE'})  # TODO: to be improved
                # Middle level: problems loading/assembling the attribute files.
                try:
                    with out70:
                        out70.clear_output()
                        print_color(((f'Loading attributes ...', 'orange'),))
                    with out72:
                        out72.clear_output()
                        df_X = load_features(list(fc6.selected), fixna=True, normalize='std', verbose=verbose, show_progress=show_progress)
                    # Inner level: problems during validation.
                    try:
                        with out70:
                            out70.clear_output()
                            print_color(((f'Validating model ...', 'orange'),))
                        with out72:
                            out72.clear_output()
                            # Align labels and attributes on the index values they share.
                            idx_common = np.intersect1d(df_y.index.values, df_X.index.values)
                            df_X = df_X.loc[idx_common]
                            df_y = df_y.loc[idx_common]
                            # NOTE(review): random_state=-1 is passed as is; its meaning depends on VotingSplitClassifier -- confirm.
                            clf = VotingSplitClassifier(n_voters=10, n_jobs=-1, random_state=-1)
                            df_scores, scores, predictions = k_fold_cv(df_X, df_y, clf, n_splits=5, seed=0, verbose=verbose, show_progress=show_progress)
                            #df_scores, scores, predictions = ipy_k_fold_cv(df_X, df_y, clf, progressbar=pbarv, n_splits=5, seed=0)
                            #pbarv.layout.display = None
                        with out70:
                            out70.clear_output()
                            print_color(((_LB_DONE, 'green'),))
                            display(df_scores)
                    except Exception as e:
                        with out70:
                            out70.clear_output()
                            print_color(((f'Problem in validation!', 'red'),))
                            print_color(((f'{traceback.format_exc()}', 'black'),))
                            print_color(((f'{e}', 'black'),))
                except Exception as e:
                    with out70:
                        out70.clear_output()
                        print_color(((f'Problem loading/assembling attributes files!', 'red'),))
                        print_color(((f'{traceback.format_exc()}', 'black'),))
                        print_color(((f'{e}', 'black'),))
            except Exception as e:
                with out70:
                    out70.clear_output()
                    print_color(((f'Problem processing label files!', 'red'),))
                    print_color(((f'{e}', 'black'),))

    valbut.on_click(on_valbut_clicked)

    # MAIN WIDGET GUI    
    txt1 = wid.HTMLMath(
        value=r"""In this section you filter the CRIPR score lines by:
                <ol>
                  <li>removing genes with a certain percentage of missing cell line scores;</li>
                  <li>select the type of cell lines grouping (by tissue, by disease, etc.) and</li>
                  <li>filter the grous with a minimum amount of lines;</li>
                  <li>select a specific set of groups from which to extract cell line score.</li>
                </ol>""",
    )
    txt2 = wid.HTMLMath(
        value=r"""In this section you select: 
                <ol>
                    <li>the CRIPR effect file contanin cell lines scores, and
                    <li>the Model file mapping cell line names to tissues/diseases,etc. 
                </ol>
                NOTE: the selected file is loaded when the file path appears in green text.""",
    )
    txt6 = wid.HTMLMath(
        value=r"""In this section you can intersect contet specific EGs from different tissues/diseases:
                <ol>
                  <li>select the directory where are the label files of context-specific genes;</li>
                  <li>select the an option label file representing EGs you want to exclude from intersection</li>
                  <li>apply intersection an display the resulting Super Venn diagram.</li>
                </ol>""",
    )
    txt4 = wid.HTMLMath(
        value=r"""In this section you can compute the labelling of a set of gene by setting dome parameter: 
                <ol>
                  <li>the type of labelling: binary (E|NE), ternary (E|aE|sNE), or binary amd then binary in the second class (E|(aE|sNE));</li>
                  <li>the separation algorithm (Otsu is the default)</li>
                </ol>
                The labelling results can be saved in a CSV file for future use.<p>
                NOTE: the labelling process is complete once you see a green-colored "DONE".""",
    )
    txt7 = wid.HTMLMath(
        value=r"""In this section you can make prediction with model trained on labelling files: 
                  <ol>
                  <li>In the first widget you can select the files used as feature inputs for the
                  builfding model.</li>
                  <li>
                  In the second widget you can select the label file used for training the model</li>
                  </ol>
                  NOTE: the labelling process is complete once you see a green-colored "DONE"."""
    )
    Vb2 = wid.VBox([txt2, acd2])
    acd1 = wid.Accordion(children=[wid.VBox([nanrem_set, out3]), wid.VBox([minline_set, selselector, wid.HBox([seltissue, selmode_button])])])
    acd1.set_title(0, f"{_LB_NANREM} ({percent}%)")
    acd1.set_title(1, f"{_LB_FILTER} (Lines: {minline_set.value})")
    Vb1 = wid.VBox([txt1, acd1])
    acd4 = wid.Accordion(children=[wid.VBox([wid.HBox([mode_buttons, button, out1]), out2]), wid.VBox([wid.HBox([fc3, saveto_but]), out4])])
    acd4.set_title(0, f"{_LB_LABEL} ({mode_buttons.value})")
    acd4.set_title(1, f"{_LB_SAVE}") if fc3.selected_filename == "" else acd4.set_title(1, f"{_LB_SAVE} ({fc3.selected_filename})")
    Vb4 = wid.VBox([txt4, acd4])
    Vb6 = wid.VBox([txt6, wid.HBox([acd6, wid.VBox([setbut, out6])])])
    Vb7 = wid.VBox([txt7, wid.HBox([acd7,wid.VBox([wid.HBox([valbut, out70]), out72])])])
    tabs.children = [Vb2, Vb1, Vb4, Vb6, Vb7]
    tabs.set_title(0, f'{_LB_INPUT}')
    tabs.set_title(1, f'{_LB_PREPROC}')
    tabs.set_title(2, f'{_LB_IDENTIFY}')
    tabs.set_title(3, f'{_LB_INTERSET}')
    tabs.set_title(4, f'{_LB_PREDICT}')
    display(tabs)
    return val

Note that the CCcfs file has been split into 5 separate files because of storage limitations on GitHub.

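### 3. Organize your pipeline with a widget
In this cell you launch a user-interface widget that drives the pipeline steps through its tabs: input selection, preprocessing, labelling, intersection of context-specific genes, and prediction.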

In [2]:
# Launch the pipeline widget GUI on the local data folders and keep the returned
# handle in `w`; later cells read the results through `w.value`.
# NOTE(review): `pipeline` is imported from HELPpy but the call below uses `mypipeline`,
# presumably the notebook-local variant defined in an earlier cell -- confirm which one is intended.
from HELPpy.visualization.ui import pipeline
w = mypipeline(path='../../data', savepath='../../newdata', labelpath='../../data', attributepath='../../data',
             filename="../../data/CRISPRGeneEffect.csv", 
             modelname="../../data/Model.csv",
             labelname='../../data/Lung_HELP.csv', 
             commonlabelname='../../data/PanTissue.csv', 
             line_group='OncotreeLineage', verbose=False, show_progress=True, percent=80)

Tab(children=(VBox(children=(HTMLMath(value='In this section you select: \n                <ol>\n             …

In [13]:
# Show the rows of the feature matrix whose index label also appears in the label frame.
# NOTE(review): relies on `df_X` and `df_y` already existing in the kernel; in this file
# they are only assigned in a cell further down -- confirm the intended cell order.
df_X.loc[df_X.index.isin(df_y.index.values)]

Unnamed: 0_level_0,Gene length,Transcripts count,GC content,GTEX_kidney,Gene-Disease association,OncoDB_expression,HPA_kidney,GO-MF,GO-BP,GO-CC,...,GO.0098794,GO.0098793,GO.0098799,GO.0098791,GO.0001931,GO.0015030,GO.0098978,GO.0005579,GO.0008541,GO.0098798
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.003351,0.020942,0.501832,2.044542e-05,0.002950,0.651558,0.000002,0.082619,0.040702,0.115385,...,0.005490,0.002220,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.005695
A1CF,0.034865,0.047120,0.160530,1.980884e-05,0.023356,0.556939,0.000232,0.069767,0.041026,0.096154,...,0.000261,0.000184,0.0,0.001010,0.000000,0.165360,0.004055,0.000000,0.0,0.000000
A2M,0.019624,0.062827,0.176932,3.377232e-03,0.073746,0.584540,0.005382,0.302326,0.056410,0.076923,...,0.039004,0.018676,0.0,0.007586,0.000000,0.001357,0.036475,0.058456,0.0,0.000237
A2ML1,0.026017,0.041885,0.299948,5.123403e-07,0.017699,0.651558,0.000000,0.069767,0.005128,0.038462,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
A3GALT2,0.005784,0.000000,0.473739,1.421472e-06,0.023356,0.663540,0.000000,0.069767,0.015385,0.057692,...,0.000000,0.000000,0.0,0.026899,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,0.021209,0.010471,0.288257,7.073108e-06,0.023356,0.634761,0.000055,0.082619,0.040702,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
ZYG11B,0.040775,0.005236,0.248648,7.271294e-05,0.023356,0.646090,0.000238,0.000000,0.005128,0.000000,...,0.002883,0.003607,0.0,0.033041,0.000000,0.028038,0.003279,0.000000,0.0,0.000000
ZYX,0.003958,0.047120,0.539522,8.282866e-04,0.023356,0.672638,0.000177,0.046512,0.035897,0.153846,...,0.039631,0.011731,0.0,0.019659,0.087284,0.020391,0.052937,0.000000,0.0,0.000000
ZZEF1,0.056017,0.052356,0.304484,9.626291e-05,0.023356,0.651558,0.000121,0.093023,0.040702,0.078727,...,0.000742,0.002739,0.0,0.004421,0.000000,0.017462,0.001196,0.000000,0.0,0.012408


In [18]:
# Display the feature matrix currently held in the kernel (scratch inspection cell).
df_X

Unnamed: 0_level_0,Gene length,Transcripts count,GC content,GTEX_kidney,Gene-Disease association,OncoDB_expression,HPA_kidney,GO-MF,GO-BP,GO-CC,...,GO.0098794,GO.0098793,GO.0098799,GO.0098791,GO.0001931,GO.0015030,GO.0098978,GO.0005579,GO.0008541,GO.0098798
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.003351,0.020942,0.501832,2.044542e-05,0.002950,0.651558,0.000002,0.082619,0.040702,0.115385,...,0.005490,0.002220,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.005695
A1CF,0.034865,0.047120,0.160530,1.980884e-05,0.023356,0.556939,0.000232,0.069767,0.041026,0.096154,...,0.000261,0.000184,0.0,0.001010,0.000000,0.165360,0.004055,0.000000,0.0,0.000000
A2M,0.019624,0.062827,0.176932,3.377232e-03,0.073746,0.584540,0.005382,0.302326,0.056410,0.076923,...,0.039004,0.018676,0.0,0.007586,0.000000,0.001357,0.036475,0.058456,0.0,0.000237
A2ML1,0.026017,0.041885,0.299948,5.123403e-07,0.017699,0.651558,0.000000,0.069767,0.005128,0.038462,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
A3GALT2,0.005784,0.000000,0.473739,1.421472e-06,0.023356,0.663540,0.000000,0.069767,0.015385,0.057692,...,0.000000,0.000000,0.0,0.026899,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,0.021209,0.010471,0.288257,7.073108e-06,0.023356,0.634761,0.000055,0.082619,0.040702,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
ZYG11B,0.040775,0.005236,0.248648,7.271294e-05,0.023356,0.646090,0.000238,0.000000,0.005128,0.000000,...,0.002883,0.003607,0.0,0.033041,0.000000,0.028038,0.003279,0.000000,0.0,0.000000
ZYX,0.003958,0.047120,0.539522,8.282866e-04,0.023356,0.672638,0.000177,0.046512,0.035897,0.153846,...,0.039631,0.011731,0.0,0.019659,0.087284,0.020391,0.052937,0.000000,0.0,0.000000
ZZEF1,0.056017,0.052356,0.304484,9.626291e-05,0.023356,0.651558,0.000121,0.093023,0.040702,0.078727,...,0.000742,0.002739,0.0,0.004421,0.000000,0.017462,0.001196,0.000000,0.0,0.012408


In [19]:
# Manual re-run of the validation step performed by the GUI, outside the widget.
# NOTE(review): the paths below are absolute and machine-specific; they will not
# resolve on another machine.
attrfiles = ["/Users/maurizio/HELP/data/Kidney_BIO.csv", "/Users/maurizio/HELP/data/Kidney_CCBeder.csv"]
# Labels: first CSV column is used as the index.
df_y = pandas_readcsv("/Users/maurizio/HELP/newdata/Kidney.csv", index_col=0)
# Collapse the 'aE' and 'sNE' labels into 'NE', leaving a two-class (E vs NE) problem.
df_y = df_y.replace({'aE': 'NE', 'sNE': 'NE'})  # TODO: improve
# Features: load and assemble all attribute files, filling NaNs and rescaling.
df_X = load_features(list(attrfiles), fixna=True, normalize='std', verbose=True, show_progress=True)
# Keep only the index labels present in both the label and the feature frames,
# and align both frames on that common (sorted) index.
idx_common = np.intersect1d(df_y.index.values, df_X.index.values)
df_X = df_X.loc[idx_common]
df_y = df_y.loc[idx_common]
# 5-fold cross-validation of the voting classifier on the aligned data.
clf = VotingSplitClassifier(n_voters=10, n_jobs=-1, random_state=-1)
df_scores, scores, predictions = k_fold_cv(df_X, df_y, clf, n_splits=5, seed=0, verbose=True, show_progress=True)


  0%|          | 0/17829 [00:00<?, ?it/s]

Kidney_BIO.csv:   0%|          | 0/19293 [00:00<?, ?it/s]

[Kidney_BIO] found 52532 Nan...
[Kidney_BIO] Fixing NaNs with mean ...
[Kidney_BIO] Normalization with std ...


Kidney_CCBeder.csv:   0%|          | 0/19298 [00:00<?, ?it/s]

[Kidney_CCBeder] found 1161570 Nan...
[Kidney_CCBeder] Fixing NaNs with mean ...
[Kidney_CCBeder] Normalization with std ...
{'E': 0, 'NE': 1}
label
NE       15994
E         1242
Name: count, dtype: int64
Classification with VotingSplitClassifier...


5-fold:   0%|          | 0/5 [00:00<?, ?it/s]

In [30]:
import csv
from tqdm.notebook import tqdm
def mypandas_readcsv(filename, chunksize=50, sep=',', index_col=None, comment='#', descr:str=None, disabled=False):
    """
    Read a CSV file into a DataFrame chunk by chunk, showing a tqdm progress bar.

    :param filename: Path of the CSV file to read.
    :param int chunksize: Number of rows read per chunk. Default is 50.
    :param str sep: Field delimiter passed to ``pd.read_csv``. Default is ','.
    :param index_col: Column(s) used as index, passed to ``pd.read_csv``. Default is None.
    :param str comment: Comment character passed to ``pd.read_csv``. Default is '#'.
    :param str descr: Description shown next to the progress bar. Default is None.
    :param bool disabled: If True the progress bar is not displayed. Default is False.

    :returns: The concatenation of all chunks, with the index of each chunk preserved.
    :rtype: pd.DataFrame
    """
    # Count the rows for the progress-bar total by scanning the WHOLE file.
    # (Counting after a partial read would miss every line contained in the
    # bytes already consumed.) Blank lines and lines starting with the comment
    # character are not counted; one line is subtracted for the header row.
    # The total is only used for display, so it is an estimate.
    with open(filename, 'r') as fp:
        n_lines = sum(
            1 for line in fp
            if line.strip() and not (comment and line.startswith(comment))
        )
    total_rows = max(n_lines - 1, 0)

    # Read the file in chunks, advancing the bar by the rows actually read so
    # that the last (possibly shorter) chunk does not overshoot the total.
    frames = []
    with tqdm(total=total_rows, desc=descr, disable=disabled) as bar:
        for chunk in pd.read_csv(filename, chunksize=chunksize, index_col=index_col, comment=comment, sep=sep):
            frames.append(chunk)
            bar.update(len(chunk))
    return pd.concat(frames, ignore_index=False)

def split_dataframe(df, chunk_size = 10):
    """
    Split a DataFrame (or any sliceable sequence) into consecutive row chunks.

    :param df: Object supporting ``len()`` and slice indexing.
    :param int chunk_size: Maximum number of rows per chunk. Default is 10.

    :returns: List of slices of ``df``, each holding at most ``chunk_size`` rows.
        An empty input yields a list with a single empty chunk.
    :rtype: list

    :raises ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    # Ceiling division: a length that is an exact multiple of chunk_size must not
    # produce a trailing empty chunk. At least one chunk is always returned so
    # that an empty input still yields one (empty) slice.
    num_chunks = max(1, -(-len(df) // chunk_size))
    return [df[i * chunk_size:(i + 1) * chunk_size] for i in range(num_chunks)]

def mypandas_writecsv(filename, df: pd.DataFrame, chunksize=10, index=False, sep=',', descr:str=None, disabled=False):
    """
    Write a DataFrame to a CSV file chunk by chunk, showing a tqdm progress bar.

    :param filename: Path of the CSV file to write (overwritten if it exists).
    :param pd.DataFrame df: The DataFrame to save.
    :param int chunksize: Number of rows written per chunk. Default is 10.
    :param bool index: Whether to write the index column. Default is False.
    :param str sep: Field delimiter. Default is ','.
    :param str descr: Description shown next to the progress bar. Default is None.
    :param bool disabled: If True the progress bar is not displayed. Default is False.

    :returns: The input DataFrame, unchanged.
    :rtype: pd.DataFrame
    """
    # Drop empty slices so that nothing is appended for them; fall back to the
    # frame itself so that an empty frame still produces a header-only file.
    chunks = [chunk for chunk in split_dataframe(df, chunksize) if len(chunk) > 0] or [df]
    # The bar total is the number of chunks that are really written.
    with tqdm(total=len(chunks), desc=descr, disable=disabled) as bar:
        for i, chunk in enumerate(chunks):
            first = (i == 0)
            # The first chunk creates the file and writes the header; the others
            # are appended WITHOUT a header, otherwise the column names would be
            # repeated as data rows in the middle of the file.
            chunk.to_csv(filename, index=index, mode='w' if first else 'a', header=first, sep=sep)
            bar.update()
    return df

# Smoke test of the chunked writer: save the first element of the widget result
# to "prova.csv", keeping the index. With a chunksize this large the recorded
# output reports a single chunk.
mypandas_writecsv("prova.csv", w.value[0], index=True, chunksize=1000000, descr='Saving')
#split_dataframe(w.value[0])

1


Saving:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0_level_0,label
gene,Unnamed: 1_level_1
A1BG,NE
A1CF,NE
A2M,NE
A2ML1,NE
A3GALT2,NE
...,...
ZYG11A,NE
ZYG11B,NE
ZYX,NE
ZZEF1,NE


In [35]:
from HELPpy.utility.utils import pandas_readcsv
from sklearn.preprocessing import MinMaxScaler, StandardScaler
def load_features(filenames: List[str] = [], fixna=False, normalize=False, colname: str="label",
                  verbose: bool = False, show_progress: bool = False) -> pd.DataFrame:
    """
    Load one or more feature (attribute) CSV files and assemble them into one DataFrame.

    Each file is read with its first column as index; the index is cast to ``str``,
    NaNs are optionally replaced, the columns are optionally rescaled, and the result
    is outer-joined on the index with the features loaded so far.

    :param List[str] filenames: List of feature file paths. Default is an empty list.
    :param bool fixna: Whether to replace NaNs with the mean of their column. Default is False.
    :param normalize: ``'std'`` or ``'max'`` to rescale the columns of each file;
        any other value (default False) leaves the values untouched.
    :param str colname: Not used by this function; kept for backward compatibility.
    :param bool verbose: Whether to print verbose messages during processing. Default is False.
    :param bool show_progress: Accepted for backward compatibility; it is currently
        not forwarded to the CSV reader.

    :returns: The assembled features, one row per index label found in any file
        (an empty DataFrame when ``filenames`` is empty).
    :rtype: pd.DataFrame

    :example:

    .. code-block:: python

        X = load_features(['path/to/feature_file1.csv', 'path/to/feature_file2.csv'],
                          fixna=True, normalize='std', verbose=False)
    """

    # Accumulator for the merged features
    x = pd.DataFrame()

    # Process each feature file
    for f in filenames:
        # NOTE(review): `show_progress` is not passed on here; whether the reader
        # accepts a flag to silence its progress bar must be checked in HELPpy.
        feat_df = pandas_readcsv(f, index_col=0, descr=os.path.basename(f))
        feat_df.index = feat_df.index.map(str)
        # File name without directory and extension, used as a tag in the messages
        fname = os.path.basename(f).rsplit('.', 1)[0]

        # Handle missing values if required
        if verbose:
            cntnan = feat_df.isna().sum().sum()
            print(f"[{fname}] found {cntnan} Nan...")
        if fixna:
            if verbose:
                print(f"[{fname}] Fixing NaNs with mean ...")
            feat_df = feat_df.fillna(feat_df.mean())

        # Normalize features
        # NOTE(review): 'std' selects MinMaxScaler and 'max' selects StandardScaler,
        # which looks inverted with respect to the option names. The mapping is left
        # untouched because existing results depend on it -- confirm the intent.
        if normalize == 'std':
            scaler = MinMaxScaler()
        elif normalize == 'max':
            scaler = StandardScaler()
        else:
            scaler = None

        if scaler is not None:
            if verbose:
                # The 'max' branch used to reference an undefined name here and
                # raised NameError whenever verbose was enabled.
                print(f"[{fname}] Normalization with {normalize} ...")
            feat_df = pd.DataFrame(scaler.fit_transform(feat_df), index=feat_df.index, columns=feat_df.columns)
        elif verbose:
            print(f"[{fname}] No normalization...")

        # Merge the new features with those loaded so far (outer join on the index)
        x = pd.merge(x, feat_df, left_index=True, right_index=True, how='outer')

    # Return the assembled features
    return x

# Scratch check of the helpers used above. The basename result is discarded
# because it is not the last expression of the cell; only the DataFrame
# returned by load_features is displayed.
# NOTE(review): absolute, machine-specific paths.
os.path.basename("/Users/maurizio/HELP/data/Kidney_BIO.csv")
load_features(["/Users/maurizio/HELP/data/Kidney_BIO.csv", "/Users/maurizio/HELP/data/Kidney_CCBeder.csv"], fixna=True, normalize='std', verbose=True, show_progress=True)

Kidney_BIO.csv:   0%|          | 0/19293 [00:00<?, ?it/s]

[Kidney_BIO] found 52532 Nan...
[Kidney_BIO] Fixing NaNs with mean ...
[Kidney_BIO] Normalization with std ...


Kidney_CCBeder.csv:   0%|          | 0/19298 [00:00<?, ?it/s]

[Kidney_CCBeder] found 1161570 Nan...
[Kidney_CCBeder] Fixing NaNs with mean ...
[Kidney_CCBeder] Normalization with std ...


Unnamed: 0_level_0,Gene length,Transcripts count,GC content,GTEX_kidney,Gene-Disease association,OncoDB_expression,HPA_kidney,GO-MF,GO-BP,GO-CC,...,GO.0098794,GO.0098793,GO.0098799,GO.0098791,GO.0001931,GO.0015030,GO.0098978,GO.0005579,GO.0008541,GO.0098798
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.003351,0.020942,0.501832,2.044542e-05,0.002950,0.651558,0.000002,0.082619,0.040702,0.115385,...,0.005490,0.002220,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.005695
A1CF,0.034865,0.047120,0.160530,1.980884e-05,0.023356,0.556939,0.000232,0.069767,0.041026,0.096154,...,0.000261,0.000184,0.0,0.001010,0.000000,0.165360,0.004055,0.000000,0.0,0.000000
A2M,0.019624,0.062827,0.176932,3.377232e-03,0.073746,0.584540,0.005382,0.302326,0.056410,0.076923,...,0.039004,0.018676,0.0,0.007586,0.000000,0.001357,0.036475,0.058456,0.0,0.000237
A2ML1,0.026017,0.041885,0.299948,5.123403e-07,0.017699,0.651558,0.000000,0.069767,0.005128,0.038462,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
A3GALT2,0.005784,0.000000,0.473739,1.421472e-06,0.023356,0.663540,0.000000,0.069767,0.015385,0.057692,...,0.000000,0.000000,0.0,0.026899,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,0.021209,0.010471,0.288257,7.073108e-06,0.023356,0.634761,0.000055,0.082619,0.040702,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
ZYG11B,0.040775,0.005236,0.248648,7.271294e-05,0.023356,0.646090,0.000238,0.000000,0.005128,0.000000,...,0.002883,0.003607,0.0,0.033041,0.000000,0.028038,0.003279,0.000000,0.0,0.000000
ZYX,0.003958,0.047120,0.539522,8.282866e-04,0.023356,0.672638,0.000177,0.046512,0.035897,0.153846,...,0.039631,0.011731,0.0,0.019659,0.087284,0.020391,0.052937,0.000000,0.0,0.000000
ZZEF1,0.056017,0.052356,0.304484,9.626291e-05,0.023356,0.651558,0.000121,0.093023,0.040702,0.078727,...,0.000742,0.002739,0.0,0.004421,0.000000,0.017462,0.001196,0.000000,0.0,0.012408


In [31]:
# Number of distinct index labels shared by the two attribute files
# (both are read in full, with their first column as index).
len(np.intersect1d(pd.read_csv("/Users/maurizio/HELP/data/Kidney_BIO.csv", index_col=0).index.values, pd.read_csv("/Users/maurizio/HELP/data/Kidney_CCBeder.csv", index_col=0).index.values))

19298

In [2]:
# Scratch cell: load a label file and assemble one attribute file, reporting
# progress through an ipywidgets IntProgress bar.
# NOTE(review): `ipy_readcsv` and `ipy_feature_assemble_df` are not defined or imported
# in the visible part of the notebook -- presumably leftovers from an earlier
# kernel session; confirm before relying on this cell. Paths are machine-specific.
pbarlv = wid.IntProgress()
df_y = ipy_readcsv("/Users/maurizio/HELP/data/Lung_HELP.csv", index_col=0, progressbar=pbarlv)
features = [{'fname': '/Users/maurizio/HELP/data/Kidney_BIO.csv', 'fixna' : False, 'normalize': 'std'}]
df_X, df_y = ipy_feature_assemble_df(df_y, features=features, verbose=True, show_progress=True, progressbar=pbarlv)


[Kidney_BIO.csv] found 52532 Nan...
[Kidney_BIO.csv] Normalization with std ...
17236 labeled genes over a total of 17931
(17236, 26) data input


In [3]:
# Inspect the widget result: length of its second element and the
# per-label counts of its first element.
len(w.value[1]), w.value[0].value_counts()

(17931,
 label
 NE       16683
 E         1248
 dtype: int64)

In [13]:
# Show the kernel's current working directory.
%pwd

'/Users/maurizio/HELP/HELPpy/utility'

In [17]:
# Experiment with a local file-chooser widget: put the current directory first
# on the import path so that the `myfilechooser` module found there is imported.
# NOTE(review): the recorded output of this cell is a TraitError, so the cell
# does not run cleanly as saved.
import sys
sys.path.insert(0, '.')
from myfilechooser import MyFileChooser
MyFileChooser("../../data")

TraitError: The 'value' trait of a SelectMultiple instance expected a tuple, not the NoneType None.