[Signature Analysis Pipeline - Readme on github](https://github.com/KnowEnG-Research/Signature_Analysis_Pipeline/blob/master/README.md)
## Click on the >| Run button above to run a cell.
    * Run the cells in order (re-running the same cell is ok).
    * Kernel > Restart & Clear Output (to clear errors and start over).
    * Note that the path names in the yaml files must be edited even to run with the pipeline defaults.


# KnowEnG 's Signiture Analysis Pipeline notebook
    (0) (optional upload your data files: spreadsheets, network, yaml).
    1) Select and Edit the run parameters from the pipeline data/run_files.
    2) Run the yaml file to put the output in the results directory.
    3) View and download your results.

In [1]:
import os
import sys

import knpackage.toolbox as kn

sys.path.insert(1, '../Signature_Analysis_Pipeline/src')
import gene_signature_toolbox as gsa_tbx


yaml_files_path = '../Signature_Analysis_Pipeline/data/run_files'
spreadsheet_files_path = '../Signature_Analysis_Pipeline/data/spreadsheets'
network_files_path = '../Signature_Analysis_Pipeline/data/networks'

def get_run_parameters(yaml_file_name, yaml_files_path=yaml_files_path):
    """ use knoweng package code to get the run parameters """
    return kn.get_run_parameters(yaml_files_path, yaml_file_name)

def disp_run_parameters(run_parameters):
    """ formateed display of the run_parameters dict """
    for k, v in run_parameters.items():
        print('%25s : %s'%(k, v))

def disp_yaml_file(yaml_file_name, yaml_files_path=yaml_files_path):
    """ open and display a yaml file """
    disp_run_parameters(get_run_parameters(yaml_file_name, yaml_files_path))
    
def view_directory(dir_name):
    """ view the contents of a directory """
    if os.path.isdir(dir_name):
        dir_list = os.listdir(dir_name)
        for maybe_file in dir_list:
            if os.path.isfile(os.path.join(dir_name, maybe_file)):
                print('\t',maybe_file)
    else:
        print('directory not found:\n',dir_name)

def get_full_path_name(file_name, path_name):
    return os.path.join(path_name, file_name)


import pandas as pd
from IPython.display import display

def view_results_spreadsheet(result_file_name, results_path, first_row=0, last_row=10):
    """ view a spreadsheeet """
    start_row = min(first_row, last_row)
    fin_row = max(first_row, last_row)
    if start_row + 1 > fin_row:
        return
    if os.path.isfile(os.path.join(results_path, result_file_name)):
        spreadsheet_name_full_path = get_full_path_name(result_file_name, results_path)
        spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
        number_of_rows = spreadsheet_df.shape[0]
        if number_of_rows < 1:
            return
        elif fin_row <= number_of_rows and start_row >= 0:
            display(spreadsheet_df.iloc[start_row:fin_row,:])
        elif fin_row <= number_of_rows:
            display(spreadsheet_df.iloc[0:fin_row,:])
        else:
            display(spreadsheet_df.iloc[0:number_of_rows,:])
    else:
        print('Data Frame File Not Found:\n', os.path.join(results_path, result_file_name))


user_data_path = 'user_data'
results_path = 'results'


print('run parameter (yaml) files available for this pipeline:\n')
view_directory(yaml_files_path)

run parameter (yaml) files available for this pipeline:

	 TEST_4_GS_cc_net_spearman.yml
	 TEST_4_GS_cc_net_cos.yml
	 zTEMPLATE_cc_net_spearman.yml
	 TEST_3_GS_cc_cos.yml
	 BENCHMARK_4_GS_cc_net_spearman.yml
	 BENCHMARK_3_GS_cc_cos.yml
	 BENCHMARK_1_GS_cos.yml
	 BENCHMARK_4_GS_cc_net_cos.yml
	 TEST_3_GS_cc_spearman.yml
	 zTEMPLATE_cc_net_cos.yml
	 TEST_2_GS_net_cos.yml
	 BENCHMARK_3_GS_cc_spearman.yml
	 TEST_2_GS_net_spearman.yml
	 BENCHMARK_2_GS_net_spearman.yml
	 TEST_1_GS_spearman.yml
	 BENCHMARK_1_GS_spearman.yml
	 TEST_1_GS_cos.yml
	 BENCHMARK_2_GS_net_cos.yml


## (optional) Upload your files to the user data directory:
    * Select File > Open from the jupyter menu above.
    * Switch to the **user_data** directory by clicking on it.
    * In the Upper Right of the directory page click on upload, and find the file on your computer.
    * You will have to click on the highlighted "upload" button (or "cancel") to start the upload.
    
### (Edit one of the yaml files as a starting point to run your data).

## 1) Choose a yaml file, display the run_parameters dictionary.

In [None]:
# Choose a yaml file
yaml_file_name = 'BENCHMARK_2_GS_net_cos.yml'

run_parameters = get_run_parameters(yaml_file_name, yaml_files_path)
disp_run_parameters(run_parameters)

### 1) Edit and display until run_parameters are correct.

In [None]:
# edit and display:
run_parameters['results_directory'] = 'results'
run_parameters['tmp_directory'] = 'results'
run_parameters['run_directory'] = '.'

run_parameters['gg_network_name_full_path'] = get_full_path_name('keg_ST90_4col.edge', network_files_path)
run_parameters['signature_name_full_path'] = get_full_path_name('Hsap.nbs_UCEC.G.gene_som_mut.binary.a.df', 
                                                                spreadsheet_files_path)
run_parameters['spreadsheet_name_full_path'] = get_full_path_name('Hsap.nbs_OV.G.gene_som_mut.binary.a.df', 
                                                                spreadsheet_files_path)

run_parameters['rwr_restart_probability'] = 0.5
disp_run_parameters(run_parameters)

## 2) Run the pipeline with the edited run_parameters.

In [None]:
if run_parameters['method'] == 'similarity':
    gsa_tbx.run_similarity(run_parameters)
elif run_parameters['method'] == 'cc_similarity':
    gsa_tbx.run_cc_similarity(run_parameters)
elif run_parameters['method'] == 'net_similarity':
    gsa_tbx.run_net_similarity(run_parameters)
elif run_parameters['method'] == 'cc_net_similarity':
    gsa_tbx.run_cc_net_similarity(run_parameters)

## 3) View the file:
    * Run the next cell to see the list of files in the results directory.
    * Enter the name of a file to view with the next cell.
        * (Only the first 10 rows will be displayed)

In [4]:
# view the results directory and choose a file to view

result_file_name = 'Enter_the_file_name_only_here'

os.listdir(results_path)

['result_net_similarity_cosine_Tue_28_Nov_2017_20_47_08.010686635_viz.tsv',
 'result_net_similarity_cosine_Tue_28_Nov_2017_20_30_55.845087289_viz.tsv',
 '.ipynb_checkpoints',
 'result_net_similarity_cosine_Tue_28_Nov_2017_21_15_41.129926681_viz.tsv',
 'result_net_similarity_cosine_Tue_28_Nov_2017_20_58_51.706730842_viz.tsv']

In [5]:
# display the spreadsheet if the name is entered properly.
# result_file_name = 'result_net_similarity_cosine_Tue_28_Nov_2017_21_15_41.129926681_viz.tsv'
first_row = 0
max_number_to_display = 20000
last_row = first_row + max_number_to_display
view_results_spreadsheet(result_file_name ,results_path, first_row, last_row)

Data Frame File Not Found:
 results/Enter_the_file_name_only_here


### 3) Download the file:
    * Select File - Open - results from the jupyter menu.
    * Click the checkbox next to the file name (only one at a time).
    * Click download to copy the file to your downloads directory.