In [None]:
# %%html
# <style>div.input{display:none} div.output_stderr{display:none}</style>

[KnowEnG Signature_Analysis_Pipeline on Github](https://github.com/KnowEnG-Research/Signature_Analysis_Pipeline)

# KnowEnG Signiture Analysis Notebook
* context of developing the notebooks_KnowEnG repository.
* running it on the notebooks.knoweng.org server.
* with a common directory structure where all user notebooks call shared pipeline src.

### Notebook widgets to set run parameters for Signiture Analysis Pipeline
* define all run parameters.
* define widget to set all run parameters.
* define a function that takes a run_file and returns controls with a display controls function and a go button.

### KnowEnG Pipelines; Yaml Files Translation: Run Relative Directory Replacements:
* results_directory
* run_directory, run_file (neither used in toolbox calls)
* tmp_directory
* /data/spreadsheets
* /data/networks

In [3]:
# %matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os
import io
import sys
import base64

import pandas as pd
from pandas.io.common import EmptyDataError
import numpy as np

from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
import traitlets

from knpackage import toolbox as kn

sys.path.insert(1, '../KnowEnG/Signature_Analysis_Pipeline/src')
import gene_signature_toolbox as gst
SPREADSHEETS_DIR = os.path.abspath('../KnowEnG/Signature_Analysis_Pipeline/data/spreadsheets')
NETWORKS_DIR = os.path.abspath('../KnowEnG/Signature_Analysis_Pipeline/data/spreadsheets')
YAML_DIR = os.path.abspath('../KnowEnG/Signature_Analysis_Pipeline/data/run_files')

sys.path.insert(1, '../KnowEnG/notebooks_KnowEnG/src')
from layout_notebooks import *

from   lifelines import KaplanMeierFitter
from   lifelines.statistics import logrank_test, multivariate_logrank_test, pairwise_logrank_test

results_dir = USER_RESULTS_DIRECTORY
input_data_dir = USER_DATA_DIRECTORY
clear_output()

In [4]:
# %%writefile os.path.join('../src', localize_run_parameters.py)
# """
# author: lanier4@illinois.edu
# functions to convert yaml file data source and output directory references 
# to local directory references using a python dict
# """
# import os

def yaml_append(yaml_file_name_full_path, new_key, new_value):
    """ Usage:  yaml_append(yaml_file_name_full_path, new_key, new_value)  """
    if not os.path.isfile(yaml_file_name_full_path):
        print('file not found:\n', yaml_file_name_full_path)
        return
    
    appendable_text = '\n' + new_key + ':    ' + new_value + '\n'
    print(yaml_file_name_full_path)
    try:
        with open(yaml_file_name_full_path, "a+") as yaml_file:
            yaml_file.write(appendable_text)
    except:
        print('Not Properly Appended:\n', yaml_file_name_full_path)
        pass

def get_yaml_files_list(YAML_DIR):
    """ yaml_files = get_yaml_files_list(YAML_DIR) """
    yaml_files = []
    for f in os.listdir(YAML_DIR):
        if os.path.isfile(os.path.join(YAML_DIR, f)) and f[-3:] == 'yml':
            yaml_files.append(f)
            
    return yaml_files

def display_yaml_files_list(YAML_DIR):
    yaml_files = get_yaml_files_list(YAML_DIR)
    print(YAML_DIR)
    for f in yaml_files:
        print('\t',f)

def display_all_yaml_files(YAML_DIR, yaml_files):
    n_files = len(yaml_files)
    count = 0
    for f in yaml_files:
        count += 1
        print('\n\n%30s : %s'%(f, YAML_DIR))
        print('%30s : %s '%(' "file[' + str(count-1) + ']" :~) ' + str(count), 'of ' + str(n_files)))
        run_parameters = kn.get_run_parameters(YAML_DIR, f)
        for k, v in run_parameters.items():
            print('%30s : %s'%(k,v))

def get_available_directory_names(DIR_NAME):
    """ dir_names = get_available_directory_names(DIR_NAME) """
    if not os.path.isdir(DIR_NAME):
        return []
    dir_names = []
    for maybe_dir in os.listdir(DIR_NAME):
        if os.path.isdir(maybe_dir) and maybe_dir[0] != '.':
            dir_names.append(maybe_dir)

    return dir_names

def set_local_run_parameters(run_parameters, local_dict):
    """ run_parameters = set_local_run_parameters(run_parameters, local_dict) """
    for key_name, key_value in local_dict.items():
        for k, v in run_parameters.items():
            if 'full_path' in k or 'directory' in k:
                if key_name in v:
                    de_nada, f_name = os.path.split(v)
                    run_parameters[k] = os.path.join(key_value, f_name)
                
    return run_parameters

In [5]:
# yaml_files = get_yaml_files_list(YAML_DIR)
# display_all_yaml_files(YAML_DIR, yaml_files)
display_yaml_files_list(YAML_DIR)

/Users/lanier4/git_clone/dlanier/KnowEnG/Signature_Analysis_Pipeline/data/run_files
	 BENCHMARK_1_GS_cos.yml
	 BENCHMARK_1_GS_spearman.yml
	 BENCHMARK_2_GS_net_cos.yml
	 BENCHMARK_2_GS_net_spearman.yml
	 BENCHMARK_3_GS_cc_cos.yml
	 BENCHMARK_3_GS_cc_spearman.yml
	 BENCHMARK_4_GS_cc_net_cos.yml
	 BENCHMARK_4_GS_cc_net_spearman.yml
	 TEST_1_GS_cos.yml
	 TEST_1_GS_spearman.yml
	 TEST_2_GS_net_cos.yml
	 TEST_2_GS_net_spearman.yml
	 TEST_3_GS_cc_cos.yml
	 TEST_3_GS_cc_spearman.yml
	 TEST_4_GS_cc_net_cos.yml
	 TEST_4_GS_cc_net_spearman.yml
	 zTEMPLATE_cc_net_cos.yml
	 zTEMPLATE_cc_net_spearman.yml


In [9]:
run_file_name = 'TEST_1_GS_cos.yml'
run_parameters = kn.get_run_parameters(YAML_DIR, run_file_name)
print('\t%s\n'%run_file_name)
for k, v in run_parameters.items():
    print('%30s : %s'%(k,v))

print('\nAvailable Directories:')
for d in os.listdir():
    if os.path.isdir(d) and d[0] != '.':
        print(d)

	TEST_1_GS_cos.yml

            similarity_measure : cosine
             results_directory : ./run_dir/results
                        method : similarity
                 run_directory : /Users/lanier4/git_clone/dlanier/KnowEnG/Signature_Analysis_Pipeline/data/run_files
                      run_file : TEST_1_GS_cos.yml
                 tmp_directory : ./run_dir/tmp
      signature_name_full_path : ../data/spreadsheets/TEST_1_signature.tsv
    spreadsheet_name_full_path : ../data/spreadsheets/TEST_1_gene_sample.tsv

Available Directories:
build
data
results
src
test
user_data


In [10]:
SPREADSHEETS_DIR = os.path.abspath('../KnowEnG/Signature_Analysis_Pipeline/data/spreadsheets')
NETWORKS_DIR = os.path.abspath('../KnowEnG/Signature_Analysis_Pipeline/data/spreadsheets')
YAML_DIR = os.path.abspath('../KnowEnG/Signature_Analysis_Pipeline/data/run_files')
run_dir = os.getcwd()
local_dict = {'data/spreadsheets': SPREADSHEETS_DIR, 
              'data/networks': NETWORKS_DIR, 
              'run_dir' : run_dir, 
              'data/run_files' : YAML_DIR}

In [13]:
run_parameters_lcl = set_local_run_parameters(run_parameters, local_dict)
run_parameters_lcl['accuracy_measure'] = \
    '../KnowEnG/Signature_Analysis_Pipeline/data/spreadsheets/label_validation.txt'
for k, v in run_parameters_lcl.items():
    print('%30s : %s'%(k,v))

            similarity_measure : cosine
              accuracy_measure : ../KnowEnG/Signature_Analysis_Pipeline/data/spreadsheets/label_validation.txt
             results_directory : /Users/lanier4/git_clone/dlanier/notebooks_KnowEnG/results
                        method : similarity
                 run_directory : /Users/lanier4/git_clone/dlanier/KnowEnG/Signature_Analysis_Pipeline/data/run_files/run_files
                      run_file : TEST_1_GS_cos.yml
                 tmp_directory : /Users/lanier4/git_clone/dlanier/notebooks_KnowEnG/tmp
      signature_name_full_path : /Users/lanier4/git_clone/dlanier/KnowEnG/Signature_Analysis_Pipeline/data/spreadsheets/TEST_1_signature.tsv
    spreadsheet_name_full_path : /Users/lanier4/git_clone/dlanier/KnowEnG/Signature_Analysis_Pipeline/data/spreadsheets/TEST_1_gene_sample.tsv


In [27]:
expression_name     = run_parameters_lcl["spreadsheet_name_full_path"]
signature_name      = run_parameters_lcl["signature_name_full_path"  ]
similarity_measure  = run_parameters_lcl["similarity_measure"        ]
accuracy_measure    = run_parameters_lcl["accuracy_measure"        ]

expression_df       = kn.get_spreadsheet_df(expression_name)
signature_df        = kn.get_spreadsheet_df(signature_name )

samples_names       = expression_df.columns
signatures_names    =  signature_df.columns
signatures_names    = [i.split('.')[0] for i in signatures_names]
signature_df.columns= signatures_names

similarity_mat = generate_similarity_mat(expression_df, signature_df,similarity_measure)
# similarity_mat = map_similarity_range(similarity_mat, 0)
similarity_df  = pd.DataFrame(similarity_mat, index=samples_names, columns=signatures_names)     # 37

# print(similarity_mat)
print('similarity_df\n',similarity_df)
result = similarity_df.idxmax(axis=1, skipna=True)
# benchmark = pd.read_csv('../data/spreadsheets/label_validation.txt', index_col=None, header=None, sep='\t')
# accuracy = calculate_accuracy(similarity_df, accuracy_measure)
# print(accuracy)
# print(type(result))
# print(result)
# ret_li = result.values
# print(ret_li)

[[ 1.  1.]
 [ 1.  1.]
 [ 1.  1.]
 [ 1.  1.]]
     S1   S2
E1  1.0  1.0
E2  1.0  1.0
E3  1.0  1.0
E4  1.0  1.0
<class 'pandas.core.series.Series'>
E1    S1
E2    S1
E3    S1
E4    S1
dtype: object
['S1' 'S1' 'S1' 'S1']


In [None]:
# from   sklearn.metrics.pairwise import cosine_similarity
# from   scipy.stats              import spearmanr

In [26]:
def generate_similarity_mat(expression_df, signature_df,similarity_measure):
    """generate matrix which save the similarity value of input dataframes

    Args:
        expression_df: genes x samples dataframe.
        signature_df:  genes x samples dataframe.
        
    Returns:
        similarity_mat: matrix with similarity values
    """

    genes_in_expression =  expression_df.index
    genes_in_signature  =   signature_df.index

    common_genes        = kn.find_common_node_names(genes_in_expression, genes_in_signature)
    expression_mat      = expression_df.loc[common_genes, :].values
    signature_mat       =  signature_df.loc[common_genes, :].values
    nx                  = expression_mat.shape[1]

    if   (similarity_measure == "cosine" ):
          similarity_mat      = cosine_similarity(expression_mat.T, signature_mat.T)
          # print(similarity_mat.shape)
    elif (similarity_measure == "spearman"):
          similarity_mat      = spearmanr(expression_mat, signature_mat)[0]
          # print(expression_mat)
          similarity_mat      = np.abs(similarity_mat[0:nx,nx:] )
          # print(similarity_mat.shape)
    return similarity_mat

def run_similarity(run_parameters):
    """ Performs similarity analysis and saves the similarity matrix.

    Args:
        run_parameters: parameter set dictionary.
    """

    expression_name     = run_parameters["spreadsheet_name_full_path"]
    signature_name      = run_parameters["signature_name_full_path"  ]
    similarity_measure  = run_parameters["similarity_measure"        ]
    accuracy_measure    = run_parameters["accuracy_measure"        ]

    expression_df       = kn.get_spreadsheet_df(expression_name)
    signature_df        = kn.get_spreadsheet_df(signature_name )
    
    samples_names       = expression_df.columns
    signatures_names    =  signature_df.columns
    signatures_names    = [i.split('.')[0] for i in signatures_names]
    signature_df.columns= signatures_names

    similarity_mat = generate_similarity_mat(expression_df, signature_df,similarity_measure)
    # similarity_mat = map_similarity_range(similarity_mat, 0)
    similarity_df  = pd.DataFrame(similarity_mat, index=samples_names, columns=signatures_names)     # 37
    accuracy = calculate_accuracy(similarity_df, accuracy_measure)
    print(accuracy)
#     save_final_samples_signature(similarity_df, run_parameters)
    
def read_listfile(list_file_name):
    """ pandas read a list from a file
    Args:
        list_file_name: full path name of a panda readable list file
    """
    list_file_list = pd.read_csv(list_file_name, index_col=None, header=None, sep='\t')
    return list_file_list
    
def calculate_accuracy(similarity_df, list_file_name):
    """ Calculate accuracy given similarity dataframe and benchmark result

    Args:
        similarity_df: result dataframe from run_similarity methods
    """
    result = similarity_df.idxmax(axis=1, skipna=True)
    # benchmark = read_listfile(list_file_name)
    benchmark = pd.read_csv('../data/spreadsheets/label_validation.txt', index_col=None, header=None, sep='\t')
    ret_li = result.values
    ben_li = benchmark.values.reshape((1,-1))[0]

    # common = np.sum(np.int(ret_li==ben_li) + 0.0)
    common = ret_li==ben_li
    common[common==True] = 1
    common[common==False] = 0
    accuracy = sum(common)/len(ret_li)
    return accuracy
