# Setup

In [12]:
import os
import sys
#import utils
import pandas as pd



from pypathway import GMTUtils

import importlib
# import pebba_python
# importlib.reload(pebba_python)

# from pebba_python import *


import numpy as np
from pypathway import ORA

In [3]:
file_in = "data/GSE49757_Septic_vs_Healthy.txt"
gmt_file = "data/Reactome_2016_15and100Genes.gmt"

# funcoes pebba

In [4]:
def pebba(file_in, 
          gmt_file, 
          gene_col="Gene.symbol",
          logFC_col="logFC",
          pvalue_col="P.Value",
          min_genes=100,
          max_genes=1500,
          p_cut=0.2,
          verbose=True,
          analysis_name= None, 
          results_dir="Results",
          force=False):

    validates_inputs(min_genes,max_genes,p_cut)

    create_results_directory(results_dir,force)

    

    
    ## Get information from all unique terms
    
    term2gene , path_desc , merge_p  = read_gmt_hier(gmt_file) #utils.read_gmt_hier(gmt_file)    

# alterei funcao pra retornar o merge_p, 
# ta retornando no formato de vetor mas talvez tenha q mudar para dataframe ou serie
    
    
    
    if( isinstance(file_in , str)):
        deg_list = pd.read_csv(file_in,  sep = "\t")#header true no original, so tirei o header pra o python inferir
        if(analysis_name is None):
            analysis_name =   os.path.splitext(os.path.basename(file_in) )[0] # pega o basename e tira a extensao
     
    elif(isinstance(file_in , pd.DataFrame)):
        deg_list = file_in
    

    if(analysis_name is None):
        analysis_name = "PEBBA_analysis"
    
    ## Remove rows that do not have a valid gene symbol
    deg_list = deg_list.dropna()
    ## Get background genes as a character vector
    ## Empty values (non-annotated genes) will be removed
    all_genes = deg_list["Gene.symbol"] # passar pra lista?
    
    # Get cutoff values -------------------------------------------------------
    if(verbose):
        print("Getting cutoff") # no original era message(), posso usar um print?
        
    table_cut = _get_cutoff(deg_list, logFC_col, pvalue_col, min_genes, max_genes)
    
    directions = ["up", "down" , "any"]
    
    cut_paths =[]
    dfs = []
    paths = []
    for direction in directions:
        if (verbose):
            print(direction + "\nGetting Pathways")
        df , path = _get_pathway(merge_p, term2gene, all_genes,
                            deg_list, gene_col, logFC_col,
                            pvalue_col, direction,
                            min_genes, max_genes, p_cut)
        if (verbose):
            print("Getting Pathway Cutoff")
        
        cut_path = _cutoff_path(path, p_cut, direction)
        
        cut_paths.append(cut_path)
        dfs.append(df)
        paths.append(path)
    
    
    ## Save heatmaps
    if(verbose):
        print("Saving heatmaps")
    

In [5]:
def validates_inputs(min_genes,max_genes,p_cut):

    if(min_genes < 50 or min_genes > 2900):
        sys.exit("Variable min_genes must be between 50 and 2900 genes")
          
    if(max_genes < 100 or max_genes > 3000):
        sys.exit("Variable max_genes must be between 100 and 3000 genes")  
    
    if(p_cut < 0.00001 or p_cut > 1):
        sys.exit("Variable p_cut must be between 0.00001 and 1")       

In [6]:
def create_results_directory(results_dir,force):
    results_dir = os.path.abspath(results_dir)
    if not os.path.exists(results_dir): 
        os.makedirs("Results/Tables") 
        os.makedirs("Results/Heatmaps")      
    else:    
        if( not force): 
            sys.exit("Stopping analysis: ", results_dir, " already exists! Use force=True to overwrite.")

# read_gmt

In [7]:

def read_gmt_hier(file_name):
    '''
    file_name string -> dataframe and dictionary
    
    Reads .gmt file and returns a Pandas DataFrame and a dictionary with the information of the gmt file in a format ready to be used.
    
    '''
    gmt_names = []
    gmt_desc  = []
    gmt_genes = []
    res =pd.DataFrame()

    with open(file_name, 'r') as f:
        # separar cada elemento separado por tab e guardar eles
        for line in f:
            gmt_names.append(line.split("\t")[0])
            gmt_desc.append(line.split("\t")[1])
            gmt_genes.append(line.split("\t")[2:])

    for i in range(len(gmt_genes)) :

        # apagar \n presente no ultimo gene de cada lista (artefato da leitura do arquivo)
        gmt_genes[i][-1] = gmt_genes[i][-1].replace("\n", "")

        # Poem na forma de um dataframe, cada linha um gene e suas informações relativas 
        temp= pd.DataFrame({'term': [gmt_names[i]]*len(gmt_genes[i]), 'hier':  [gmt_desc[i]]*len(gmt_genes[i]), 'gene' : gmt_genes[i] })
        res = pd.concat([res,temp])

    # reseta o indice
    res = res.reset_index(drop=True)    

    #relação entre nomes e descricões  (é pra isso q essa variavel serve? no original é um datafra esquisito, achei q assim ia ser mais otimizado)   
    path_desc = dict(zip(gmt_names,gmt_desc))

    return res, path_desc , gmt_names


# copy pastes e testes aleatorios

In [55]:
#!/usr/bin/env python
# coding: utf-8



def _get_cutoff(deg_list, logFC_col, pvalue_col, min_genes, max_genes):
    
    dirs = ["down", "up"]
    
    res_up = _get_directional_cutoff("up",deg_list, logFC_col, pvalue_col, min_genes, max_genes)
    res_down = _get_directional_cutoff("down",deg_list, logFC_col, pvalue_col, min_genes, max_genes)
    
    res = res_down.merge(res_up,on ="TopCut", how="outer" , suffixes = ("_down","_up"))
    
    
    res["minimum_log2fc_combined"] = res[['minimum_log2fc_down','minimum_log2fc_up']].min(axis=1)
    res["minimum_MinuslogP_combined"] = res[['minimum_MinuslogP_down','minimum_MinuslogP_up']].min(axis=1) 
    res["minimum_Pi_combined"] = res[['minimum_Pi_down','minimum_Pi_up']].min(axis=1)
    
    
    return res


# In[58]:


def _get_directional_cutoff(direction,deg_list, logFC_col, pvalue_col, min_genes, max_genes):
    if (direction == "down"):
        ascending = True # no original, decreasing = False
    else:
        ascending = False
            
    #pega o deg_list e ordena de maneira decrescente ou crescente usando o logFC_col como chave
    # ai ele pega só as n=max genes primeiras linhas e retorna os valores de logFC
    top = deg_list.sort_values(by = logFC_col, ascending = ascending)[[logFC_col , pvalue_col]]
    top = top[:max_genes]
    
    top["pi_value"] = top[logFC_col].apply(abs) * (- top[pvalue_col].apply(np.log10))
    top = top.sort_values(by = "pi_value" , ascending = False)
    df = pd.DataFrame(columns = ["minimum_log2fc","minimum_MinuslogP","minimum_Pi", "TopCut"])
    rows = []
    for i in range(min_genes, max_genes,50):
        top_genes = top.iloc[0:i]
        minFC = min(abs(top_genes[logFC_col]))
        maxP = max(top_genes[pvalue_col])
        minP = - np.log10(maxP)
        
        minPi = min(top_genes["pi_value"])
        
        ##### ja foi ordenado, entao posso so pegar o elemento especifico ao invez de procurar o elemento de novo
 #       minPi = min(top_genes.iloc[i,3])##### essa passagem n faz sentido

        row = {"minimum_log2fc":minFC, "minimum_MinuslogP": minP , "minimum_Pi":minPi,"TopCut":i}
        rows.append(row)
    df = pd.DataFrame(rows)
    df.set_index("TopCut")
    return df


# # \_get_pathway

# In[60]:


def _get_pathway(merge_p, term2gene, all_genes, deg_list,gene_col, logFC_col, pvalue_col, direction, min_genes, max_genes, p_cut):
   
    top = get_top(direction, deg_list, max_genes, logFC_col)
    
    top["pi_value"] = top[logFC_col].apply(abs) * (- top[pvalue_col].apply(np.log10))
    top = top.sort_values(by = "pi_value", ascending =False).reset_index(drop = True)
    
      
    pathGs = []  #melhorar isso
    for i in range(min_genes , max_genes , 50):
        top_genes = top.loc[0:i,gene_col].astype(str)
        pathG = _run_enrich(top_genes, all_genes, gmt_file)
        pathG.columns = [ "term" , str(i)]
        pathG = pathG.set_index("term",drop=True)
        pathGs.append(pathG)
    merge_p = pd.concat(pathGs, axis=1, join = "outer")
    merge_p.fillna(1.0) # acho q n eh mais necessario, ORA faz sozinho
    
    merge_p2 = ( merge_p.apply(np.log10) )*(-1)
    
    path_cut_p = np.log10(p_cut)*(-1)
    
    df = summarizes_ORA_information(merge_p2, path_cut_p,direction)
    merge_p2 = pd.concat([df, merge_p2], axis=1)    

    
    merge_p2 = merge_p2.sort_values(by = "FirstTopCut_significant_" + direction , ascending = False)
    merge_p2 = merge_p2.drop(labels = ["TopCut_highestMinuslogP_" + direction ,
                  "maximum_MinuslogP_" + direction ,
                  "sum_MinuslogP_" + direction ,
                  "times_significant_" + direction  ,
                  "FirstTopCut_significant_" + direction , 
                  "PEBBA_score_" + direction], axis =1 )
    
    #refatorar toda essa nojeira legada
    
    return  df , merge_p2
    


# In[61]:


def get_top(direction, deg_list, max_genes, logFC_col):
    
    if(direction == "up"):
        top = deg_list.sort_values(by = logFC_col, ascending = False).head(n=max_genes)
    elif(direction =="down"):
        top = deg_list.sort_values(by= logFC_col, ascending = True).head(n=max_genes)
    elif(direction =="any"):
        deg_list[logFC_col] = deg_list[logFC_col].astype(np.float64) 
        deg_list[logFC_col] = deg_list[logFC_col].abs()
        top = deg_list.sort_values(by = logFC_col, ascending = True).head(n=max_genes)
    else:
        sys.exit("Invalid direction argument")
    return top


# In[62]:


def summarizes_ORA_information(merge_p2, path_cut_p, direction) :


    NG = merge_p2.idxmax(axis = 1) # O recorte de genes q apresentou o maior p valor possui NG genes
    NG = NG.astype(np.int64)
    p_max = merge_p2.max(axis = 1)
    p_sum = merge_p2.sum(axis = 1)
    
    num_columns_merge_p2 = merge_p2.shape[1]
    how_many_pathways_above_cut = calculate_how_many_pathways_above_cut(merge_p2,path_cut_p,axis =1)
     
    times = how_many_pathways_above_cut / num_columns_merge_p2
    
    ES3 = (1 - np.exp(- p_max) / (1 + (0.1 * np.sqrt(NG)) ) )
   
    first = merge_p2.apply(first_column_above_path_cut_p , axis = 1, path_cut_p=path_cut_p )
    first = first.apply(lambda x: merge_p2.columns[x] if x !=0 else 0 )
 
    dicionario = {"TopCut_highestMinuslogP_" + direction : NG,
                  "maximum_MinuslogP_" + direction : p_max ,
                  "sum_MinuslogP_" + direction : p_sum,
                  "times_significant_" + direction : times ,
                  "FirstTopCut_significant_" + direction : first, 
                  "PEBBA_score_" + direction : ES3}
    
    df = pd.DataFrame(dicionario)
    df["FirstTopCut_significant_" + direction] = df["FirstTopCut_significant_" + direction].astype(np.int64)
    
    return df



def _run_enrich(top_genes, all_genes, gmt_file):
    term2gene = GMTUtils.parse_gmt_file(gmt_file)
    df = ORA.run(top_genes, all_genes, term2gene).df
    df = df[["name", "fdr"]]
    return df   


# In[64]:


def first_column_above_path_cut_p(row, path_cut_p):
    for cont , element in enumerate(row):
        if element > path_cut_p:
            return cont
    
    return 0


# In[65]:


def calculate_how_many_pathways_above_cut(df, path_cut_p,axis):
    f = lambda x: x > path_cut_p
    how_many_pathways_above_cut =  df.apply(f,axis=1).sum(axis=axis)
    return how_many_pathways_above_cut   

# # \_cutoff_path
# In[59]:

def _cutoff_path(path_table, p_cut, direction):
    
    df_index = path_table.columns
    df = pd.DataFrame()
    df["MaxR"] = path_table.max()
    df["SumR"] = path_table.sum()
    path_cut_p = np.log10(p_cut) * (-1)
    
    
    #How many pathways above path_cut_p (freq)    
    how_many_pathways_above_cut = calculate_how_many_pathways_above_cut(path_table,path_cut_p, axis =0)
    n_rows =len(path_table.index) ######
    df["times"] = how_many_pathways_above_cut / n_rows
    
    df.columns = ["maximum_MinuslogP_"+ direction,
                  "sum_MinuslogP_"+ direction,
                  "times_significant_"+ direction]
    return df

# Run

In [56]:
pebba(file_in , gmt_file, force=True)

Getting cutoff
up
Getting Pathways
Getting Pathway Cutoff
      maximum_MinuslogP_up  sum_MinuslogP_up  times_significant_up
100              -0.000000          0.000000              0.000000
150               0.017535          0.017535              0.000000
200               2.263452          3.326417              0.002448
250               2.048301          5.376954              0.003672
300               1.717098          5.098998              0.004896
350               1.399754          5.614343              0.002448
400               1.127487          4.072345              0.002448
450               1.065725          5.121371              0.004896
500               0.855012          4.057010              0.004896
550               0.666252          4.979767              0.000000
600               1.007820          9.497538              0.001224
650               0.816472         12.637324              0.001224
700               0.722450         12.854333              0.006120
750 

## Coisas a checar proxima vez:
## GMTutils, refatorar tudo isso
