# Setup

In [12]:
import os
import sys
#import utils
import pandas as pd



from pypathway import GMTUtils

import importlib
# import pebba_python
# importlib.reload(pebba_python)

# from pebba_python import *


import numpy as np
from pypathway import ORA

In [3]:
# Tab-separated input table handed to pebba() as `file_in`.
file_in = "data/GSE49757_Septic_vs_Healthy.txt"
# GMT gene-set file handed to pebba() as `gmt_file`.
gmt_file = "data/Reactome_2016_15and100Genes.gmt"

# PEBBA functions

In [4]:
def pebba(file_in, 
          gmt_file, 
          gene_col="Gene.symbol",
          logFC_col="logFC",
          pvalue_col="P.Value",
          min_genes=100,
          max_genes=1500,
          p_cut=0.2,
          verbose=True,
          analysis_name= None, 
          results_dir="Results",
          force=False):
    """Drive the PEBBA analysis for one differential-expression table.

    The function validates the numeric parameters, prepares the results
    directory, loads the GMT gene sets and the input table, computes the
    cutoff table and then, for each direction ("up", "down", "any"), calls
    the pathway helpers and collects their outputs.

    Parameters
    ----------
    file_in : str or pandas.DataFrame
        Path to a tab-separated table, or an already loaded DataFrame.
    gmt_file : str
        Path to the GMT file, read with ``read_gmt_hier``.
    gene_col : str
        Name of the column holding the gene symbols.
    logFC_col, pvalue_col : str
        Column names forwarded to the cutoff/pathway helpers (presumably the
        log fold-change and p-value columns -- confirm against the helpers).
    min_genes, max_genes, p_cut : number
        Tuning parameters; their allowed ranges are enforced by
        ``validates_inputs``.
    verbose : bool
        When true, progress messages are printed.
    analysis_name : str or None
        Defaults to the input file's base name without extension, or to
        "PEBBA_analysis" when ``file_in`` is a DataFrame.
    results_dir : str
        Output directory, forwarded to ``create_results_directory``.
    force : bool
        Forwarded to ``create_results_directory``.

    Returns
    -------
    None
        NOTE(review): the per-direction results are collected but neither
        saved nor returned yet; the heatmap step is not implemented.

    Raises
    ------
    TypeError
        If ``file_in`` is neither a path string nor a DataFrame.
    """
    validates_inputs(min_genes, max_genes, p_cut)

    # Reject unsupported inputs up front, before any directory is created;
    # otherwise `deg_list` would be left unbound further down.
    if not isinstance(file_in, (str, pd.DataFrame)):
        raise TypeError(
            "file_in must be a file path (str) or a pandas DataFrame, got "
            + type(file_in).__name__
        )

    create_results_directory(results_dir, force)

    ## Get information from all unique terms
    # read_gmt_hier also returns the list of term names (merge_p).
    # NOTE(review): it is a plain list for now; it may need to become a
    # DataFrame or Series depending on what _get_pathway expects.
    term2gene, path_desc, merge_p = read_gmt_hier(gmt_file)

    if isinstance(file_in, str):
        # The header row is inferred by pandas (first line of the file).
        deg_list = pd.read_csv(file_in, sep="\t")
        if analysis_name is None:
            # Base name of the input file with its extension removed.
            analysis_name = os.path.splitext(os.path.basename(file_in))[0]
    else:
        deg_list = file_in

    if analysis_name is None:
        analysis_name = "PEBBA_analysis"

    ## Drop rows with missing values. dropna() without `subset` removes a row
    ## when ANY column is NaN, which includes rows lacking a gene symbol.
    deg_list = deg_list.dropna()

    ## Background genes: use the caller-supplied gene column rather than a
    ## hard-coded column name.
    all_genes = deg_list[gene_col]

    # Get cutoff values -------------------------------------------------------
    if verbose:
        print("Getting cutoff")

    table_cut = _get_cutoff(deg_list, logFC_col, pvalue_col, min_genes, max_genes)

    directions = ["up", "down", "any"]

    cut_paths = []
    dfs = []
    paths = []
    for direction in directions:
        if verbose:
            print(direction + "\nGetting Pathways")
        df, path = _get_pathway(merge_p, term2gene, all_genes,
                                deg_list, gene_col, logFC_col,
                                pvalue_col, direction,
                                min_genes, max_genes, p_cut)
        if verbose:
            print("Getting Pathway Cutoff")

        cut_path = _cutoff_path(path, p_cut, direction)

        cut_paths.append(cut_path)
        dfs.append(df)
        paths.append(path)

    ## Save heatmaps
    # TODO: the heatmap/table export is not implemented yet.
    if verbose:
        print("Saving heatmaps")
    

In [5]:
def validates_inputs(min_genes,max_genes,p_cut):
    """Stop the run via ``sys.exit`` when a parameter is outside its range.

    The checks are performed in order (min_genes, max_genes, p_cut) and the
    first violation terminates with the matching message. Bounds are
    inclusive: a value equal to a limit is accepted.
    """
    # (value, lower bound, upper bound, message shown when out of range)
    range_rules = (
        (min_genes, 50, 2900, "Variable min_genes must be between 50 and 2900 genes"),
        (max_genes, 100, 3000, "Variable max_genes must be between 100 and 3000 genes"),
        (p_cut, 0.00001, 1, "Variable p_cut must be between 0.00001 and 1"),
    )

    for value, lowest, highest, complaint in range_rules:
        if value < lowest or value > highest:
            sys.exit(complaint)

In [6]:
def create_results_directory(results_dir,force):
    """Prepare the output directory tree for an analysis run.

    Creates ``<results_dir>/Tables`` and ``<results_dir>/Heatmaps``.

    Parameters
    ----------
    results_dir : str
        Root output directory; it is resolved to an absolute path.
    force : bool
        When false and ``results_dir`` already exists, the run is stopped.
        When true, an existing directory is reused and any missing
        sub-directory is created.

    Raises
    ------
    SystemExit
        If ``results_dir`` already exists and ``force`` is false.
    """
    results_dir = os.path.abspath(results_dir)

    if os.path.exists(results_dir) and not force:
        # sys.exit accepts a single argument, so the message is built first.
        sys.exit("Stopping analysis: " + results_dir
                 + " already exists! Use force=True to overwrite.")

    # Build the sub-folders under the requested directory (not a fixed
    # "Results/" path); exist_ok keeps a forced re-run from failing.
    for subfolder in ("Tables", "Heatmaps"):
        os.makedirs(os.path.join(results_dir, subfolder), exist_ok=True)

# read_gmt

In [7]:

def read_gmt_hier(file_name):
    '''
    Read a .gmt file and return its content in ready-to-use structures.

    Each non-blank line is expected to be tab-separated:
    ``term <TAB> description <TAB> gene1 <TAB> gene2 ...``.

    Parameters
    ----------
    file_name : str
        Path to the .gmt file.

    Returns
    -------
    res : pandas.DataFrame
        One row per (term, gene) pair with columns 'term', 'hier' and 'gene'.
        'hier' holds the second field of the line.
    path_desc : dict
        Maps each term name to its second field. If a term name appears more
        than once, the last occurrence wins.
    gmt_names : list of str
        Term names in file order.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than two tab-separated fields.
    '''
    gmt_names = []
    gmt_desc = []

    # Flat, per-gene columns: the DataFrame is built once at the end instead
    # of concatenating one small frame per term (which is quadratic).
    term_col = []
    hier_col = []
    gene_col = []

    with open(file_name, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            # Remove the line terminator left over from reading the file.
            line = line.rstrip("\r\n")
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no gene set.
                continue

            fields = line.split("\t")
            if len(fields) < 2:
                raise ValueError(
                    "Malformed GMT line %d in %s: expected at least a term "
                    "name and a description separated by tabs"
                    % (line_no, file_name)
                )

            name, desc = fields[0], fields[1]
            # Trailing or doubled tabs produce empty tokens; those are not genes.
            genes = [gene for gene in fields[2:] if gene]

            gmt_names.append(name)
            gmt_desc.append(desc)

            term_col.extend([name] * len(genes))
            hier_col.extend([desc] * len(genes))
            gene_col.extend(genes)

    res = pd.DataFrame({'term': term_col, 'hier': hier_col, 'gene': gene_col})

    # Relation between term names and their descriptions.
    path_desc = dict(zip(gmt_names, gmt_desc))

    return res, path_desc, gmt_names


# Copy-pastes and random tests

In [2]:
import pandas as pd

In [9]:
# Scratch series mixing positive, negative, whole and fractional values for ad-hoc experiments.
serie_temp = pd.Series([7,0.124,124.12, -12342.3423,-0.123812,4,-13,5])

In [12]:
serie_temp

0        7.000000
1        0.124000
2      124.120000
3   -12342.342300
4       -0.123812
5        4.000000
6      -13.000000
7        5.000000
dtype: float64

# Run

In [56]:
# Run the analysis on the configured inputs; force=True bypasses the
# "results directory already exists" stop in create_results_directory.
pebba(file_in , gmt_file, force=True)

Getting cutoff
up
Getting Pathways
Getting Pathway Cutoff
      maximum_MinuslogP_up  sum_MinuslogP_up  times_significant_up
100              -0.000000          0.000000              0.000000
150               0.017535          0.017535              0.000000
200               2.263452          3.326417              0.002448
250               2.048301          5.376954              0.003672
300               1.717098          5.098998              0.004896
350               1.399754          5.614343              0.002448
400               1.127487          4.072345              0.002448
450               1.065725          5.121371              0.004896
500               0.855012          4.057010              0.004896
550               0.666252          4.979767              0.000000
600               1.007820          9.497538              0.001224
650               0.816472         12.637324              0.001224
700               0.722450         12.854333              0.006120
750 

## Things to check next time:
## GMTUtils, refactor all of this
