In [None]:
# Record the interpreter version used for this run.
from platform import python_version
print(python_version())

In [None]:
import os, sys, pickle

import numpy as np
import pandas as pd
pd.set_option('display.width', 100)
pd.set_option('max_colwidth', 80)
import yaml

import seaborn as sns
sns.set_context("notebook", font_scale=1.4)

import plotly.graph_objects as go

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# Make the project's local modules under ../src/ importable from this notebook.
sys.path.insert(1, '../src/')

# NOTE(review): star imports hide where names come from. Helpers used further
# down (pdreadcsv, Config, Biopax, Gemini, colorscale) are presumably provided
# by these modules — confirm, and consider explicit imports.
from Basic import *
from nlp_cluster_lib import *
from biopax_lib import *
from gemini_lib import *

import warnings
# Suppresses every warning category for the rest of the session.
warnings.filterwarnings("ignore")

from IPython.display import display, HTML
# Override the JupyterLab CSS variable so the notebook uses the full page width.
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

email = "flalix@gmail.com"

# !pip3 install pyyaml
# Load the project configuration; every later cell reads its settings from dic_yml.
with open('config_medulloblastoma.yml', 'r') as file:
    dic_yml = yaml.safe_load(file)

# Echo the configuration for the record, but never the credentials: cell outputs
# are stored inside the .ipynb file and get shared/committed together with it.
_secret_keys = {'API_KEY'}
print({key: ('***' if key in _secret_keys else value) for key, value in dic_yml.items()})

In [None]:
# Unpack the YAML configuration into notebook-level variables.
root_chibe = dic_yml['root_chibe']
root_colab = dic_yml['root_colab']
root0 = dic_yml['root0']

project = dic_yml['project']
s_project = dic_yml['s_project']

gene_protein = dic_yml['gene_protein']
s_omics = dic_yml['s_omics']

has_age = dic_yml['has_age']
has_gender = dic_yml['has_gender']

want_normalized = dic_yml['want_normalized']

abs_lfc_cutoff_inf = dic_yml['abs_lfc_cutoff_inf']
s_pathw_enrichm_method = dic_yml['s_pathw_enrichm_method']
num_min_degs_for_ptw_enr = dic_yml['num_min_degs_for_ptw_enr']

tolerance_pathway_index = dic_yml['tolerance_pathway_index']
type_sat_ptw_index = dic_yml['type_sat_ptw_index']
saturation_lfc_index = dic_yml['saturation_lfc_index']
chosen_model_sampling = dic_yml['chosen_model_sampling']

case_list = dic_yml['case_list']

pval_pathway_cutoff = dic_yml['pval_pathway_cutoff']
fdr_pathway_cutoff = dic_yml['fdr_pathway_cutoff']
num_of_genes_cutoff = dic_yml['num_of_genes_cutoff']

run_list = dic_yml['run_list']
chosen_model_list = dic_yml['chosen_model_list']
i_dfp_list = dic_yml['i_dfp_list']

# exp_normalization is 'quantile_norm' when normalization is requested, else None;
# `normalization` is the matching string label ('not_normalized' when None).
exp_normalization='quantile_norm' if want_normalized else None
normalization='not_normalized' if exp_normalization is None else exp_normalization

cfg = Config(project, s_project, case_list, root0)

# Only the first case of the configured list is used in this notebook.
case = case_list[0]

# Placeholders set to -1; n_degs is overwritten by the call on the next line.
n_genes_annot_ptw, n_degs, n_degs_in_ptw, n_degs_not_in_ptw, degs_in_all_ratio = -1,-1,-1,-1,-1
# NOTE(review): the lookup is hard-coded to 'not_normalized' instead of using
# `normalization` — confirm this is intended when want_normalized is true.
abs_lfc_cutoff, fdr_lfc_cutoff, n_degs, n_degs_up, n_degs_dw = cfg.get_best_lfc_cutoff(case, 'not_normalized')


print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

In [None]:
# Build the Biopax object from the configuration values unpacked earlier.
bpx = Biopax(gene_protein, s_omics, project, s_project, root0,
             case_list, has_age, has_gender, clone_objects=False,
             exp_normalization=exp_normalization, geneset_num=0, 
             num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr, 
             tolerance_pathway_index=tolerance_pathway_index, 
             s_pathw_enrichm_method = s_pathw_enrichm_method)

case = case_list[0]

# Fixed defaults (abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05) registered for `normalization`.
# NOTE(review): presumably a fallback when no best cutoff is stored — confirm.
bpx.cfg.set_default_best_lfc_cutoff(normalization, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05)
# open_case returns four values; none of them is used again in the visible cells.
ret, degs, degs_ensembl, dfdegs = bpx.open_case(case, verbose=False)
print("\nEcho Parameters:")
bpx.echo_parameters()

In [None]:
##########################
# Flag forwarded to Gemini(...) in the cell that builds `gem`.
# NOTE(review): exact meaning is defined by the Gemini class — confirm there.
is_seldata=False
##########################

In [None]:
# Credentials and disease context come from the YAML configuration.
API_KEY = dic_yml['API_KEY']

disease = dic_yml['disease']
context_disease = dic_yml['context_disease']
n_sentences = dic_yml['n_sentences']
chosen_model_sampling = dic_yml['chosen_model_sampling']

##################
# Overrides the i_dfp_list value read from the configuration earlier.
i_dfp_list = [0]
##################
gem = Gemini( bpx=bpx, is_seldata=is_seldata, disease=disease, context_disease=context_disease, 
             API_KEY=API_KEY, n_sentences=n_sentences, root0=root0, 
             chosen_model_list=chosen_model_list, i_dfp_list=i_dfp_list, chosen_model_sampling=chosen_model_sampling)
print("\n")
print(gem.disease, gem.is_seldata)
print("Context:", context_disease)

In [None]:
# NOTE(review): this rebinds `root0`, which was read from the configuration and
# already passed to Config/Biopax/Gemini; re-running those cells after this one
# would use this path instead — confirm that is acceptable.
root0 = '../../colaboracoes/aparecida/'
# Directory holding the phenotype and expression tables loaded below.
root_data = os.path.join(root0, 'data')
root_data

In [None]:
# Phenotype table; later cells use its `group` and `accession` columns.
fname_pheno = "pheno_meduloblastoma.tsv"

# pdreadcsv is a project helper (not defined in this notebook) taking a file
# name and its directory.
dfp = pdreadcsv(fname_pheno, root_data)
dfp.head(3)

In [None]:
# Distinct group labels present in the phenotype table.
dfp.group.unique()

In [None]:
# Expression table (file name says log2, not normalized).
fname = 'medulloblastoma_expression_all_cols_log2_not_normalization.tsv'
filefull = os.path.join(root_data, fname)
print(filefull)

# Read the table, discard rows without a gene symbol and renumber the rows 0..n-1.
df_exp = pdreadcsv(fname, root_data)
df_exp = df_exp[~df_exp.symbol.isnull()]
df_exp.index = np.arange(len(df_exp))

print(len(df_exp))
# Transposed preview: one column per row of the table.
df_exp.head(4).T

In [None]:
# Every column of the expression table.
all_cols0 = df_exp.columns
all_cols0

In [None]:
# Drop the first three columns.
# NOTE(review): presumably these are annotation columns and the rest are samples — confirm.
all_cols = all_cols0[3:]
all_cols

In [None]:
# Group labels found in the phenotype table, in sorted order.
groups = sorted(dfp.group.unique())
groups

In [None]:
# Manual override of the group list derived from the phenotype table.
# NOTE(review): 'C4' is used here while later cells use 'G4' (cols_g4, dfg4) —
# confirm this label matches the values in dfp.group.
groups = ['C4', 'WNT']

In [None]:
# Sample accessions whose phenotype group equals the first entry of `groups`.
cols_g4 = dfp.loc[dfp.group == groups[0], 'accession'].to_list()
cols_g4

In [None]:
# Sample accessions whose phenotype group equals the second entry of `groups`.
cols_wnt = dfp.loc[dfp.group == groups[1], 'accession'].to_list()
cols_wnt

In [None]:
# Sample accessions labelled 'CTRL' in the phenotype table.
cols_ctr = dfp.loc[dfp.group == 'CTRL', 'accession'].to_list()
cols_ctr

In [None]:
# Plot labels used by the plotting cells below.
groups=['ctrl', 'G4', 'WNT']

# A group without samples yields a frame holding only the `symbol` column and,
# later, an empty box plot — report it here, where the cause is still visible.
for label, sample_cols in zip(groups, (cols_ctr, cols_g4, cols_wnt)):
    if not sample_cols:
        print(f"Warning: no samples found for group {label}")

# One expression table per group: the gene symbol plus that group's samples.
dfctrl = df_exp[['symbol'] + cols_ctr]
dfg4 = df_exp[['symbol'] + cols_g4]
dfwnt = df_exp[['symbol'] + cols_wnt]

len(dfctrl), len(dfg4), len(dfwnt)

In [None]:
# Preview of the control-sample expression table.
dfctrl.head(3)

In [None]:
# Preview of the G4 expression table.
dfg4.head(3)

In [None]:
# Preview of the WNT expression table.
dfwnt.head(3)

In [None]:
# Group labels currently in use.
groups

In [None]:
# Box plot of one gene's expression across the sample groups.
# `symbol` has to be assigned before the title that reuses it; otherwise the
# cell fails with NameError on a fresh kernel.
symbol='CACNB2'

title=symbol
width=1100
height=800
z_title='gene'

fig = go.Figure()

for group in groups:

    if group == 'ctrl':
        dfgroup, name = dfctrl, 'control'
    elif group == 'G4':
        dfgroup, name = dfg4, 'G4'
    elif group == 'WNT':
        dfgroup, name = dfwnt, 'WNT'
    else:
        # Unknown label: skip it instead of re-plotting the previous group's data.
        print(f"Unknown group {group}: skipped")
        continue

    # Rows of the requested gene, without the leading `symbol` column.
    dfa = dfgroup[dfgroup.symbol==symbol].iloc[:,1:]
    if dfa.empty:
        print(f"Could not find {symbol} in {group}")
        continue

    # A symbol may occur in several rows; average them so that each sample
    # contributes a single value to the box.
    df3 = dfa.T.mean(axis=1)
    fig.add_trace(go.Box(y=df3, name=name))


# NOTE(review): `colorscale` is not defined in the visible cells; presumably it
# is provided by one of the star imports — confirm.
fig.update_layout(
    width=width, 
    height=height,
    title=title, 
    title_x=0.5,
    coloraxis=dict(
        colorscale=colorscale,
        colorbar=dict(
            title=dict(
                text=z_title,
                side='right'),
            xanchor='right', x=1.0,
            xpad=0,
            ticks='inside')),
    legend=dict(
        yanchor='top',
        xanchor='right',
        bgcolor='rgba(0, 0, 0, 0)',
        itemclick=False,
        itemdoubleclick=False),
    showlegend=True)
        
fig.show()

In [None]:
# NOTE(review): this `verbose` variable is not used by the call below, which
# passes the literal verbose=True — confirm which one is intended.
verbose=False

run='run01'

case = case_list[0]

dfg = gem.group_discovery_fp_fn(run=run, case=case, chosen_model_list=chosen_model_list, 
                                force=False, verbose=True)
print(len(dfg))
# Keep only the rows whose group is 'FN'.
df_fn = dfg[dfg.group == 'FN']
df_fn

In [None]:
# Re-open the case on the Biopax object held by `gem`; only the first of the
# four returned values is kept.
ret, _, _, _ = gem.bpx.open_case(case)
# Table stored on the Biopax object as `dflfc`.
dflfc = gem.bpx.dflfc

dflfc.head(3)

In [None]:
# Table stored on the Biopax object as `df_enr`; filtered by pathway_id below.
df_enr = gem.bpx.df_enr
df_enr.head(3)

In [None]:
# Choose the i-th row of df_fn and keep its pathway identifier and name.
i = 0
fn_row = df_fn.iloc[i]
pathway_id, pathway = fn_row.pathway_id, fn_row.pathway

pathway_id, pathway

In [None]:
# Rows of df_enr for the selected pathway.
dfe = df_enr[df_enr.pathway_id == pathway_id]
dfe

In [None]:
# `genes` value of the first row selected for the pathway.
lista = dfe.iloc[0].genes
# The value may arrive as a string; it is evaluated back into a Python object.
# NOTE(review): eval() executes arbitrary code — if this table can come from an
# untrusted file, prefer ast.literal_eval.
if isinstance(lista, str):
    lista = eval(lista)

print(len(lista))
lista

In [None]:
def boxplot_symbols_per_case(case, pathway_id, pathway, symbol, width=1100, height=800, z_title='gene expression'):
    """Build a box plot of one symbol's expression, one box per sample group.

    Reads the notebook-level names `groups`, `dfctrl`, `dfg4`, `dfwnt`, `gem`
    and `colorscale`; they must exist before the function is called.

    Args:
        case: case label, shown in the title.
        pathway_id: pathway identifier, shown in the title.
        pathway: pathway name, shown in the title.
        symbol: value looked up in the `symbol` column of each group table.
        width: figure width passed to the layout.
        height: figure height passed to the layout.
        z_title: text of the colorbar title.

    Returns:
        The plotly Figure, or None when the symbol is absent from every group.
    """
    title=f"{pathway} ({pathway_id})<br>case {case} - {gem.bpx.s_gene_protein}: {symbol}"

    fig = go.Figure()
    found_data = False
    for group in groups:

        if group == 'ctrl':
            dfgroup, name = dfctrl, 'control'
        elif group == 'G4':
            dfgroup, name = dfg4, 'G4'
        elif group == 'WNT':
            dfgroup, name = dfwnt, 'WNT'
        else:
            # Unknown label: skip it rather than reuse the previous group's
            # table (or fail on an unbound name in the first iteration).
            print(f"Unknown group {group}: skipped")
            continue

        # Rows of the requested symbol, without the leading `symbol` column.
        dfa = dfgroup[dfgroup.symbol==symbol].iloc[:,1:]

        if dfa.empty:
            print(f"Could not find {symbol} in {group}")
            continue

        found_data = True

        # repeated probes: average the rows so each sample yields one value
        mean_exp = dfa.T.mean(axis=1)
        fig.add_trace(go.Box(y=mean_exp, jitter=0.3, pointpos=0, boxpoints='all',  name=name))  #  marker_color='black'

    if not found_data: return None

    fig.update_layout(
        width=width, 
        height=height,
        title=title, 
        title_x=0.5,
        coloraxis=dict(
            colorscale=colorscale,
            colorbar=dict(
                title=dict(
                    text=z_title,
                    side='right'),
                xanchor='right', x=1.0,
                xpad=0,
                ticks='inside')),
        legend=dict(
            yanchor='top',
            xanchor='right',
            bgcolor='rgba(0, 0, 0, 0)',
            itemclick=False,
            itemdoubleclick=False),
        showlegend=True)

    return fig

In [None]:
# Gene list of the selected pathway.
lista

In [None]:
# One box plot per entry of the gene list; the plotting function returns None
# when it has nothing to draw, and those entries are skipped.
i = 0
for i, symbol in enumerate(lista, start=1):
    fig = boxplot_symbols_per_case(
        case, pathway_id, pathway, symbol,
        width=1300, height=800, z_title='gene expression',
    )
    if fig:
        fig.show()

In [None]:
# Current value of `symbol` (after the plotting loop: the last entry iterated).
symbol

In [None]:
# Expression rows whose symbol equals the current `symbol`.
df_exp[df_exp.symbol==symbol]

In [None]:
# Report, for each gene of the pathway, whether it occurs in the expression table.
# `x in series` tests the Series *index* (here the integers 0..n-1), not its
# values, so membership has to be checked against the values themselves.
known_symbols = set(df_exp.symbol)
for symbol in lista:
    ret = symbol in known_symbols
    print(symbol, ret)