In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import metrics
import matplotlib.ticker as mticker
from mpl_finance import candlestick2_ohlc

In [None]:
# Root folder of the hSBM run analysed in this notebook: topic tables are read
# from "<directory>/topsbm/", the background gene list from
# "<directory>/background.txt", and the figures/CSVs produced below are saved here.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
#directory = '/Users/filippo/Developer/tesi/results/hSBM/highlyvariable_7tissues'
directory = '/Users/filippo/Developer/tesi/gtex/hsbm/oversigma_5tissue'

In [None]:
# Deepest hierarchy level to process: the loops below iterate over levels 0..L inclusive.
L = 3

# Topic size

In [None]:
def plot_topic_size(directory,l):
    """Plot and save the histogram of topic sizes at one hierarchy level.

    Reads ``<directory>/topsbm/topsbm_level_<l>_topics.csv``, where every column
    is a topic and the size of a topic is the number of non-null cells in its
    column. The histogram is shown and then saved to
    ``<directory>/topic_size_level<l>.png``.

    Parameters
    ----------
    directory : str
        Folder containing the ``topsbm`` sub-folder; the figure is saved here.
    l : int
        Hierarchy level whose topic table is read.

    Returns
    -------
    None
    """
    df_topics = pd.read_csv("%s/topsbm/topsbm_level_%d_topics.csv"%(directory,l))
    # count() gives the number of non-null cells of each column, i.e. the topic sizes.
    sizes = df_topics.count().values
    largest = np.max(sizes)
    # About one bin edge every 30 genes, but never fewer than 2 edges: with small
    # topics int((largest+1)/30) is 0 or 1, which does not define a single bin.
    n_edges = max(2, int((largest+1)/30))
    bins = np.linspace(0.5, largest+0.5, n_edges)
    fig=plt.figure()
    plt.title("[%d topics, level: %d]"%(len(df_topics.columns),l))
    plt.hist(sizes, histtype='step', lw=2, bins=bins)
    plt.xlabel("topic size (# genes)", fontsize=16)
    plt.ylabel("# topic of that size", fontsize=16)
    plt.show()
    fig.savefig("%s/topic_size_level%d.png"%(directory,l))
    # Release the figure: this function is called once per level in a loop.
    plt.close(fig)

In [None]:
# Draw and save the topic-size histogram of every hierarchy level 0..L.
# NOTE: the loop variable `l` stays bound at module scope after the loop and
# other cells of this notebook read a module-level `l`; do not rename it in isolation.
for l in range(L+1):
    plot_topic_size(directory,l)

## Topic occurrence ($O_i$)

In [None]:
# Table read from meanVariances.csv, indexed by its first column.
df_mv = pd.read_csv("meanVariances.csv", index_col=[0])
# Occurrence values read from O.dat (no header row).
# NOTE(review): presumably one value per row of meanVariances.csv, in the same order -- confirm.
df_o = pd.read_csv("O.dat", header=None)
# Flatten the 2-D values so a plain 1-D column is inserted at position 3;
# insert() still raises ValueError if the number of values does not match df_mv.
df_mv.insert(3, 'occurrence', df_o.values.ravel())
df_mv.head()

In [None]:
def get_candles(directory, level, ax):
    """Summarise gene occurrences per topic as candlestick data and set up the axis.

    For every topic (column) of
    ``<directory>/topsbm/topsbm_level_<level>_topics.csv`` the ``'occurrence'``
    values of its genes are looked up in the module-level ``df_mv`` table and
    reduced to four numbers:

    * ``high``  -- mean + standard deviation, capped at 1
    * ``open``  -- 0.75 quantile, capped at 1
    * ``close`` -- 0.25 quantile, floored at 0
    * ``low``   -- mean - standard deviation, floored at 0

    The title, y label, x limits and x ticks of ``ax`` are configured as a side
    effect; only every tick whose 1-based number is a multiple of 5 (plus the
    first one) gets a "Topic N" label.

    Parameters
    ----------
    directory : str
        Folder containing the ``topsbm`` sub-folder.
    level : int
        Hierarchy level whose topic table is read; also shown in the title.
    ax : matplotlib axes
        Axes to configure (``set_title``, ``set_ylabel``, ``set_xlim``,
        ``set_xticks`` and ``set_xticklabels`` are called on it).

    Returns
    -------
    dict
        Keys ``'open'``, ``'high'``, ``'low'``, ``'close'``; each value is a list
        with one entry per topic, in column order.
    """
    df_topics = pd.read_csv("%s/topsbm/topsbm_level_%d_topics.csv"%(directory,level))
    n_topics = len(df_topics.columns)
    candles = {
        'open': [],
        'high': [],
        'low': [],
        'close':[]
    }
    for topic in df_topics.columns:
        # Genes of this topic are the non-null cells of its column; they are used
        # as row labels of df_mv.
        genes = df_topics[topic].dropna()
        subarr = df_mv.loc[genes, 'occurrence'].values
        avg = np.average(subarr)
        std = np.std(subarr)
        q_low, q_high = np.quantile(subarr,[0.25,0.75])
        candles['high'].append(np.min([1,avg+std]))
        candles['open'].append(np.min([q_high,1]))
        candles['close'].append(np.max([q_low,0]))
        candles['low'].append(np.max([0,avg-std]))
    # Use the `level` argument here, not a loop variable from the calling cell.
    ax.set_title("[level: %d]"%(level))
    ax.set_ylabel('$O_i$', fontsize=18)
    ax.set_xlim(-1,n_topics)
    ticks = list(range(n_topics+1))
    ax.set_xticks(ticks)
    ax.set_xticklabels(["Topic %d"%(t+1) if ((t+1)%5==0 or t==0) else '' for t in ticks],  rotation=60)
    return candles

In [None]:
# One candlestick figure per hierarchy level (0..L), shown and then saved as PDF.
# The names bound here (`l`, `fig`, `ax`, `candles`) are kept as they are because
# they live at module scope and other cells may read them.
for l in range(L+1):
    fig, ax = plt.subplots(1, figsize=(10,10))
    candles = get_candles(directory,l,ax)
    candlestick2_ohlc(
        ax,
        candles['open'],
        candles['high'],
        candles['low'],
        candles['close'],
        width=0.6,
        colordown='b',
    )
    plt.show()
    fig.savefig("%s/topic_Ocandles_level_%d.pdf"%(directory,l))

# Gene Ontology enrichment

In [None]:
from geneontology import get_ontology_df, ensg_to_symbol
from tableanalyser import get_symbol

In [None]:
#import gseapy as gs
#gs.get_library_name()

In [None]:
# Background list passed to the enrichment below: get_symbol() is applied to the
# first column of every row of <directory>/background.txt (read without a header).
# NOTE(review): presumably the file holds one gene id per line -- confirm.
back_sea=[]
df_world = pd.read_csv("%s/background.txt"%directory, header=None)
for g in df_world.values:
    try:
        back_sea.append(get_symbol(g[0]))
    except Exception as err:
        # Best effort: a gene that cannot be converted is reported and skipped.
        # `Exception` (not a bare except) lets Ctrl-C / kernel interrupts through.
        print("Error %s (%s)"%(g, err))

In [None]:
# Load the topic table explicitly: the plotting helpers above only bind
# `df_topics` as a local variable, so without this line a fresh kernel has no
# module-level `df_topics`. Level `l` is used because the enrichment loop further
# down stamps `l` onto the names of the files it derives from this table.
# NOTE(review): confirm that `l` (== L after the loops above) is the intended level.
df_topics = pd.read_csv("%s/topsbm/topsbm_level_%d_topics.csv"%(directory,l))

# get_symbol() applied to the genes of the third topic column (position 2).
ontology = []
for g in df_topics[df_topics.columns[2]].dropna():
    try:
        ontology.append(get_symbol(g))
    except Exception:
        # Best effort: print the gene that could not be converted and move on.
        # `Exception` (not a bare except) lets Ctrl-C / kernel interrupts through.
        print(g)

In [None]:
# Smallest adjusted p-value of each topic, appended by the enrichment loop in the next cell.
# NOTE: initialised in its own cell, so re-running only the loop cell appends to the existing list.
topic_pvalues = []

In [None]:
# Enrichment of every topic (column) of `df_topics` against the `back_sea`
# background. For each topic the 20 terms with the smallest adjusted p-value are
# written to <directory>/gsea_level_<l>_topic<topic>(<n symbols>).csv and the
# smallest adjusted p-value is appended to `topic_pvalues`.
# Reads the module-level `df_topics`, `back_sea`, `directory`, `topic_pvalues` and `l`.
for topic in df_topics.columns:
    try:
        symbols = ensg_to_symbol(df_topics.loc[:,topic].dropna().values)
        print(topic, " - " ,len(symbols))
        enriched_topic = get_ontology_df(symbols, background=back_sea).sort_values(by=['Adjusted P-value'], ascending=True)
        # Positional top 20; unlike a label-based .loc this cannot return extra
        # rows when the index contains repeated labels.
        enriched_topic = enriched_topic.head(20)
        enriched_topic.to_csv("%s/gsea_level_%d_topic%s(%d).csv"%(directory,l,topic,len(symbols)))
        print(enriched_topic)
        if len(enriched_topic) > 0:
            topic_pvalues.append(enriched_topic['Adjusted P-value'].iloc[0])
        else:
            print("No enriched terms for topic %s"%topic)
    except Exception as err:
        # Best effort: keep processing the remaining topics, but report the
        # failure instead of silently dropping the topic. `Exception` (not a bare
        # except) lets Ctrl-C / kernel interrupts through.
        print("Skipping topic %s: %r"%(topic, err))