In [80]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from topicpy.geneontology import get_symbol
import plotly.graph_objects as go
import logging
log = logging.getLogger("go_expression")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [48]:
# Load the table stored at ../mouse_Atlas/A_gtex.dat, using its first column as the index.
# NOTE(review): df_A is not referenced by any other cell visible here — confirm it is still needed.
df_A = pd.read_csv("../mouse_Atlas/A_gtex.dat", index_col = 0)

In [None]:
#GTEx
# Stream the GTEx gene-TPM table in 1000-row chunks (the first two lines of the
# .gct file are skipped) so the full samples matrix is never held in memory at once.
df_generator = pd.read_csv('../GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct', skiprows=2, sep='\t', chunksize=1000)

log.info("read generator")

# Per-chunk (ensg, abundance) frames. They are concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame per chunk.
chunk_frames = []

# NOTE(review): this message is emitted before any chunk has actually been read.
log.info("first chunk OK")

for subdf in df_generator:
    # Keep the first 15 characters of "Name"
    # (presumably the Ensembl gene id without its version suffix — confirm).
    ensg = subdf["Name"].str[:15]
    # Sum across the remaining columns only; the identifier columns are dropped
    # first and numeric_only guards against any other non-numeric column, so
    # strings are never added to floats.
    abundance = subdf.drop(columns=["Name", "Description"]).sum(axis=1, numeric_only=True)
    chunk_frames.append(pd.DataFrame({"ensg": ensg, "abundance": abundance}))
    log.debug("new chunk")

# A: one row per gene, indexed by "ensg", with a single "abundance" column.
if chunk_frames:
    A = pd.concat(chunk_frames)
else:
    # No data rows in the file: keep the empty two-column layout.
    A = pd.DataFrame(columns=["ensg", "abundance"])
A = A.set_index("ensg")

A.head(2)

In [None]:
# Parse the GMT file into df_go: one column per gene set (named by the first
# tab-separated field), holding the set's members (fields from the third onward)
# and padded with NaN up to the length of the longest set.
gene_sets = {}
with open("../MSigDB/c5.all.v7.1.symbols.gmt", "r") as gmt_file:
    for line in gmt_file:
        line = line.rstrip("\r\n")
        if not line:
            # Skip blank lines instead of abandoning the rest of the file.
            continue
        data = line.split("\t")
        set_name = data[0]
        if set_name in gene_sets:
            # Inserting a duplicate column name was already an error; keep it loud.
            raise ValueError(f"cannot insert {set_name}, already exists")
        gene_sets[set_name] = pd.Series(data[2:], name=set_name, dtype=object)

# Building the frame in one go aligns on the union of the Series indexes.
# Inserting columns one at a time into an empty frame fixed the index to the
# length of the first gene set and silently truncated every longer set.
df_go = pd.DataFrame(gene_sets)
# Each set used to be inserted at position 0, so columns ended up in reverse
# file order; preserve that ordering for downstream cells.
df_go = df_go.iloc[:, ::-1]
df_go.head(2)

In [None]:
# Download a tab-separated custom export from genenames.org. The query string requests
# the HGNC id, approved symbol, RefSeq ids, Ensembl id and the mapped Entrez / Ensembl
# ids, for entries with status "Approved" or "Entry Withdrawn".
# NOTE(review): this cell needs network access, and df_conversion is not used by any
# other cell visible here — confirm it is still required.
df_conversion = pd.read_csv("https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_pub_refseq_ids&col=gd_pub_ensembl_id&col=md_eg_id&col=md_ensembl_id&status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit", sep="\t")
df_conversion.head(2)

In [None]:
# Attach a "Name" to every row by passing each index value of A through get_symbol.
A["Name"] = [get_symbol(ensg) for ensg in A.index]
# Normalise the abundances so that the "frequency" column sums to 1.
A["frequency"] = A["abundance"].div(A["abundance"].sum())

In [None]:
# Column labels of df_go; the later cells pass each label to get_expression.
gos = df_go.columns

In [None]:
def get_gene_abundances(genes: list)->np.array:
    """Return the "frequency" values of the rows of A whose "Name" is in `genes`.

    Parameters
    ----------
    genes : list-like
        Names to look up in the "Name" column of the module-level frame A.

    Returns
    -------
    numpy array with one frequency per matching row of A (empty if none match).
    """
    in_gene_list = A["Name"].isin(genes)
    return A.loc[in_gene_list, "frequency"].values

def get_genes_in_go(go:str)->list:
    """Return the non-missing entries of column `go` of the module-level df_go.

    Note that the result is the pandas object produced by `dropna()`,
    not a plain Python list.
    """
    members = df_go[go]
    return members.dropna()

def get_average_expression(go:str, thr = 25)->"tuple[float, float] | float":
    """Mean and standard deviation of the frequencies of the genes in column `go`.

    Parameters
    ----------
    go : str
        Column label of df_go identifying the gene set.
    thr : int, default 25
        The set must have strictly more than `thr` genes found in A.

    Returns
    -------
    (mean, std) of the matching frequencies when more than `thr` genes match;
    otherwise the scalar np.nan. Callers must handle both shapes.
    """
    # Use a distinct local name so the module-level DataFrame `A` is not shadowed.
    abundances = get_gene_abundances(get_genes_in_go(go))
    if len(abundances) > thr:
        return abundances.mean(), abundances.std()
    else:
        return np.nan
    
def get_expression(go:str, thr = 25)->"tuple[str, np.ndarray] | None":
    """Return `(go, frequencies)` for a sufficiently large gene set, else None.

    Parameters
    ----------
    go : str
        Column label of df_go identifying the gene set.
    thr : int, default 25
        The set must have strictly more than `thr` genes found in A.

    Returns
    -------
    A `(go, array_of_frequencies)` tuple when more than `thr` genes match,
    otherwise None.
    """
    # Look the abundances up once and reuse them for both the size check and
    # the return value (previously the lookup was repeated).
    abundances = get_gene_abundances(get_genes_in_go(go))
    if len(abundances) > thr:
        return go, abundances
    else:
        return None
    
def get_box(name:str, data: np.array)->go.Box:
    """Build a plotly Box trace named `name` with `data` as its y values."""
    box = go.Box(name=name, y=data)
    return box

In [None]:
# One Box trace per entry of gos for which get_expression returns a result;
# entries yielding None (too few matching genes) are skipped.
boxes = [
    get_box(*named_data)
    for named_data in map(get_expression, gos)
    if named_data is not None
]

In [None]:
# Draw one box per selected gene set, with frequencies on a logarithmic y axis.
fig = go.Figure()

fig.add_traces(boxes)

layout = {
    "yaxis": {
        # `titlefont` is deprecated in favour of `title.font`;
        # the nested form is accepted by plotly.graph_objects.
        "title": {
            "text": "frequency",
            "font": {
                "size": 20
            }
        },
        "type": "log",
        "exponentformat": "e"
    },
    "xaxis": {
        "tickfont": {
            "size": 10,
        },
        # Near-vertical tick labels so the long gene-set names do not overlap.
        "tickangle": 85
    },
    "showlegend": False
}

fig.update_layout(layout)

fig.show()