In [1]:
import pandas as pd
import scanpy as sc
import h5py
import CococoNet_reader
import numpy as np
import anndata
import pickle
import Go_annotations
import seaborn as sns
import matplotlib.pyplot as plt
# Scanpy logging level 3 and default figure appearance for the session.
sc.settings.verbosity = 3
sc.set_figure_params(facecolor='white', figsize=(10, 8))

In [2]:
import numpy as np
import pandas as pd
from scipy import stats, sparse
import bottleneck


def run_egad(go, nw, **kwargs):
    """EGAD running function

    Wrapper to lower level functions for EGAD

    EGAD measures modularity of gene lists in co-expression networks.

    This was translated from the MATLAB version, which does tiled Cross Validation

    The useful kwargs (forwarded unchanged to _runNV) are:
    int - nFold : Number of CV folds to do, default is 3,
    int - {min,max}_count : limits for the number of genes annotated to each
        term; these are exclusive values (defaults 20 and 1000)

    Genes whose network row is entirely NaN are dropped from both axes, the
    remaining values are cast to float and the diagonal is set to 1 before the
    network is handed on. The caller's dataframe is never modified.

    Arguments:
        go {pd.DataFrame} -- dataframe of genes x terms of values [0,1], where 1 is included in gene lists
        nw {pd.DataFrame} -- dataframe of co-expression network, genes x genes
        **kwargs

    Returns:
        pd.DataFrame -- dataframe of terms x metrics where the metrics are
        ['AUC', 'AVG_NODE_DEGREE', 'DEGREE_NULL_AUC', 'P_Value']

    Raises:
        AssertionError -- if the network is not square or its index and
        columns are not in the same order
    """
    assert nw.shape[0] == nw.shape[1] , 'Network is not square'
    assert np.all(nw.index == nw.columns) , 'Network index and columns are not in the same order'

    # Plain boolean array (not a Series) so .loc selects by position and does
    # not depend on label alignment.
    keep = ~nw.isna().all(axis=1).to_numpy()

    # Work on an owned, writable float array: DataFrame.values can be a
    # read-only view (pandas Copy-on-Write), which makes an in-place
    # np.fill_diagonal on it fail.
    values = nw.loc[keep, keep].to_numpy(dtype=float, copy=True)
    np.fill_diagonal(values, 1)

    # copy=False: `values` is already private to this function.
    nw = pd.DataFrame(values,
                      index=nw.index[keep],
                      columns=nw.columns[keep],
                      copy=False)
    return _runNV(go, nw, **kwargs)


def _runNV(go, nw, nFold=3, min_count=20, max_count=1000):
    """Align annotations with the network, filter terms by size and run EGAD.

    Steps: restrict both inputs to the genes they share, drop repeated gene
    labels (first occurrence wins), keep only terms whose gene count lies
    strictly between min_count and max_count, then call _new_egad on the
    underlying arrays.

    Arguments:
        go {pd.DataFrame} -- genes x terms membership matrix
        nw {pd.DataFrame} -- genes x genes network; the duplicate mask is
            computed from the index and applied to both axes, so rows and
            columns are expected to carry the same labels in the same order

    Keyword Arguments:
        nFold {int} -- number of cross-validation folds (default: {3})
        min_count {int} -- exclusive lower bound on genes per term (default: {20})
        max_count {int} -- exclusive upper bound on genes per term (default: {1000})

    Returns:
        pd.DataFrame -- terms x ['AUC', 'AVG_NODE_DEGREE', 'DEGREE_NULL_AUC', 'P_Value']
    """

    #Make sure genes are same in go and nw
    shared_genes = go.index.intersection(nw.index)

    go = go.loc[shared_genes, :]
    nw = nw.loc[shared_genes, shared_genes]

    #Make sure there aren't duplicates
    nw_repeats = nw.index.duplicated(keep='first')
    nw = nw.loc[~nw_repeats, ~nw_repeats]
    go = go.loc[~go.index.duplicated(keep='first'), :]

    # Size filter runs AFTER de-duplication so that a repeated gene row cannot
    # inflate a term's count past the thresholds.
    term_sizes = go.sum(axis=0)
    go = go.loc[:, (term_sizes > min_count) & (term_sizes < max_count)]

    roc = _new_egad(go.values, nw.values, nFold)

    col_names = ['AUC', 'AVG_NODE_DEGREE', 'DEGREE_NULL_AUC', 'P_Value']
    #Put output in dataframe
    return pd.DataFrame(dict(zip(col_names, roc)), index=go.columns)


def _new_egad(go, nw, nFold):
    """Neighbor-voting EGAD on plain arrays.

    For every fold a strided subset of the annotations (every nFold-th nonzero
    entry of `go`) is hidden; each gene is scored by the degree-normalised
    sum of its network weights to the remaining annotated genes, and the
    hidden annotations are evaluated with a rank-based AUC. Per-fold AUCs are
    averaged, per-fold Z statistics are combined with Stouffer's method.

    Arguments:
        go {np.ndarray} -- genes x terms membership matrix (nonzero = annotated)
        nw {np.ndarray} -- genes x genes network, same gene order as `go`
        nFold {int} -- number of cross-validation folds

    Returns:
        tuple -- (roc, avg_degree, roc_null, P), each with one entry per term:
        mean cross-validated AUC, mean node degree of the term's genes, AUC
        obtained by ranking genes on node degree alone, and the P value of
        the combined Z statistic
    """
    n_terms = go.shape[1]

    #Build Cross validated Positive
    pos_rows, pos_cols = np.where(go)
    folds = []
    for i in range(nFold):
        # Fold i hides every nFold-th annotation, starting at offset i.
        held_out = np.zeros(go.shape, dtype=pos_rows.dtype)
        held_out[pos_rows[i::nFold], pos_cols[i::nFold]] = 1
        folds.append(go - held_out)

    # One column block per fold: genes x (nFold * terms)
    CVgo = np.concatenate(folds, axis=1)

    sumin = np.matmul(nw.T, CVgo)

    degree = np.sum(nw, axis=0)

    predicts = sumin / degree[:, None]

    # Training annotations are not scored
    predicts[CVgo > 0] = np.nan

    #Calculate ranks of positives
    # NaNs are excluded from the ranking and stay NaN (how tiedrank works in
    # matlab). Done explicitly because stats.rankdata returns all-NaN ranks
    # for NaN-containing input under its default nan_policy in recent SciPy.
    predicts2 = np.apply_along_axis(_rank_abs_ignoring_nan, 0, predicts)

    filtering = np.tile(go, nFold)

    #negatives :filtering == 0
    #Sets Ranks of negatives to 0
    predicts2[filtering == 0] = 0

    #Sum of ranks for each prediction
    p = bottleneck.nansum(predicts2, axis=0)

    #Number of predictions
    #Number of 1's masked for each GO term for each CV
    n_p = np.sum(filtering, axis=0) - np.sum(CVgo, axis=0)

    #Number of negatives
    #Number of genes - number of positives
    n_n = filtering.shape[0] - np.sum(filtering, axis=0)

    roc = (p / n_p - (n_p + 1) / 2) / n_n
    U = roc * n_p * n_n
    # abs() makes the statistic direction-agnostic: an AUC far below 0.5
    # scores the same as one equally far above it.
    Z = (np.abs(U - (n_p * n_n / 2))) / np.sqrt(n_p * n_n *
                                                (n_p + n_n + 1) / 12)
    roc = roc.reshape(nFold, n_terms)
    Z = Z.reshape(nFold, n_terms)
    #Stouffer Z method
    Z = bottleneck.nansum(Z, axis=0) / np.sqrt(nFold)
    #Calc ROC of Neighbor Voting
    roc = bottleneck.nanmean(roc, axis=0)
    P = stats.norm.sf(Z)

    #Average degree for nodes in each go term
    avg_degree = degree.dot(go) / np.sum(go, axis=0)

    #Calc null auc for degree
    ranks = np.tile(stats.rankdata(degree), (n_terms, 1)).T

    ranks[go == 0] = 0

    n_p = bottleneck.nansum(go, axis=0)
    nn = go.shape[0] - n_p
    p = bottleneck.nansum(ranks, axis=0)

    roc_null = (p / n_p - ((n_p + 1) / 2)) / nn

    return roc, avg_degree, roc_null, P


def _rank_abs_ignoring_nan(column):
    """Average-rank the absolute values of a 1-D array, skipping NaNs.

    NaN positions keep NaN in the output; the remaining entries are ranked
    among themselves only.
    """
    ranks = np.full(column.shape, np.nan)
    valid = ~np.isnan(column)
    ranks[valid] = stats.rankdata(np.abs(column[valid]))
    return ranks


In [3]:
# Inputs for EGAD: the two networks to compare and the genes x terms GO table.
arabi_net_both_trimmed = pd.read_hdf(
    '/data/passala/Generated_Tables/Temp_junk/arabi_network_trimmed_to_sc_net.h5'
)
corr_results_both_trimmed = pd.read_hdf(
    '/data/passala/Generated_Tables/Temp_junk/sc_net_trimmed.h5'
)
wide_go = pd.read_hdf(
    '/data/passala/Generated_Tables/Temp_junk/wide_go_for_egad.h5'
)

In [4]:
# Preview the network loaded from arabi_network_trimmed_to_sc_net.h5.
arabi_net_both_trimmed

Unnamed: 0,AT1G01020,AT1G01030,AT1G01040,AT1G01050,AT1G01060,AT1G01070,AT1G01080,AT1G01090,AT1G01100,AT1G01110,...,ATCG01050,ATCG01060,ATCG01070,ATCG01080,ATCG01090,ATCG01100,ATCG01110,ATCG01120,ATCG01130,ATCG01310
AT1G01020,1.000000,0.676041,0.885107,0.916603,0.167027,0.625596,0.801175,0.866369,0.954286,0.715750,...,0.266479,0.190439,0.111731,0.193853,0.261697,0.299060,0.321617,0.287853,0.559907,0.091015
AT1G01030,0.676041,1.000000,0.707718,0.515532,0.420354,0.292083,0.687003,0.626281,0.483536,0.570675,...,0.365060,0.263471,0.245167,0.215414,0.255414,0.306986,0.308402,0.270118,0.463394,0.108780
AT1G01040,0.885107,0.707718,1.000000,0.836387,0.545101,0.701558,0.656258,0.857046,0.657204,0.650843,...,0.302657,0.170802,0.155183,0.230038,0.149740,0.334558,0.287951,0.186425,0.219380,0.057761
AT1G01050,0.916603,0.515532,0.836387,1.000000,0.562112,0.726964,0.837948,0.928252,0.952045,0.640223,...,0.173902,0.216276,0.089440,0.177388,0.164935,0.236661,0.251851,0.213062,0.412583,0.086736
AT1G01060,0.167027,0.420354,0.545101,0.562112,1.000000,0.327202,0.242215,0.304953,0.217415,0.077569,...,0.097794,0.114594,0.082518,0.125455,0.103488,0.187982,0.195059,0.125488,0.212731,0.044208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ATCG01100,0.299060,0.306986,0.334558,0.236661,0.187982,0.085623,0.439777,0.367577,0.094098,0.126890,...,0.998698,0.988851,0.992374,0.997454,0.991884,1.000000,0.996139,0.941864,0.958553,0.496910
ATCG01110,0.321617,0.308402,0.287951,0.251851,0.195059,0.060340,0.357794,0.387466,0.158726,0.163126,...,0.987717,0.944675,0.958313,0.985484,0.972589,0.996139,1.000000,0.954421,0.960399,0.453522
ATCG01120,0.287853,0.270118,0.186425,0.213062,0.125488,0.083681,0.364707,0.333718,0.237716,0.149145,...,0.905485,0.790083,0.858389,0.905572,0.926062,0.941864,0.954421,1.000000,0.969018,0.410586
ATCG01130,0.559907,0.463394,0.219380,0.412583,0.212731,0.077222,0.625678,0.496059,0.428496,0.226147,...,0.962779,0.820623,0.881835,0.936241,0.946935,0.958553,0.960399,0.969018,1.000000,0.483002


In [5]:
# EGAD on the network loaded into arabi_net_both_trimmed; terms must have
# more than 20 and fewer than 2000 genes.
arabi_egad_results = run_egad(
    go=wide_go,
    nw=arabi_net_both_trimmed,
    min_count=20,
    max_count=2000,
)

In [6]:
# EGAD on the network loaded into corr_results_both_trimmed, with the same
# GO table and term-size limits as the other run.
sc_egad_results = run_egad(
    go=wide_go,
    nw=corr_results_both_trimmed,
    min_count=20,
    max_count=2000,
)

In [7]:
# Per-term EGAD metrics for the arabi_net_both_trimmed network.
arabi_egad_results

Unnamed: 0_level_0,AUC,AVG_NODE_DEGREE,DEGREE_NULL_AUC,P_Value
GO Annotation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GO:0000027,0.896899,12050.224959,0.641293,9.365685e-11
GO:0000028,0.903498,11822.423110,0.598167,5.753969e-12
GO:0000045,0.678646,12260.321920,0.678162,1.435415e-03
GO:0000049,0.751929,13282.197639,0.797360,7.513195e-06
GO:0000079,0.860451,10646.541130,0.481282,1.461594e-15
...,...,...,...,...
GO:2000067,0.796112,8740.239787,0.375776,2.965461e-06
GO:2000241,0.702638,11249.446617,0.586299,1.675248e-22
GO:2000280,0.771055,8066.674153,0.318631,2.728419e-10
GO:2000377,0.625962,10165.610523,0.491693,6.558963e-03


In [8]:
# Per-term EGAD metrics for the corr_results_both_trimmed network.
sc_egad_results

Unnamed: 0_level_0,AUC,AVG_NODE_DEGREE,DEGREE_NULL_AUC,P_Value
GO Annotation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GO:0000027,0.813306,5795.698235,0.720498,1.034342e-07
GO:0000028,0.837935,6133.206230,0.757508,4.656483e-09
GO:0000045,0.651447,5055.686971,0.617166,5.419283e-03
GO:0000049,0.706912,5981.509329,0.732277,7.983105e-05
GO:0000079,0.721633,3950.357975,0.449376,1.512204e-06
...,...,...,...,...
GO:2000067,0.512467,3162.537849,0.349426,1.054035e-01
GO:2000241,0.637667,4525.511559,0.533330,2.005567e-11
GO:2000280,0.537624,3159.971527,0.351469,5.710320e-02
GO:2000377,0.434955,3982.961364,0.464269,9.245144e-02


In [9]:
# Persist both result tables as CSV (the GO term index is written as the first column).
arabi_egad_results.to_csv('/data/passala/Generated_Tables/Temp_junk/arabi_egad_results.csv')
sc_egad_results.to_csv('/data/passala/Generated_Tables/Temp_junk/sc_egad_results.csv')