In [1]:
import ot
import pandas as pd
import numpy as np
import os

In [2]:
def kl_divergence_backend(X, Y):
    """
    Pairwise KL divergence between the rows of X and the rows of Y.

    Each row is first divided by its own sum, so every sample is treated as a
    distribution over the features. All array operations go through the POT
    backend that matches the inputs.

    Args:
        X: array with dim (n_samples by n_features)
        Y: array with dim (m_samples by n_features)

    Returns:
        np array with dim (n_samples by m_samples). Entry (i, j) is
        sum_k P[i,k] * (log P[i,k] - log Q[j,k]), where P and Q are the
        row-normalised X and Y.
    """
    assert X.shape[1] == Y.shape[1], "X and Y do not have the same number of features."

    nx = ot.backend.get_backend(X,Y)

    # Row-normalise both inputs.
    P = X/nx.sum(X,axis=1, keepdims=True)
    Q = Y/nx.sum(Y,axis=1, keepdims=True)

    # sum_k P[i,k]*log P[i,k], stored as a column so it broadcasts over the m columns.
    self_term = nx.einsum('ij,ij->i',P,nx.log(P))
    self_term = nx.reshape(self_term,(self_term.shape[0],1))

    # sum_k P[i,k]*log Q[j,k] for every pair (i, j).
    cross_term = nx.dot(P,nx.log(Q).T)

    return nx.to_numpy(self_term - cross_term)

def f(G):
    """
    Objective callback handed to ot.gromov.cg: ot.gromov.gwloss evaluated at G.

    constC, hC1 and hC2 are read from the notebook globals; the dataset cells
    assign them with ot.gromov.init_matrix before every call to ot.gromov.cg.
    """
    loss = ot.gromov.gwloss(constC, hC1, hC2, G)
    return loss

def df(G):
    """
    Gradient callback handed to ot.gromov.cg: ot.gromov.gwggrad evaluated at G.

    constC, hC1 and hC2 are read from the notebook globals; the dataset cells
    assign them with ot.gromov.init_matrix before every call to ot.gromov.cg.
    """
    grad = ot.gromov.gwggrad(constC, hC1, hC2, G)
    return grad


In [3]:
# Settings shared by the dataset cells below.
loss_fun='square_loss'   # loss name passed to ot.gromov.init_matrix
alpha=0.8                # weight given to ot.gromov.cg; the feature cost M is scaled by (1 - alpha)
dissimilarity='euc'      # 'euc'/'euclidean' -> ot.dist on the embeddings; anything else -> KL divergence
norm=False               # if True, D1 and D2 are divided by their smallest positive entry
# One POT backend object, available under both names.
backend=nx=ot.backend.NumpyBackend()

Simulated data

In [23]:
# Simulated data: for every cluster shared by the reference batch ub[0] and each
# other batch, compute a transport plan between the two batches' spots and write
# it to a "gwd_pi_<ref>_<other>_<cluster>..." CSV under dirs.
# NOTE(review): alpha is not assigned in this cell; it uses whatever value the
# kernel currently holds (0.8 from the settings cell on a fresh Run All) - confirm.
dirs="/data02/tguo/space_batch_effect/simulate/gtt_output/coordinate_file/"
batch_sim="_1"
types="_types4"
suffix=batch_sim+types+".csv"                 # common tail of every file name in this cell
csv_opts=dict(header=0,index_col=0,sep=",")   # first row is the header, first column the index
clusters=pd.read_csv(dirs+"gtt_clusters"+suffix,**csv_opts)
ub=np.unique(clusters['batch'])
# str() keeps the file names buildable even if the batch labels were parsed as
# numbers; the other dataset cells already convert them the same way.
ref=str(ub[0])
for i in np.arange(1,len(ub)):
    other=str(ub[i])
    # Clusters present in both the reference batch and batch i.
    uc=np.intersect1d(clusters['clusters'][clusters['batch']==ub[0]],
                      clusters['clusters'][clusters['batch']==ub[i]])
    for clust in uc:
        tag="_"+str(clust)+suffix
        embed1,embed2=(pd.read_csv(dirs+"embed_"+name+tag,**csv_opts) for name in (ref,other))
        coord1,coord2=(pd.read_csv(dirs+"coord_"+name+tag,**csv_opts) for name in (ref,other))
        # Put the coordinates in the same row order as the embeddings.
        coord1=coord1.loc[embed1.index,:]
        coord2=coord2.loc[embed2.index,:]
        ### Spatial distances between the spots inside each batch (first two coordinate columns) ###
        a=nx.from_numpy(np.asarray(coord1.values[:,:2],dtype=np.float64))
        b=nx.from_numpy(np.asarray(coord2.values[:,:2],dtype=np.float64))
        D1=ot.dist(a,a, metric='euclidean')
        D2=ot.dist(b,b, metric='euclidean')
        if norm:
            # Rescale so the smallest non-zero distance becomes 1.
            D1 /= nx.min(D1[D1>0])
            D2 /= nx.min(D2[D2>0])
        ### Distances between the low-dimensional representations of the two batches' spots ###
        X1,X2 = nx.from_numpy(embed1.values), nx.from_numpy(embed2.values)
        if dissimilarity.lower() in ('euclidean','euc'):
            M = ot.dist(X1,X2)   # ot.dist with its default metric
        else:
            # KL divergence on the embeddings shifted by 0.01
            # (presumably to keep log() away from zero entries - confirm).
            s1 = X1 + 0.01
            s2 = X2 + 0.01
            M = kl_divergence_backend(s1, s2)
            M = nx.from_numpy(M)
        ### Distribution over the spots of each batch (uniform) ###
        d1 = nx.ones((embed1.shape[0],))/embed1.shape[0]
        d2 = nx.ones((embed2.shape[0],))/embed2.shape[0]
        ### Compute the mapping ###
        # constC, hC1 and hC2 must stay module-level names: f and df read them as globals.
        constC, hC1, hC2 = ot.gromov.init_matrix(D1, D2, d1, d2, loss_fun)
        G0 = d1[:, None] * d2[None, :]   # start from the independent coupling
        res=ot.gromov.cg(d1, d2, (1 - alpha) * M, alpha, f, df, G0, armijo=False, C1=D1, C2=D2, constC=constC)
        # Rows are the reference batch's spots, columns the other batch's spots.
        pi=pd.DataFrame(res,index=embed1.index,columns=embed2.index)
        pi.to_csv(dirs+"gwd_pi_"+ref+"_"+other+tag)

DLPFC

In [63]:
# DLPFC: for every cluster shared by the first batch and each other batch,
# compute a transport plan between the two batches' spots and save it as CSV.
alpha=0.8
dirs="/data02/tguo/space_batch_effect/human_DLPFC_10x/gtt_output/coordinate_file/"
samples=np.array(['151507','151508','151509','151510','151669','151670','151671','151672','151673','151674','151675','151676'])
samples=samples[8:12]   # keep the last four samples (151673-151676)
# "_<sample>" for every selected sample, concatenated; used in all file names below.
flags="".join("_"+name for name in samples)
read_opts={"header":0,"index_col":0,"sep":","}
clusters=pd.read_csv(dirs+"gtt_clusters"+flags+".csv",**read_opts)
ub=np.unique(clusters['batch'])
for i in np.arange(1,len(ub)):
    in_ref=clusters['batch']==ub[0]
    in_other=clusters['batch']==ub[i]
    # Clusters present in both batches.
    uc=np.intersect1d(clusters['clusters'][in_ref],clusters['clusters'][in_other])
    for clust in uc:
        tail="_"+str(clust)+flags+".csv"
        embed1=pd.read_csv(dirs+"embed_"+str(ub[0])+tail,**read_opts)
        embed2=pd.read_csv(dirs+"embed_"+str(ub[i])+tail,**read_opts)
        coord1=pd.read_csv(dirs+"coord_"+str(ub[0])+tail,**read_opts)
        coord2=pd.read_csv(dirs+"coord_"+str(ub[i])+tail,**read_opts)
        # Put the coordinates in the same row order as the embeddings.
        coord1=coord1.loc[embed1.index,:]
        coord2=coord2.loc[embed2.index,:]
        ### Spatial distances between the spots inside each batch (first two coordinate columns) ###
        xy1=nx.from_numpy(coord1.values[:,:2])
        xy2=nx.from_numpy(coord2.values[:,:2])
        a=np.float64(xy1)
        b=np.float64(xy2)
        D1=ot.dist(a,a, metric='euclidean')
        D2=ot.dist(b,b, metric='euclidean')
        if norm:
            # Rescale so the smallest non-zero distance becomes 1.
            D1 /= nx.min(D1[D1>0])
            D2 /= nx.min(D2[D2>0])
        ### Distances between the low-dimensional representations of the two batches' spots ###
        X1=nx.from_numpy(embed1.values)
        X2=nx.from_numpy(embed2.values)
        if dissimilarity.lower() in ('euclidean','euc'):
            M = ot.dist(X1,X2)
        else:
            # KL divergence on the embeddings shifted by 0.01.
            s1 = X1 + 0.01
            s2 = X2 + 0.01
            M = nx.from_numpy(kl_divergence_backend(s1, s2))
        ### Distribution over the spots of each batch (uniform) ###
        n1=embed1.shape[0]
        n2=embed2.shape[0]
        d1 = nx.ones((n1,))/n1
        d2 = nx.ones((n2,))/n2
        ### Compute the mapping ###
        # constC, hC1 and hC2 must stay module-level names: f and df read them as globals.
        constC, hC1, hC2 = ot.gromov.init_matrix(D1, D2, d1, d2, loss_fun)
        G0 = d1[:, None] * d2[None, :]   # start from the independent coupling
        res=ot.gromov.cg(d1, d2, (1 - alpha) * M, alpha, f, df, G0, armijo=False, C1=D1, C2=D2, constC=constC)
        # Rows are batch ub[0]'s spots, columns batch ub[i]'s spots.
        pi=pd.DataFrame(res,index=embed1.index,columns=embed2.index)
        pi.to_csv(dirs+"gwd_pi_"+str(ub[0])+"_"+str(ub[i])+tail)

mouse brain

In [7]:
# Mouse brain: for every cluster shared by the first batch and each other batch,
# compute a transport plan between the two batches' spots and save it as CSV.
# NOTE(review): alpha is not assigned in this cell; on a top-to-bottom run it is
# still 0.8 from the preceding DLPFC cell - confirm that is the intended value.
dirs="/data02/tguo/space_batch_effect/mouse_brain/gtt_output/coordinate_file/"
samples=['anterior1','anterior2']
# "_<sample>" for every sample, concatenated; used in all file names below.
flags="".join("_"+name for name in samples)
read_opts={"header":0,"index_col":0,"sep":","}
clusters=pd.read_csv(dirs+"gtt_clusters"+flags+".csv",**read_opts)
ub=np.unique(clusters['batch'])
for i in np.arange(1,len(ub)):
    in_ref=clusters['batch']==ub[0]
    in_other=clusters['batch']==ub[i]
    # Clusters present in both batches.
    uc=np.intersect1d(clusters['clusters'][in_ref],clusters['clusters'][in_other])
    for clust in uc:
        tail="_"+str(clust)+flags+".csv"
        embed1=pd.read_csv(dirs+"embed_"+str(ub[0])+tail,**read_opts)
        embed2=pd.read_csv(dirs+"embed_"+str(ub[i])+tail,**read_opts)
        coord1=pd.read_csv(dirs+"coord_"+str(ub[0])+tail,**read_opts)
        coord2=pd.read_csv(dirs+"coord_"+str(ub[i])+tail,**read_opts)
        # Put the coordinates in the same row order as the embeddings.
        coord1=coord1.loc[embed1.index,:]
        coord2=coord2.loc[embed2.index,:]
        ### Spatial distances between the spots inside each batch (first two coordinate columns) ###
        xy1=nx.from_numpy(coord1.values[:,:2])
        xy2=nx.from_numpy(coord2.values[:,:2])
        a=np.float64(xy1)
        b=np.float64(xy2)
        D1=ot.dist(a,a, metric='euclidean')
        D2=ot.dist(b,b, metric='euclidean')
        if norm:
            # Rescale so the smallest non-zero distance becomes 1.
            D1 /= nx.min(D1[D1>0])
            D2 /= nx.min(D2[D2>0])
        ### Distances between the low-dimensional representations of the two batches' spots ###
        X1=nx.from_numpy(embed1.values)
        X2=nx.from_numpy(embed2.values)
        if dissimilarity.lower() in ('euclidean','euc'):
            M = ot.dist(X1,X2)
        else:
            # KL divergence on the embeddings shifted by 0.01.
            s1 = X1 + 0.01
            s2 = X2 + 0.01
            M = nx.from_numpy(kl_divergence_backend(s1, s2))
        ### Distribution over the spots of each batch (uniform) ###
        n1=embed1.shape[0]
        n2=embed2.shape[0]
        d1 = nx.ones((n1,))/n1
        d2 = nx.ones((n2,))/n2
        ### Compute the mapping ###
        # constC, hC1 and hC2 must stay module-level names: f and df read them as globals.
        constC, hC1, hC2 = ot.gromov.init_matrix(D1, D2, d1, d2, loss_fun)
        G0 = d1[:, None] * d2[None, :]   # start from the independent coupling
        res=ot.gromov.cg(d1, d2, (1 - alpha) * M, alpha, f, df, G0, armijo=False, C1=D1, C2=D2, constC=constC)
        # Rows are batch ub[0]'s spots, columns batch ub[i]'s spots.
        pi=pd.DataFrame(res,index=embed1.index,columns=embed2.index)
        pi.to_csv(dirs+"gwd_pi_"+str(ub[0])+"_"+str(ub[i])+tail)

10x coronal section (冠状面)

In [10]:
# Hippo dataset (10X_Normal / 10X_DAPI / 10X_FFPE): for every cluster shared by
# the reference batch and each other batch, compute a transport plan between the
# two batches' spots and save it as CSV.
alpha=0.8
dirs="/data02/tguo/space_batch_effect/Hippo/gtt_output/coordinate_file/"
samples=['10X_Normal','10X_DAPI','10X_FFPE']
# "_<sample>" for every sample, concatenated; used in all file names below.
flags="".join("_"+name for name in samples)
csv_opts=dict(header=0,index_col=0,sep=",")   # first row is the header, first column the index
clusters=pd.read_csv(dirs+"gtt_clusters"+flags+".csv",**csv_opts)
# The batch order is taken from `samples` rather than np.unique(clusters['batch']):
# np.unique returns sorted values, which would put '10X_DAPI' first, while
# ub[0] = '10X_Normal' is the batch every other batch is mapped against.
ub=samples
ref=str(ub[0])
for i in np.arange(1,len(ub)):
    other=str(ub[i])
    # Clusters present in both the reference batch and batch i.
    uc=np.intersect1d(clusters['clusters'][clusters['batch']==ub[0]],
                      clusters['clusters'][clusters['batch']==ub[i]])
    for clust in uc:
        tag="_"+str(clust)+flags+".csv"
        embed1,embed2=(pd.read_csv(dirs+"embed_"+name+tag,**csv_opts) for name in (ref,other))
        coord1,coord2=(pd.read_csv(dirs+"coord_"+name+tag,**csv_opts) for name in (ref,other))
        # Put the coordinates in the same row order as the embeddings.
        coord1=coord1.loc[embed1.index,:]
        coord2=coord2.loc[embed2.index,:]
        ### Spatial distances between the spots inside each batch (first two coordinate columns) ###
        a=nx.from_numpy(np.asarray(coord1.values[:,:2],dtype=np.float64))
        b=nx.from_numpy(np.asarray(coord2.values[:,:2],dtype=np.float64))
        D1=ot.dist(a,a, metric='euclidean')
        D2=ot.dist(b,b, metric='euclidean')
        if norm:
            # Rescale so the smallest non-zero distance becomes 1.
            D1 /= nx.min(D1[D1>0])
            D2 /= nx.min(D2[D2>0])
        ### Distances between the low-dimensional representations of the two batches' spots ###
        X1,X2 = nx.from_numpy(embed1.values), nx.from_numpy(embed2.values)
        if dissimilarity.lower() in ('euclidean','euc'):
            M = ot.dist(X1,X2)   # ot.dist with its default metric
        else:
            # KL divergence on the embeddings shifted by 0.01
            # (presumably to keep log() away from zero entries - confirm).
            s1 = X1 + 0.01
            s2 = X2 + 0.01
            M = kl_divergence_backend(s1, s2)
            M = nx.from_numpy(M)
        ### Distribution over the spots of each batch (uniform) ###
        d1 = nx.ones((embed1.shape[0],))/embed1.shape[0]
        d2 = nx.ones((embed2.shape[0],))/embed2.shape[0]
        ### Compute the mapping ###
        # constC, hC1 and hC2 must stay module-level names: f and df read them as globals.
        constC, hC1, hC2 = ot.gromov.init_matrix(D1, D2, d1, d2, loss_fun)
        G0 = d1[:, None] * d2[None, :]   # start from the independent coupling
        res=ot.gromov.cg(d1, d2, (1 - alpha) * M, alpha, f, df, G0, armijo=False, C1=D1, C2=D2, constC=constC)
        # Rows are the reference batch's spots, columns the other batch's spots.
        pi=pd.DataFrame(res,index=embed1.index,columns=embed2.index)
        pi.to_csv(dirs+"gwd_pi_"+ref+"_"+other+tag)

mouse olfactory bulb (OB)

In [18]:
# Mouse OB dataset (BGI / SlideV2 / 10X): for every cluster shared by the
# reference batch and each other batch, compute a transport plan between the two
# batches' spots and save it as CSV.
alpha=0.1
dirs="/data02/tguo/space_batch_effect/mouse_OB/gtt_output/coordinate_file/"
# An earlier variant of this cell used ['BGI','SlideV2','scRNA'] instead.
samples=['BGI','SlideV2','10X']
# "_<sample>" for every sample, concatenated; used in all file names below.
flags="".join("_"+name for name in samples)
csv_opts=dict(header=0,index_col=0,sep=",")   # first row is the header, first column the index
clusters=pd.read_csv(dirs+"gtt_clusters"+flags+".csv",**csv_opts)
# The batch order is taken from `samples` rather than np.unique(clusters['batch']):
# np.unique returns sorted values, which would put '10X' first, while
# ub[0] = 'BGI' is the batch every other batch is mapped against.
ub=samples
ref=str(ub[0])
for i in np.arange(1,len(ub)):
    other=str(ub[i])
    # Clusters present in both the reference batch and batch i.
    uc=np.intersect1d(clusters['clusters'][clusters['batch']==ub[0]],
                      clusters['clusters'][clusters['batch']==ub[i]])
    for clust in uc:
        tag="_"+str(clust)+flags+".csv"
        embed1,embed2=(pd.read_csv(dirs+"embed_"+name+tag,**csv_opts) for name in (ref,other))
        coord1,coord2=(pd.read_csv(dirs+"coord_"+name+tag,**csv_opts) for name in (ref,other))
        # Put the coordinates in the same row order as the embeddings.
        coord1=coord1.loc[embed1.index,:]
        coord2=coord2.loc[embed2.index,:]
        ### Spatial distances between the spots inside each batch ###
        # Unlike the other dataset cells, every coordinate column is used here,
        # not only the first two.
        a=nx.from_numpy(np.asarray(coord1.values,dtype=np.float64))
        b=nx.from_numpy(np.asarray(coord2.values,dtype=np.float64))
        D1=ot.dist(a,a, metric='euclidean')
        D2=ot.dist(b,b, metric='euclidean')
        if norm:
            # Rescale so the smallest non-zero distance becomes 1.
            D1 /= nx.min(D1[D1>0])
            D2 /= nx.min(D2[D2>0])
        ### Distances between the low-dimensional representations of the two batches' spots ###
        X1,X2 = nx.from_numpy(embed1.values), nx.from_numpy(embed2.values)
        if dissimilarity.lower() in ('euclidean','euc'):
            M = ot.dist(X1,X2)   # ot.dist with its default metric
        else:
            # KL divergence on the embeddings shifted by 0.01
            # (presumably to keep log() away from zero entries - confirm).
            s1 = X1 + 0.01
            s2 = X2 + 0.01
            M = kl_divergence_backend(s1, s2)
            M = nx.from_numpy(M)
        ### Distribution over the spots of each batch (uniform) ###
        d1 = nx.ones((embed1.shape[0],))/embed1.shape[0]
        d2 = nx.ones((embed2.shape[0],))/embed2.shape[0]
        ### Compute the mapping ###
        # constC, hC1 and hC2 must stay module-level names: f and df read them as globals.
        constC, hC1, hC2 = ot.gromov.init_matrix(D1, D2, d1, d2, loss_fun)
        G0 = d1[:, None] * d2[None, :]   # start from the independent coupling
        # Iteration limits are raised for this dataset; numItermaxEmd is an
        # iteration count, so it is passed as an int rather than the float 1e6.
        res=ot.gromov.cg(d1, d2, (1 - alpha) * M, alpha, f, df, G0, armijo=False, C1=D1, C2=D2, constC=constC,numItermax=100000,numItermaxEmd=1000000)
        # Rows are the reference batch's spots, columns the other batch's spots.
        pi=pd.DataFrame(res,index=embed1.index,columns=embed2.index)
        pi.to_csv(dirs+"gwd_pi_"+ref+"_"+other+tag)