In [124]:
import pandas as pd

In [115]:
def Levenshtein_Distance(str1, str2):
    """
    Compute the edit (Levenshtein) distance between str1 and str2.

    Insertions, deletions and substitutions each cost 1.

    :param str1: first sequence to compare
    :param str2: second sequence to compare
    :return: minimum number of single-element edits that turn str1 into str2
    """
    # Distances from the empty prefix of str1 to every prefix of str2.
    previous = list(range(len(str2) + 1))

    for row, ch1 in enumerate(str1, start=1):
        # Distance from the first `row` elements of str1 to the empty prefix of str2.
        current = [row]
        for col, ch2 in enumerate(str2, start=1):
            substitution = previous[col - 1] + (0 if ch1 == ch2 else 1)
            deletion = previous[col] + 1
            insertion = current[col - 1] + 1
            current.append(min(deletion, insertion, substitution))
        previous = current

    return previous[len(str2)]

def create_gene_mapper(path):
    """
    Build a source-gene -> target-gene lookup from a tab-separated mapping file.

    The first line of the file is treated as a header and skipped. On every
    other line, column 0 is the source gene name and column 1 the candidate
    target name. Candidates starting with "ENSMMUG" are ignored (presumably
    raw Ensembl identifiers rather than gene symbols -- confirm against the
    input files). Lines with fewer than two columns (blank lines, rows with
    no candidate) are skipped.

    When a source gene has several distinct candidates, the candidate whose
    upper-cased name has the smallest edit distance to the upper-cased source
    name is kept; on a tie the candidate seen first is kept.

    :param path: path to the tab-separated mapping file
    :return: dict mapping each source gene name to the selected target name
    """
    hs2mma = {}
    # Context manager so the handle is closed even if parsing raises.
    with open(path) as handle:
        for n, line in enumerate(handle):
            if n == 0:
                # Header row.
                continue
            s = line.strip().split("\t")
            if len(s) < 2:
                # Blank line or row without a candidate column: nothing to map.
                continue
            source, candidate = s[0], s[1]
            if candidate.startswith("ENSMMUG"):
                continue
            if source not in hs2mma:
                hs2mma[source] = candidate
                continue
            current = hs2mma[source]
            if candidate == current:
                continue
            # Conflicting candidates: prefer the name closest to the source name.
            d1 = Levenshtein_Distance(source.upper(), current.upper())
            d2 = Levenshtein_Distance(source.upper(), candidate.upper())
            if d2 < d1:
                hs2mma[source] = candidate
    return hs2mma

In [116]:
# Gene-name lookup tables built by create_gene_mapper, one per input file
# (human -> macaque and mouse -> macaque, going by the file names).
hs2mma = create_gene_mapper("./gprofiler2/human2macaque.tsv")
mm2mma = create_gene_mapper("./gprofiler2/mouse2macaque.tsv")

In [117]:
lr_database = {}


def _add_lr_pairs(path, gene_mapper, database, report_unmapped=False):
    """
    Translate the ligand-receptor rows of a tab-separated table into ``database``.

    The first line is skipped as a header. For every other row the last column
    is parsed as ``"<ligand> - (<receptor>+<receptor>...)"``, the second column
    is taken as the pathway name and the second-to-last column as the
    annotation. The ligand and each receptor subunit are looked up in
    ``gene_mapper``; receptor subunits without a mapping are dropped. A row is
    skipped when the ligand is unmapped or no receptor subunit could be mapped.

    :param path: path to the tab-separated interaction table
    :param gene_mapper: dict translating gene names of the table to target names
    :param database: dict updated in place, keyed by the translated
        ``"<ligand> - (<receptors>)"`` string with ``[pathway_name, annot]``
        values; keys that already exist are never overwritten
    :param report_unmapped: when true, print the ligand, receptors and their
        translations for every skipped row
    """
    # Context manager so the handle is closed even if a row fails to parse.
    with open(path) as handle:
        for n, line in enumerate(handle):
            if n == 0:
                # Header row.
                continue

            s = line.strip().split("\t")
            use_name, pathway_name, annot = s[-1], s[1], s[-2]

            l, r = use_name.strip().split(" - ")
            l = l.strip()
            r = r.strip().strip("()").split("+")

            l_mma = gene_mapper.get(l)
            r_mma = [gene_mapper[i] for i in r if i in gene_mapper]
            if l_mma is None or not r_mma:
                if report_unmapped:
                    print(l, r, l_mma, r_mma)
                continue

            key = f"{l_mma} - ({'+'.join(r_mma)})"
            # First entry wins, so earlier tables take precedence over later ones.
            if key not in database:
                database[key] = [pathway_name, annot]


# The human table is loaded first, so its entries are kept on key collisions.
_add_lr_pairs("./gprofiler2/cellchatdb_human.tsv", hs2mma, lr_database)
print(len(lr_database))

# The mouse table only contributes keys not already present; skipped rows are printed.
_add_lr_pairs("./gprofiler2/cellchatdb_mouse.tsv", mm2mma, lr_database, report_unmapped=True)
print(len(lr_database))

1635
Bmp8a ['Acvr1', 'Acvr2a'] None ['ACVR1', 'ACVR2A']
Bmp8a ['Acvr1', 'Acvr2b'] None ['ACVR1', 'ACVR2B']
Bmp8a ['Acvr1', 'Bmpr2'] None ['ACVR1', 'BMPR2']
Bmp8a ['Bmpr1a', 'Acvr2a'] None ['BMPR1A', 'ACVR2A']
Bmp8a ['Bmpr1a', 'Acvr2b'] None ['BMPR1A', 'ACVR2B']
Bmp8a ['Bmpr1a', 'Bmpr2'] None ['BMPR1A', 'BMPR2']
Bmp8a ['Bmpr1b', 'Acvr2a'] None ['BMPR1B', 'ACVR2A']
Bmp8a ['Bmpr1b', 'Acvr2b'] None ['BMPR1B', 'ACVR2B']
Bmp8a ['Bmpr1b', 'Bmpr2'] None ['BMPR1B', 'BMPR2']
Bmp8b ['Acvr1', 'Acvr2a'] None ['ACVR1', 'ACVR2A']
Bmp8b ['Acvr1', 'Acvr2b'] None ['ACVR1', 'ACVR2B']
Bmp8b ['Acvr1', 'Bmpr2'] None ['ACVR1', 'BMPR2']
Bmp8b ['Bmpr1a', 'Acvr2a'] None ['BMPR1A', 'ACVR2A']
Bmp8b ['Bmpr1a', 'Acvr2b'] None ['BMPR1A', 'ACVR2B']
Bmp8b ['Bmpr1a', 'Bmpr2'] None ['BMPR1A', 'BMPR2']
Bmp8b ['Bmpr1b', 'Acvr2a'] None ['BMPR1B', 'ACVR2A']
Bmp8b ['Bmpr1b', 'Acvr2b'] None ['BMPR1B', 'ACVR2B']
Bmp8b ['Bmpr1b', 'Bmpr2'] None ['BMPR1B', 'BMPR2']
Inhbc ['Acvr1b', 'Acvr2a'] None ['ACVR1B', 'ACVR2A']
Inhbc ['Acvr

In [129]:
# One row per lr_database key; column 0 holds the pathway name and column 1 the annotation.
lr_table = pd.DataFrame(lr_database).T
# Order the rows by annotation (column 1) before writing.
lr_table_sorted = lr_table.sort_values(by=[1])
# Tab-separated output without a header row; the key is written as the first column.
lr_table_sorted.to_csv("macaque_lr_database.tsv", sep="\t", header=None)

In [122]:
# Count the rows of the exported table per value of its third tab-separated column, most frequent first.
! less -S macaque_lr_database.tsv |awk -F '[\t]' '{print $3}' |sort |uniq -c |sort -k1,1nr

   1062 Secreted Signaling
    379 ECM-Receptor
    255 Cell-Cell Contact


In [121]:
# Count the rows of the exported table per value of its second tab-separated column, most frequent first.
! less -S macaque_lr_database.tsv |awk -F '[\t]' '{print $2}' |sort |uniq -c |sort -k1,1nr

    320 WNT
    176 COLLAGEN
    117 LAMININ
     62 CCL
     58 BMP
     56 FGF
     39 SEMA3
     31 CXCL
     30 THBS
     29 EPHA
     29 PARs
     28 ncWNT
     28 TENASCIN
     24 NOTCH
     18 EGF
     16 EPHB
     15 ANGPTL
     15 SLURP
     13 FN1
     13 NRG
     12 IL4
     12 JAM
     12 NODAL
     12 TGFb
     11 RLN
     11 SEMA4
     10 ACTIVIN
     10 IFN-I
     10 NECTIN
      9 IL2
      9 MK
      9 NPY
      9 NRXN
      9 OPIOID
      9 SPP1
      8 CALCR
      8 ICAM
      8 IGF
      8 IL1
      8 IL10
      8 PTN
      7 NT
      7 SEMA6
      6 DESMOSOME
      6 EDN
      6 GDF
      6 PDGF
      6 UCN
      6 VTN
      5 CLEC
      5 CNTN
      5 COMPLEMENT
      5 GDNF
      5 IL17
      5 NCAM
      5 PTH
      5 SOMATOSTATIN
      4 BMP10
      4 CD23
      4 CD40
      4 CDH
      4 GHRH
      4 IL6
      4 LT
      4 MELANOCORTIN
      4 MHC-I
      4 MHC-II
      4 MSTN
      4 NMU
      4 PROK
      4 SEMA5
      3 ANGPT
      3 AVP
      3 CD39
      