# Network analysis of DEGs from Dengue vaccine study

Cohorts: high (H) and low (L) responders
Time points: days 2, 4, and 7 (columns suffixed `d2`, `d4`, `d7`)

In [1]:
import numpy as np
import networkx as nx
import pandas as pd
import random
# report the library versions used for this run (numpy, networkx, pandas)
for _lib in (np, nx, pd):
    print(_lib.__version__)

import sys


1.19.2
2.5
1.1.3


# Load Interactome

In [3]:
# Fetch the STRING high confidence (>0.7) interactome from NDEx and convert it
# to a networkx graph.
import ndex2

# network identifier and connection settings (no credentials are supplied)
interactome_uuid = '275bd84e-3d18-11e8-a935-0ac135e8bacf'
ndex_server = 'public.ndexbio.org'
ndex_user = None
ndex_password = None

G_STR = ndex2.create_nice_cx_from_server(
    ndex_server,
    username=ndex_user,
    password=ndex_password,
    uuid=interactome_uuid,
).to_networkx()
nodes = list(G_STR.nodes)

# summarise the interactome size: node count, blank line, then edge count
print('number of nodes:', G_STR.number_of_nodes(), sep='\n')
print('\nnumber of edges:', G_STR.number_of_edges(), sep='\n')

number of nodes:
17185

number of edges:
420534


In [4]:
# node labels of the STRING interactome (iterating a graph yields its nodes)
str_nodes = [node for node in G_STR]
print(len(str_nodes))

17185


# Load differential expression results

In [5]:
# Read the H comparison table; rows are keyed by the 'X1' column.
DE_df = pd.read_csv(
    'All_gene_list/All_H_combined.csv',
    index_col='X1',
)
print(DE_df.shape[0])  # number of rows loaded
DE_df.head()

23735


Unnamed: 0_level_0,baseMean_d2,log2FoldChange_d2,lfcSE_d2,stat_d2,pvalue_d2,padj_d2,baseMean_d4,log2FoldChange_d4,lfcSE_d4,stat_d4,...,stat_d7,pvalue_d7,padj_d7,baseMean_d92,log2FoldChange_d92,lfcSE_d92,stat_d92,pvalue_d92,padj_d92,name
X1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,62.423235,-0.358307,0.253881,-1.411319,0.15815,0.374575,68.818943,-0.20334,0.31693,-0.641593,...,-1.779694,0.075126,0.621747,64.963139,-0.643644,0.279847,-2.299981,0.021449,0.07506,TSPAN6
ENSG00000000419,2886.588467,-0.446053,0.140282,-3.179679,0.001474,0.041118,3191.354321,-0.133324,0.165481,-0.805674,...,-1.584913,0.112986,0.699833,3503.724584,-0.122433,0.192407,-0.636324,0.524565,0.683095,DPM1
ENSG00000000457,3047.608158,-0.145559,0.114969,-1.266073,0.205487,0.433983,3197.072616,-0.000606,0.116162,-0.005213,...,-0.366566,0.713943,0.973381,3260.484083,-0.20795,0.125599,-1.655667,0.097789,0.222208,SCYL3
ENSG00000000460,978.250107,-0.09861,0.0766,-1.287337,0.197977,0.425126,1069.894272,0.154703,0.093207,1.659774,...,1.214943,0.224388,0.81806,1079.672796,-0.064842,0.116255,-0.557759,0.577009,0.725165,C1orf112
ENSG00000000938,4511.025998,0.212165,0.157771,1.344763,0.178702,0.4016,3900.511416,-0.205638,0.153032,-1.343761,...,0.215744,0.829187,0.987168,5947.059683,0.670794,0.197011,3.40486,0.000662,0.005586,FGR


In [6]:
# Bring in the L comparison table and attach it to the H table by index.
# Column names present in both tables are disambiguated with the
# _H (left) and _L (right) suffixes.
DE_df_L = pd.read_csv(
    'All_gene_list/All_L_combined.csv',
    index_col='X1',
)

DE_df = DE_df.join(other=DE_df_L, lsuffix='_H', rsuffix='_L')
DE_df.head()

Unnamed: 0_level_0,baseMean_d2_H,log2FoldChange_d2_H,lfcSE_d2_H,stat_d2_H,pvalue_d2_H,padj_d2_H,baseMean_d4_H,log2FoldChange_d4_H,lfcSE_d4_H,stat_d4_H,...,stat_d7_L,pvalue_d7_L,padj_d7_L,baseMean_d92_L,log2FoldChange_d92_L,lfcSE_d92_L,stat_d92_L,pvalue_d92_L,padj_d92_L,name_L
X1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,62.423235,-0.358307,0.253881,-1.411319,0.15815,0.374575,68.818943,-0.20334,0.31693,-0.641593,...,0.598048,0.549808,0.964334,53.785467,-0.135567,0.405983,-0.333924,0.738437,0.891309,TSPAN6
ENSG00000000419,2886.588467,-0.446053,0.140282,-3.179679,0.001474,0.041118,3191.354321,-0.133324,0.165481,-0.805674,...,0.3776,0.705728,0.979936,3238.472264,0.085125,0.239364,0.355631,0.722117,0.883065,DPM1
ENSG00000000457,3047.608158,-0.145559,0.114969,-1.266073,0.205487,0.433983,3197.072616,-0.000606,0.116162,-0.005213,...,0.020166,0.983911,0.999656,3210.333444,-0.209093,0.158303,-1.320841,0.186554,0.470328,SCYL3
ENSG00000000460,978.250107,-0.09861,0.0766,-1.287337,0.197977,0.425126,1069.894272,0.154703,0.093207,1.659774,...,1.235782,0.21654,0.898405,1043.23871,-0.206621,0.174221,-1.18597,0.235634,0.52684,C1orf112
ENSG00000000938,4511.025998,0.212165,0.157771,1.344763,0.178702,0.4016,3900.511416,-0.205638,0.153032,-1.343761,...,-0.335648,0.737136,0.980556,5259.872771,0.796569,0.231908,3.434846,0.000593,0.013693,FGR


In [7]:
# Re-index the merged table by the 'name_H' column (gene names from the H table).
# Rebinding instead of inplace=True, together with the guard, keeps this cell
# safe to re-run: a second set_index call would raise KeyError because the
# 'name_H' column has already been moved into the index.
if DE_df.index.name != 'name_H':
    DE_df = DE_df.set_index('name_H')

In [8]:
# Restrict the table to genes that are also nodes of the interactome
# --- STRING ----
loc_temp = list(np.intersect1d(DE_df.index.tolist(), str_nodes))
print(len(loc_temp))

# keep only the shared genes, ordered by the 'pvalue_d2_H' column (ascending)
DE_df = DE_df.loc[loc_temp].sort_values(by='pvalue_d2_H')
DE_df.head()

13194


Unnamed: 0_level_0,baseMean_d2_H,log2FoldChange_d2_H,lfcSE_d2_H,stat_d2_H,pvalue_d2_H,padj_d2_H,baseMean_d4_H,log2FoldChange_d4_H,lfcSE_d4_H,stat_d4_H,...,stat_d7_L,pvalue_d7_L,padj_d7_L,baseMean_d92_L,log2FoldChange_d92_L,lfcSE_d92_L,stat_d92_L,pvalue_d92_L,padj_d92_L,name_L
name_H,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MTRNR2L12,283.189169,-3.049662,0.477074,-6.392429,1.632711e-10,3e-06,434.505919,-0.010922,0.210436,-0.051903,...,-5.632087,1.780415e-08,4.3e-05,141.624681,-2.766781,0.562129,-4.921966,8.567905e-07,0.000163,MTRNR2L12
SYNJ2BP,1772.696119,-1.765707,0.297711,-5.930936,3.012124e-09,1.6e-05,2045.695393,-1.293452,0.53065,-2.437486,...,1.972453,0.0485579,0.69736,1935.908347,0.087612,0.309882,0.282726,0.7773869,0.90717,SYNJ2BP
PPP2R1A,492.594283,0.778426,0.138359,5.626131,1.842964e-08,5.5e-05,342.740624,-0.205312,0.179326,-1.144908,...,-1.943526,0.05195261,0.69976,542.984949,0.325529,0.31325,1.039198,0.2987125,0.590928,PPP2R1A
MMGT1,1253.175777,-2.49773,0.445683,-5.60427,2.091344e-08,5.5e-05,1496.836948,-1.845812,0.459542,-4.016631,...,0.347833,0.7279655,0.980089,1376.018861,-0.319524,0.474236,-0.673766,0.5004599,0.755746,MMGT1
TMEM43,3810.966317,-1.076232,0.193284,-5.568122,2.574989e-08,5.8e-05,4373.984174,-0.458713,0.274429,-1.671514,...,1.350858,0.176741,0.873095,2891.388575,0.075608,0.289864,0.260838,0.7942174,0.914828,TMEM43


In [10]:
# take the union of DEGs of H and L per day
# a gene counts as significant in a comparison when
# adjusted p-value < 0.05 and |log2FC| > 1 (absolute fold change, either direction)
PADJ_MAX = 0.05
ABS_LOG2FC_MIN = 1


def significant_genes(df, comparison, padj_max=PADJ_MAX, abs_log2fc_min=ABS_LOG2FC_MIN):
    """Return the index labels of rows that pass both thresholds for one comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing 'padj_<comparison>' and 'log2FoldChange_<comparison>' columns.
    comparison : str
        Column suffix identifying the comparison, e.g. 'd2_H'.
    padj_max : float
        Rows are kept when the adjusted p-value is strictly below this value.
    abs_log2fc_min : float
        Rows are kept when the absolute log2 fold change is strictly above this value.

    Returns
    -------
    list
        Index labels of the rows that satisfy both conditions (NaN values never pass).
    """
    passes = (df['padj_' + comparison] < padj_max) & (
        df['log2FoldChange_' + comparison].abs() > abs_log2fc_min
    )
    return df[passes].index.tolist()


DE_sig = []
for comparison in ('d2_H', 'd2_L', 'd4_H', 'd4_L', 'd7_H', 'd7_L'):
    DE_sig = DE_sig + significant_genes(DE_df, comparison)
    # running size of the union after adding this comparison
    print(len(np.unique(DE_sig)))


# collapse to the sorted set of unique gene labels
DE_sig = list(np.unique(DE_sig))

print(len(DE_sig))
print(', '.join(DE_sig))

121
283
305
310
371
388
388
ACADS, ACKR1, ACO1, ACSBG1, ADCK1, AHNAK, AK1, ALKBH7, ANKRD9, ARL11, ARMC6, ARMCX2, ASL, AURKAIP1, B3GNT2, B3GNT5, B3GNT8, B4GALT2, B4GALT7, BASP1, BCL11B, BICC1, BLVRB, BMP1, BOLA2B, BTG2, C5AR1, CACNA1I, CCDC134, CCL2, CCL5, CCL8, CCND2, CCR1, CCRL2, CD151, CD68, CDAN1, CDC34, CDK3, CDKN1C, CHMP1A, CHST11, CISD2, CISD3, CITED2, CMPK2, CMTM3, CMTM5, CNTF, COMMD5, COX14, COX16, CPSF1, CREB5, CRIPT, CRKL, CROCC, CSRNP1, CTSW, CTXN2, CUEDC2, CXCL10, CXCL11, CXCR5, CYB561, CYB5D1, CYBA, CYC1, CYP4F3, CYSTM1, DBI, DCHS1, DDX58, DDX60, DERL3, DGCR6L, DGKI, DMAP1, DMTN, DNAAF2, DNAJB2, DNAJC5, DTX4, DYRK1B, DYRK3, DZIP1, E2F1, EBP, EGFL8, EIF2AK2, EIF4EBP1, ELOF1, EPSTI1, ERGIC1, ETV7, EVI5L, EXOSC6, FAHD1, FAM199X, FAM220A, FAM83D, FANK1, FAP, FARP1, FAU, FBXO45, FBXO6, FBXW5, FKBP8, FKBPL, FLNA, FLT3LG, FRMD3, GBP1, GBP6, GEMIN6, GIPC1, GIT1, GNA13, GPBAR1, GPD1L, GPR153, GPR37L1, GPR84, GPS1, GPX4, GSDMD, GSE1, GSTP1, GUK1, HBA1, HBA2, HBM, HERC5, HERC6, HESX1

# Export node and edge tables for Cytoscape clustering

In [11]:
# ----- STRING -----
# subnetwork of the interactome induced by the significant genes
G_sub = G_STR.subgraph(DE_sig)
print(G_sub.number_of_nodes())
print(G_sub.number_of_edges())

388
655


In [None]:
# write the rows for the significant genes as a tab-separated node table
node_table = DE_df.loc[DE_sig]
node_table.to_csv(
    'nodeLists/DE_nodeList_HandL_union_days247_210324.txt',
    sep='\t',
)

In [None]:
# write out the edge list of the DEG subnetwork as a tab-separated file
# Pairs are collected explicitly rather than via zip(*edges): unpacking the
# zip result raises ValueError when the subgraph has no edges.
edge_pairs = [(node1, node2) for node1, node2 in G_sub.edges()]
# add self edges so we don't lose nodes on import
self_pairs = [(node, node) for node in G_sub.nodes]

# build the frame in one step: real edges first, then one self edge per node
edgelist_df = pd.DataFrame(edge_pairs + self_pairs, columns=['node1', 'node2'])
edgelist_df.to_csv('edgeLists/DE_edgelist_HandL_union_days247_210324.txt',sep='\t')
print(len(edgelist_df))