## Part 2.4

### Import data and modules

In [1]:
import numpy as np
import networkx as nx
import pandas as pd
from scipy.stats import hypergeom
import gseapy
import re

In [2]:
# Load the seed-gene table (tab-separated file) and preview its first rows.
df_seed_genes = pd.read_csv("Seed_Genes.csv", delimiter="\t")
df_seed_genes.head(5)

Unnamed: 0,seed Gene,Uniprot_AC,protein_name,entrez_gene_id,description
0,IFNG,P01579,Interferon gamma,3458,Produced by lymphocytes activated by specific ...
1,CD28,P10747,T-cell-specific surface glycoprotein CD28,940,"Involved in T-cell activation, the induction o..."
2,HLA-B,P01889,"HLA class I histocompatibility antigen, B alph...",3106,Antigen-presenting major histocompatibility co...
3,KIR3DL1,P43629,Killer cell immunoglobulin-like receptor 3DL1,3811,Receptor on natural killer (NK) cells for HLA ...
4,HLA-C,P10321,"HLA class I histocompatibility antigen, C alph...",3107,Antigen-presenting major histocompatibility co...


In [3]:
# Load the seed-gene interactome (tab-separated file) and preview its first rows.
df_seed_interactome = pd.read_csv("SeedGeneInteractome.csv", delimiter="\t")
df_seed_interactome.head(5)

Unnamed: 0,interactor A gene symbol,interactor B gene symbol,interactor A Uniprot AC,interactor B Uniprot AC,database source
0,IFNG,IFNG,P01579,P01579,Biogrid
1,HLA-B,HLA-B,P01889,P01889,Biogrid
2,HLA-B,ADRB2,P01889,P07550,Biogrid
3,HLA-B,HLA-C,P01889,P10321,Biogrid
4,KIR3DL1,KIR3DL1,P43629,P43629,Biogrid


In [4]:
# Load the union interactome (tab-separated file) and preview its first rows.
df_union_interactome = pd.read_csv("UnionInteractome.csv", delimiter="\t")
df_union_interactome.head(5)

Unnamed: 0,interactor A gene symbol,interactor B gene symbol,interactor A Uniprot AC,interactor B Uniprot AC
0,IFNG,RP3-503F13.3,P01579,-
1,IFNG,GOPC,P01579,Q9HD26
2,IFNG,IFNGR2,P01579,P38484
3,IFNG,STAT6,P01579,P42226
4,IFNG,IFNG,P01579,P01579


In [5]:
# Load the intersection interactome (tab-separated file) and preview its first rows.
df_intersection_interactome = pd.read_csv("IntersectionInteractome.csv", delimiter="\t")
df_intersection_interactome.head(5)

Unnamed: 0,interactor A gene symbol,interactor B gene symbol,interactor A Uniprot AC,interactor B Uniprot AC
0,DHX15,ARRB2,O43143,P32121
1,CAD,HDAC6,P27708,Q9UBN7
2,RPL7,RPS13,P18124,P62277
3,MYC,HSP90AA1,P01106,P07900
4,TP53,GNL3,P04637,Q9BVP2


### Find putative disease genes using the DIAMOnD tool

In [6]:
# Read the complete interactome table (tab-separated); each row carries a
# "database source" column naming the database the interaction came from.
df_Biogrid = pd.read_csv("SummarizedData.csv", delimiter="\t")
df_Biogrid.head(5)

Unnamed: 0,interactor A gene symbol,interactor B gene symbol,interactor A Uniprot AC,interactor B Uniprot AC,database source
0,IFNG,RP3-503F13.3,P01579,-,Biogrid
1,IFNG,GOPC,P01579,Q9HD26,Biogrid
2,IFNG,IFNGR2,P01579,P38484,Biogrid
3,IFNG,STAT6,P01579,P42226,Biogrid
4,IFNG,IFNG,P01579,P01579,Biogrid


In [7]:
# Restrict the interactome to BioGRID: keep only the rows whose
# "database source" value is exactly "Biogrid". The surviving rows keep
# their original index labels (the index is not reset here).
is_biogrid = df_Biogrid["database source"] == "Biogrid"
df_Biogrid = df_Biogrid[is_biogrid]
df_Biogrid.head(5)

Unnamed: 0,interactor A gene symbol,interactor B gene symbol,interactor A Uniprot AC,interactor B Uniprot AC,database source
0,IFNG,RP3-503F13.3,P01579,-,Biogrid
1,IFNG,GOPC,P01579,Q9HD26,Biogrid
2,IFNG,IFNGR2,P01579,P38484,Biogrid
3,IFNG,STAT6,P01579,P42226,Biogrid
4,IFNG,IFNG,P01579,P01579,Biogrid


In [8]:
# create network file for DIAMOnD
# One interaction per line, written as "<gene A> <gene B>" (space separated).
# The two symbol columns are walked positionally with zip() instead of being
# looked up as df[col][idx]: df_Biogrid is the result of a boolean filter and
# keeps its original index labels, so a label lookup over range(len(df)) can
# raise KeyError or silently pick the wrong rows once any row was filtered out.
with open('DIAMOnD/network_file_DIAMOnD', 'w', newline='') as f:
    for gene_a, gene_b in zip(df_Biogrid["interactor A gene symbol"],
                              df_Biogrid["interactor B gene symbol"]):
        f.write('{} {}\n'.format(gene_a, gene_b))
# No explicit f.close(): the with-block closes the file on exit.

In [9]:
# create seed file for DIAMOnD
# One seed gene symbol per line, taken from the "seed Gene" column in row order.
# Iterating the column directly avoids depending on the frame's index labels,
# and the with-block closes the file, so no explicit f.close() is needed.
with open('DIAMOnD/seed_file_DIAMOnD', 'w', newline='') as f:
    f.writelines('{}\n'.format(gene) for gene in df_seed_genes["seed Gene"])

In [10]:
# execute DIAMOnD algorithm
# Runs DIAMOnD.py through the `python2` interpreter as a shell command, passing
# (in order) the network file, the seed file, the literals 200 and 1, and the
# output file path that the next cells read back.
# NOTE(review): 200 is presumably the number of genes DIAMOnD should add and 1
# the seed weight (alpha) — confirm against DIAMOnD.py's usage text.
!python2 DIAMOnD.py "DIAMOnD/network_file_DIAMOnD" "DIAMOnD/seed_file_DIAMOnD" 200 1 "DIAMOnD/output_DIAMOnD"

DIAMOnD(): ignoring 1 of 102 seed genes that are not in the network

 results have been saved to 'DIAMOnD/output_DIAMOnD' 



In [11]:
# open DIAMOnD output
# The with-block guarantees the handle is closed even if reading fails.
with open("DIAMOnD/output_DIAMOnD") as f:
    lines = f.read().split("\n")
# Skip the first line (presumably the header row — confirm against the output
# file) and drop blank lines. Filtering blanks, rather than always discarding
# the final element, keeps the last record when the file has no trailing newline.
lines = [line for line in lines[1:] if line.strip()]

In [12]:
# get putative disease genes
# For every output line, take everything from the first tab onward and strip
# the tab characters, leaving the text that follows the leading column.
disease_genes = [
    re.search("[\t].+", row)[0].replace("\t", "")
    for row in lines
]

In [13]:
# Preview the first five extracted gene names.
disease_genes[:5]

['RP3-339A18.4', 'HSPC057', 'RP3-330O12.3', 'KB-152G3.1', 'RP1-319D22.1']

In [14]:
# perform enrichment analysis
# One Enrichr query per gene-set library, all against the same gene list and
# all writing into the same output directory with the same cutoff and format.
# Each pair is (description label, Enrichr library name); add a pair here to
# query another library instead of copy-pasting a call.
enrichr_queries = [
    ('gene_ontology', 'GO_Biological_Process_2018'),
    ('gene_ontology', 'GO_Cellular_Component_2018'),
    ('gene_ontology', 'GO_Molecular_Function_2018'),
    ('pathway', 'KEGG_2019_Human'),
]
for description, gene_sets in enrichr_queries:
    # The returned Enrichr object is not kept; this cell is run for the files it writes to outdir.
    gseapy.enrichr(gene_list=disease_genes, description=description, gene_sets=gene_sets,
                   outdir='GO/putative_disease_genes', cutoff=0.05, format='png')

<gseapy.enrichr.Enrichr at 0x7fce9425d160>

In [15]:
# save first 30 putative disease genes
# One gene per line, in list order. Slicing is safe when fewer than 30 genes
# exist. The with-block closes the file, so no explicit f.close() is needed.
with open('putative_disease_genes.txt', 'w', newline='') as f:
    f.writelines('{}\n'.format(gene) for gene in disease_genes[:30])

In [18]:
# save all of the putative disease genes
# One gene per line, in list order. The with-block closes the file, so no
# explicit f.close() is needed.
with open('putative_disease_genes_tot.txt', 'w', newline='') as f:
    f.writelines('{}\n'.format(gene) for gene in disease_genes)