# BioInformatics GROUP 5

## Rheumatoid arthritis-related human-oral microbiome proteins 

In [1]:
import bioservices as bi
import pandas as pd
import numpy as np
import json

In [2]:
def save_json(name, data):
    with open(name, "w") as f:
         json.dump(data, f)
    print("done.")

In [3]:
def load_json(name, data=None):
    with open(name, "r") as f:
         data = json.load(f)
    return data

In [4]:
seed_dict = load_json("../data/seed.json")

## seed symbol

In [196]:
seed=seed_dict["seed"]

## seed uniprot AC

In [195]:
seed_AC=seed_dict["seed_AC"]

## databases and mapping

In [7]:
biogrid=load_json("../data/biogrid/bioGrid.json")

In [8]:
string=load_json("../data/string/String.json")

In [9]:
apid=load_json("../data/apid/apid.json")

In [10]:
biogrid_map=load_json("../data/biogrid/bioGrid_map_ENTRZ_UNIPROT.json")

In [11]:
apid_map=load_json("../data/apid/apid_map_uniprotAC_sym.json")

In [12]:
string_map=load_json("../data/string/String_map_UniprotID_UniprotAC.json")

# 3.2) Store the data gathered from the three DBs

# protein A gene symbol, protein A Uniprot AC, interaction type, protein B gene symbol, protein B Uniprot AC, database source

In [13]:
def build_biogrid(biogrid, biogrid_map, flag=False):
    n = len(biogrid["OFFICIAL_SYMBOL_A"])
    biogrid_new = {"SYMBOL_A": [], "SYMBOL_B": [], "UNIPROT_AC_A": [], "UNIPROT_AC_B": [], "INTERACTION_TYPE": [], "SOURCE": []}
    
    biogrid_new["SYMBOL_A"]=biogrid["OFFICIAL_SYMBOL_A"]
    biogrid_new["SYMBOL_B"]=biogrid["OFFICIAL_SYMBOL_B"]
    biogrid_new["INTERACTION_TYPE"]=biogrid["EXPERIMENTAL_SYSTEM_TYPE"]
    
    biogrid_new["UNIPROT_AC_A"] = [biogrid_map.get(i) for i in biogrid["ENTREZ_GENE_A"]]
    biogrid_new["UNIPROT_AC_B"] = [biogrid_map.get(i) for i in biogrid["ENTREZ_GENE_B"]]
    biogrid_new["SOURCE"] = ["BIOGRID" for i in range(n)]
    
    biogrid_df=pd.DataFrame(biogrid_new, columns=["SYMBOL_A", "UNIPROT_AC_A", "INTERACTION_TYPE", "SYMBOL_B", "UNIPROT_AC_B", "SOURCE"])
    
    if flag:
        save_json("./data/biogrid/BIOGRID.json", biogrid_new)
        biogrid_df.to_csv("./data/biogrid/BIOGRID.csv")
    return biogrid_df

In [14]:
biogrid_df=build_biogrid(biogrid, biogrid_map)

In [30]:
def build_string(string, string_map, flag=False):
    n = len(string["preferredName_A"])
    string_new = {"SYMBOL_A": [], "SYMBOL_B": [], "UNIPROT_AC_A": [], "UNIPROT_AC_B": [], "INTERACTION_TYPE": [], "SOURCE": []}
    
    string_new["SYMBOL_A"]=string["preferredName_A"]
    string_new["SYMBOL_B"]=string["preferredName_B"]
    string_new["INTERACTION_TYPE"]=[None for i in range(n)]
    
    string_new["UNIPROT_AC_A"] = [string_map.get(i) for i in string["preferredName_A"]]
    string_new["UNIPROT_AC_B"] = [string_map.get(i) for i in string["preferredName_B"]]
    string_new["SOURCE"] = ["STRING" for i in range(n)]
    
    string_df=pd.DataFrame(string_new, columns=["SYMBOL_A", "UNIPROT_AC_A", "INTERACTION_TYPE", "SYMBOL_B", "UNIPROT_AC_B", "SOURCE"])
    
    if flag:
        save_json("./data/string/STRING.json", string_new)
        string_df.to_csv("./data/string/STRING.csv")
    return string_df

In [16]:
mapping_string=load_json("../data/string/String_map_UniprotID_UniprotAC.json")

In [31]:
string_df=build_string(string, mapping_string)

In [59]:
def build_apid(apid, apid_map, flag=False):
    apid_A=[]
    apid_B=[]
    for k in apid:
        for i in apid[k]:
            apid_A.append(k)
            apid_B.append(i)
    sym_A=[apid_map.get(i) for i in apid_A]
    sym_B=[apid_map.get(i) for i in apid_B]
    
    n = len(apid_A)
    apid_new = {"SYMBOL_A": [], "SYMBOL_B": [], "UNIPROT_AC_A": [], "UNIPROT_AC_B": [], "INTERACTION_TYPE": [], "SOURCE": []}
    
    apid_new["SYMBOL_A"]=sym_A
    apid_new["SYMBOL_B"]=sym_B
    apid_new["INTERACTION_TYPE"]=[None for i in range(n)]
    
    apid_new["UNIPROT_AC_A"] = apid_A
    apid_new["UNIPROT_AC_B"] = apid_B
    apid_new["SOURCE"] = ["APID" for i in range(n)]
    
    apid_df=pd.DataFrame(apid_new, columns=["SYMBOL_A", "UNIPROT_AC_A", "INTERACTION_TYPE", "SYMBOL_B", "UNIPROT_AC_B", "SOURCE"])
    
    if flag:
        save_json("./data/apid/APID.json", apid_new)
        apid_df.to_csv("./data/apid/APID.csv")
    return apid_df

In [181]:
apid_df = build_apid(apid, apid_map)

In [198]:
databases = {"biogrid": biogrid_df, "string": string_df, "apid": apid_df}

In [199]:
biogrid_df.head()

Unnamed: 0,SYMBOL_A,UNIPROT_AC_A,INTERACTION_TYPE,SYMBOL_B,UNIPROT_AC_B,SOURCE
0,CDKN2A,G3XAG3,physical,PCNA,P12004,BIOGRID
1,RUVBL1,B3KRS7,physical,H2AFX,P16104,BIOGRID
2,ABI1,A0A0A0MRT6,physical,ENAH,Q8N8S7,BIOGRID
3,HMOX2,A0A087WT44,physical,PTRH2,J3KQ48,BIOGRID
4,CUL7,Q14999,physical,SKP1,P63208,BIOGRID


In [200]:
apid_df.head()

Unnamed: 0,SYMBOL_A,UNIPROT_AC_A,INTERACTION_TYPE,SYMBOL_B,UNIPROT_AC_B,SOURCE
0,CUL2,Q13617,,Q5NGN2,Q5NGN2,APID
1,CUL2,Q13617,,CAND1,Q86VP6,APID
2,CUL2,Q13617,,LLR1,Q96L50,APID
3,CUL2,Q13617,,ELOC,Q15369,APID
4,CUL2,Q13617,,RBX1,P62877,APID


In [201]:
string_df.head()

Unnamed: 0,SYMBOL_A,UNIPROT_AC_A,INTERACTION_TYPE,SYMBOL_B,UNIPROT_AC_B,SOURCE
0,VAMP3,Q15836,,ARF5,P84085,STRING
1,PLEK,P08567,,ARF5,P84085,STRING
2,ZAP70,P43403,,PLEK,P08567,STRING
3,ARF1,P84077,,ARF5,P84085,STRING
4,ARF1,P84077,,PLEK,P08567,STRING


## 3.3)

In [202]:
def check_seed_AC(data, seed_AC):
    s = set(seed_AC)
    set_data=set(data["UNIPROT_AC_A"]).union(set(data["UNIPROT_AC_B"]))
    logic=len(set_data.intersection(s)) == len(seed_AC)
    if logic:
        print(len(set_data.intersection(s)))
    return logic

def check_seed_symb(data, seed):
    s = set(seed)
    set_data=set(data["SYMBOL_A"]).union(set(data["SYMBOL_B"]))
    logic=len(set_data.intersection(s)) == len(seed)
    if logic:
        print(len(set_data.intersection(s)))
    return logic

def check_seed(data, seed, seed_AC):
    logic1=check_seed_symb(data, seed)
    logic2=check_seed_AC(data, seed_AC)
    if logic1 == True:
        print("Symbol")
    if logic2 == True:
        print("Uniprot_AC")
    return logic1 or logic2

## no. of seed genes found in each different DBs

In [203]:
for d in databases:
    print(d + ": ", check_seed(databases[d], seed, seed_AC))

54
Symbol
biogrid:  True
54
Symbol
string:  True
54
Uniprot_AC
apid:  True


## total no. of interacting proteins, including seed genes, for each DB;

In [189]:
def check_set_genes(data, flag="symbol"):
    
    if flag=="symbol":
        tot=set(data["SYMBOL_A"]).union(set(data["SYMBOL_B"]))
    elif flag=="uniprot_AC":
        tot=set(data["UNIPROT_AC_A"]).union(set(data["UNIPROT_AC_B"]))
    return tot

In [204]:
for d in databases:
    print(d + ": ", len(check_set_genes(databases[d])), "symbol")

biogrid:  3898 symbol
string:  6019 symbol
apid:  4505 symbol


In [205]:
for d in databases:
    print(d + ": ", len(check_set_genes(databases[d])), "uniprot_AC")

biogrid:  3898 uniprot_AC
string:  6019 uniprot_AC
apid:  4505 uniprot_AC


## total no. of interactions found in each DB (without repetition)

In [206]:
def check_gene_interaction(data, symA, symB, seed):

    A=data[symA]
    B=data[symB]
    interactions=list(zip(A,B))

    interactions_set = set()
    interactions_set_seed = set()

    for t in interactions:
        if t[0] != t[1]:
            l1=(t[0], t[1])
            l2=(t[1], t[0])

            if ( (l1 not in interactions_set) and (l2 not in interactions_set) ): 
                interactions_set.add(l1)

            if t[0] in seed or t[1] in seed:
                if ( (l1 not in interactions_set_seed) and (l2 not in interactions_set_seed) ): 
                    interactions_set_seed.add(l1)
    return interactions_set, interactions_set_seed

In [207]:
for d in databases:
    interactions_set, interactions_set_seed=check_gene_interaction(databases[d], "SYMBOL_A", "SYMBOL_B", seed)
    
    print("interactions " + d + ": ", len(interactions_set))
    print("interactions with seed " + d + ": ", len(interactions_set_seed))
    print()

interactions biogrid:  36301
interactions with seed biogrid:  5834

interactions string:  275510
interactions with seed string:  11994

interactions apid:  7219
interactions with seed apid:  4293



In [208]:
for d in databases:
    interactions_set, interactions_set_seed=check_gene_interaction(databases[d], "UNIPROT_AC_A", "UNIPROT_AC_B", seed_AC)
    
    print("interactions " + d + ": ", len(interactions_set))
    print("interactions with seed " + d + ": ", len(interactions_set_seed))
    print()

interactions biogrid:  36021
interactions with seed biogrid:  2629

interactions string:  256316
interactions with seed string:  10418

interactions apid:  7463
interactions with seed apid:  7463



### build new databases with only rows where there is at least a gene seed

In [212]:
def retrieve_indices(data, seed_AC, col1="UNIPROT_AC_A", col2="UNIPROT_AC_B"):

    idx = []
    for i in seed_AC:
        ix = data[col1].index[data[col1] == i].tolist()
        idx.extend(ix)
        
    for i in seed_AC:
        ix = data[col2].index[data[col2] == i].tolist()
        idx.extend(ix)
        
    idx = sorted(set(idx))
    return idx

In [213]:
def build_databases_seeds(databases):
    '''compute databases with only interactions with at least one seed gene'''
    seed_databases = {}

    for d in databases:
        idx=retrieve_indices(databases[d], seed_AC)

        subsample = databases[d].iloc[idx]
        print(d)
        seed_databases[d] = subsample
    return seed_databases

In [214]:
seed_databases=build_databases_seeds(databases)

biogrid
string
apid


In [215]:
for d in seed_databases:
    print(d + ": ", len(seed_databases[d]))

biogrid:  4626
string:  23285
apid:  7537


### concat all databases 

In [216]:
res=[seed_databases["biogrid"], seed_databases["string"], seed_databases["apid"]]

In [217]:
whole_database=pd.concat(res, ignore_index=True)

##  4.1) interactome

In [218]:
def f_interactome(new, seed_AC, col1="UNIPROT_AC_A", col2="UNIPROT_AC_B"):
    '''
    only seed genes interactions
    '''
    A = new[col1]
    B = new[col2]
    
    new_index = []
    new_set= set()
    count=0
    for i in range(len(A)):
        l1=(A[i], B[i])
        l2=(B[i], A[i])
        if A[i] in seed_AC and B[i] in seed_AC and A[i] != B[i]:
            count += 1
            if l1 not in new_set and l2 not in new_set:
                new_set.add(l1)
                new_index.append(i)
    return new_index

In [219]:
new_index=f_interactome(whole_database, seed_AC)

In [220]:
interactome=whole_database.iloc[new_index]

In [None]:
#interactome.to_csv("./data/interactions/interactome.csv")

In [221]:
len(interactome)

80

## 4.2) union interactome

In [227]:
def f_union_interactome(new, seed_AC, col1="UNIPROT_AC_A", col2="UNIPROT_AC_B"):
    '''
    union interaction with at least one seed gene
    '''
    A = new[col1]
    B = new[col2]
    
    new_index = []
    new_set= set()
    count=0
    for i in range(len(A)):
        if A[i] != B[i]:
            l1=(A[i], B[i])
            l2=(B[i], A[i])
            if A[i] in seed_AC or B[i] in seed_AC:
                count += 1
                if l1 not in new_set and l2 not in new_set:
                    new_set.add(l1)
                    new_index.append(i)
                
    return new_index

In [228]:
new_index=f_union_interactome(whole_database, seed_AC)

In [229]:
union_interactome=whole_database.iloc[new_index]

In [230]:
len(union_interactome)

16603

In [231]:
union_interactome.head()

Unnamed: 0,SYMBOL_A,UNIPROT_AC_A,INTERACTION_TYPE,SYMBOL_B,UNIPROT_AC_B,SOURCE
0,ABI1,A0A0A0MRT6,physical,ENAH,Q8N8S7,BIOGRID
1,CUL7,Q14999,physical,SKP1,P63208,BIOGRID
2,SUV39H1,O43463,physical,RBBP4,Q09028,BIOGRID
3,LMNA,P02545,physical,PMS2,B4DGM0,BIOGRID
4,NASP,P49321,physical,RBBP4,Q09028,BIOGRID


## 4.3) intersection interactome

In [234]:
def f_intersection_interactome(res, seed_AC, col1="UNIPROT_AC_A", col2="UNIPROT_AC_B"):
    
    '''intersection interaction with at least one seed gene ------ to improve'''

    new_dict = {}
    intersection_df = pd.DataFrame(columns=res[0].columns)
    for r in res:
        temp = set()
        database=pd.DataFrame(r.values,columns=r.columns)
        new_index=f_union_interactome(database, seed_AC)
        A=database[col1].values
        B=database[col2].values
        for i in new_index:
            if A[i] != B[i]:
                l1=(A[i], B[i])
                l2=(B[i], A[i])

                if l1 not in temp and l2 not in temp:
                    temp.add(l1)
                    new_dict[l1] = new_dict.get(l1, 0) + 1 
                    if new_dict[l1] == len(res):
                        t=database.iloc[i]
                        intersection_df=intersection_df.append(t,ignore_index=True)

    return intersection_df

In [235]:
intersection_interactome=f_intersection_interactome(res, seed_AC)

In [239]:
len(intersection_interactome)

105

In [237]:
intersection_interactome.head()

Unnamed: 0,SYMBOL_A,UNIPROT_AC_A,INTERACTION_TYPE,SYMBOL_B,UNIPROT_AC_B,SOURCE
0,CALR,P27797,,K1C17,Q04695,APID
1,CALR,P27797,,MARE3,Q9UPY8,APID
2,CALR,P27797,,PERF,P14222,APID
3,CALR,P27797,,VWF,P04275,APID
4,CALR,P27797,,CALX,P27824,APID


# enrichment

In [252]:
import json
import requests

def retrieveUserListId(sym_genes):
    ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/addList'
    genes_str = '\n'.join(sym_genes)
    description = 'Example gene list'
    payload = {
        'list': (None, genes_str),
        'description': (None, description)
    }
    response = requests.post(ENRICHR_URL, files=payload)
    if not response.ok:
        raise Exception('Error analyzing gene list')
        
    data = json.loads(response.text)
    return data

In [255]:
def retrieveGO(data):
    ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/enrich'
    query_string = '?userListId=%s&backgroundType=%s'
    user_list_id = data['userListId']
    db = 'GO_Biological_Process_2017b'
    response = requests.get(
        ENRICHR_URL + query_string % (user_list_id, db)
     )
    if not response.ok:
        raise Exception('Error fetching enrichment results')

    GO = json.loads(response.text)
    return GO

In [258]:
def retrievePathway(data):
    ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/enrich'
    query_string = '?userListId=%s&backgroundType=%s'
    user_list_id = data['userListId']
    db = 'KEGG_2016'
    response = requests.get(
        ENRICHR_URL + query_string % (user_list_id, db)
     )
    if not response.ok:
        raise Exception('Error fetching enrichment results')

    path = json.loads(response.text)
    return path

In [292]:
def routineGO(data, module_index=[], clustering_method=[]):
    GO=retrieveGO(data)
    GO=GO["GO_Biological_Process_2017b"][:10]
    GO=[go[:6] for go in GO]
    GO_df=pd.DataFrame(GO)
    GO_df.columns = ["Rank", "Term name", "P-value", "Z-score", "Combined score", "Overlapping genes"]
#     if module_index != [] and clustering_method != []:
#         GO_df["module index"] = module_index
#         GO_df["clustering method"] = clustering_method
    return GO_df

In [326]:
intersection=set(intersection_interactome["SYMBOL_A"].values)

In [327]:
intersection=intersection.union(set(intersection_interactome["SYMBOL_B"].values))

In [328]:
intersection=list(intersection)

In [329]:
len(intersection)

114

In [330]:
data=retrieveUserListId(intersection)

In [331]:
GO_df=routineGO(data)

In [332]:
GO_df.to_csv("./go_intersection.csv", index=None)

In [333]:
GO_df

Unnamed: 0,Rank,Term name,P-value,Z-score,Combined score,Overlapping genes
0,1,positive regulation of DNA repair by positive ...,8e-05,-6.554776,61.820955,"[HDAC5, PCNA, STAT1, STAT3, FOS, BRCC3, KAT2B,..."
1,2,positive regulation of transcription from RNA ...,0.000351,-7.562705,60.151745,"[HDAC5, NCF1, STAT1, STAT2, ANXA4, STAT3, FOS,..."
2,3,chromatin-mediated maintenance of transcriptio...,7.7e-05,-5.519284,52.288773,"[KAT2B, HDAC5, NCF1, RBBP4, STAT1, IRF1, STAT3..."
3,4,negative regulation of apoptotic process (GO:0...,7.4e-05,-5.39821,51.33727,"[STAT1, SRC, ANXA4, TRAF6, STAT3, UBC, FLNA, E..."
4,5,canonical Wnt signaling pathway involved in ne...,5.3e-05,-5.072577,49.983296,"[SRC, ANXA4, TRAF6, STAT3, UBC, DVL2, FLNA, ER..."
5,6,positive regulation of telomeric RNA transcrip...,0.000833,-6.252257,44.328312,"[KAT2B, HDAC5, STAT1, TRAF6, MYOD1, IRF1, STAT..."
6,7,positive regulation of mating type switching b...,0.000833,-6.250891,44.318627,"[KAT2B, HDAC5, STAT1, TRAF6, MYOD1, IRF1, STAT..."
7,8,positive regulation of transcription from RNA ...,0.000833,-6.239939,44.24098,"[KAT2B, HDAC5, STAT1, TRAF6, MYOD1, IRF1, STAT..."
8,9,positive regulation of transcription from RNA ...,0.000833,-6.237291,44.222209,"[KAT2B, HDAC5, STAT1, TRAF6, MYOD1, IRF1, STAT..."
9,10,positive regulation of pseudohyphal growth by ...,0.000833,-6.236262,44.214911,"[KAT2B, HDAC5, STAT1, TRAF6, MYOD1, IRF1, STAT..."


In [299]:
def routinePathway(data, module_index=[], clustering_method=[]):
    pathway=retrievePathway(data)
    pathway=pathway["KEGG_2016"][:10]
    pathway=[path[:6] for path in pathway]
    pathway_df=pd.DataFrame(pathway)
    pathway_df.columns = ["Rank", 'name', 'P-value', 'Z-score', 'Combined score', "overlapping genes"]
#     if module_index != [] and clustering_method != []:
#         pathway_df["module index"] = module_index
#         pathway_df["clustering method"] = clustering_method
    return pathway_df

In [334]:
pathway_df=routinePathway(data)

In [335]:
pathway_df.to_csv("./path_intersection.csv", index=None)