# BioInformatics GROUP 5

## Rheumatoid arthritis-related human-oral microbiome proteins 

### [bioservices docs](http://bioservices.readthedocs.io/en/master/references.html)

### [fasta format](https://en.wikipedia.org/wiki/FASTA_format)

In [58]:
from multiprocessing.pool import ThreadPool
import bioservices as bi
import pandas as pd
import numpy as np
import math

In [3]:
# Seed genes as a single comma-separated string. clean() later splits it on
# ", " and, for tokens containing a backslash, keeps only the stripped text
# after that backslash.
# NOTE(review): as exported here, a single-quoted literal cannot continue onto
# the next line after an escaped backslash; confirm against the original
# notebook cell.
seed = 'ACAP2, ADAP1, AGFG1, ANXA4, AURKC, BCCIP, BECN1, BDKRB2, BRCA2, CALR, CAV1, CAPZA2, CDC42, CDK1, CLTC, CUL2, DCLK1, DMPK, POLD1, \\
        EIF2AK2, ENAH, BRIP1, GGA2, GNPNAT1, HSF4, IFIT3, ILF3, ITGAV, KIF3A, KATNB1, LMNA, MAP3K8, MEF2D, MAPK3, MAP2K4, NDUFA13, PPP3CA, \\
        PTPN1, RABGEF1, RAP2A, RAB27A, RBBP4, SERINC3, SKP1, SORT1, SPI1, SQSTM1, SRF, STAT1, STAT3, STAT6, TUBA4A, FASLG, UBE2L3'

In [4]:
def clean(seed):
    """Turn the comma-separated seed string into a list of gene tokens.

    The whole string is stripped and split on ", ". A token that contains
    a backslash is replaced by the stripped text found between its first
    and second backslash (or everything after the backslash when there is
    only one). Tokens without a backslash are kept exactly as they are.
    """
    tokens = seed.strip().split(", ")
    symbols = []
    for token in tokens:
        pieces = token.split("\\")
        symbols.append(pieces[1].strip() if len(pieces) > 1 else token)
    return symbols

seed = clean(seed)

## 1) seed genes list

In [5]:
print(seed)

['ACAP2', 'ADAP1', 'AGFG1', 'ANXA4', 'AURKC', 'BCCIP', 'BECN1', 'BDKRB2', 'BRCA2', 'CALR', 'CAV1', 'CAPZA2', 'CDC42', 'CDK1', 'CLTC', 'CUL2', 'DCLK1', 'DMPK', 'POLD1', 'EIF2AK2', 'ENAH', 'BRIP1', 'GGA2', 'GNPNAT1', 'HSF4', 'IFIT3', 'ILF3', 'ITGAV', 'KIF3A', 'KATNB1', 'LMNA', 'MAP3K8', 'MEF2D', 'MAPK3', 'MAP2K4', 'NDUFA13', 'PPP3CA', 'PTPN1', 'RABGEF1', 'RAP2A', 'RAB27A', 'RBBP4', 'SERINC3', 'SKP1', 'SORT1', 'SPI1', 'SQSTM1', 'SRF', 'STAT1', 'STAT3', 'STAT6', 'TUBA4A', 'FASLG', 'UBE2L3']


In [5]:
len(seed)

54

## 2) Collect basic information about seed genes

#### Extract the [HGNC](https://www.genenames.org/) symbol, [UniProt](http://www.uniprot.org/) AC, [UniProt](http://www.uniprot.org/) ID, protein name, [Entrez](https://www.ncbi.nlm.nih.gov/Class/MLACourse/Original8Hour/Entrez/) Gene ID, and description

In [None]:
# Column-oriented accumulator: one list per field. In the code visible here,
# extract_sym_AC_entrezID fills 'symbols', 'uniprot_AC' and 'entrez_ID', and
# extract_ID_refName fills 'uniprot_ID' and 'uniprot_RefName'; nothing shown
# fills 'description'.
data = {'symbols':[],'uniprot_AC':[],'entrez_ID':[],'uniprot_ID':[],'uniprot_RefName':[],'description':[]}

In [7]:
hgnc = bi.HGNC()

In [59]:
u = bi.UniProt(verbose = False)

In [8]:
def extract_sym_AC_entrezID(data, seed):
    '''Append symbol, first UniProt AC and Entrez ID of every seed gene to data.

    For each gene the module-level ``hgnc`` client is queried by symbol and
    the first document of the response is used. Its 'symbol', the first
    item of its 'uniprot_ids' and its 'entrez_id' are appended to
    data['symbols'], data['uniprot_AC'] and data['entrez_ID'].

    Returns the same ``data`` dictionary. A response with no documents, or
    a document missing one of those fields, raises (IndexError / KeyError).
    '''
    for gene in seed:
        record = hgnc.fetch('symbol', gene)['response']['docs'][0]
        data['symbols'].append(record['symbol'])
        data['uniprot_AC'].append(record['uniprot_ids'][0])
        data['entrez_ID'].append(record['entrez_id'])
    return data

In [None]:
data = extract_sym_AC_entrezID(data, seed)

In [None]:
print(data["symbols"][:4])

In [None]:
print(data["uniprot_AC"][:4])

In [None]:
print(data["entrez_ID"][:4])

In [None]:
def check_symbol(data, seed):
    """Compare each seed gene with the symbol stored at the same position.

    Prints one 'Error for ' line per position where seed and
    data['symbols'] differ; if every position matches, prints a single
    confirmation line instead. Always returns None.
    """
    all_match = True
    for position, expected in enumerate(seed):
        if data['symbols'][position] != expected:
            print('Error for ', expected)
            all_match = False
    if all_match:
        print('All seed and symbols are updated')
    return None

In [None]:
check_symbol(data, seed)

In [None]:
def routine(pro, string):
    """Return the first child of the first element stored under ``string``.

    ``pro`` is indexed with ``string``; the first item of that result must
    expose a ``contents`` sequence, whose first entry is returned.
    """
    first_match = pro[string][0]
    return first_match.contents[0]

def extract_ID_refName(data, seed):
    """Append the UniProt ID and full name for each stored accession.

    For every position in ``seed`` the entry for data['uniprot_AC'][i] is
    retrieved as XML through the module-level ``u`` client, and its first
    'accession', 'name' and 'fullname' values are read with routine().
    When the retrieved accession equals the stored one, the name and
    full name are appended (as str) to data['uniprot_ID'] and
    data['uniprot_RefName']. Otherwise a "problem with ..." message is
    printed and nothing is appended for that position, so those two lists
    can end up shorter than data['uniprot_AC'].

    Returns the same ``data`` dictionary.
    """
    for position in range(len(seed)):
        expected_ac = data['uniprot_AC'][position]
        entry = u.retrieve(expected_ac, frmt="xml")
        accession = routine(entry, "accession")
        entry_name = routine(entry, "name")
        full_name = routine(entry, "fullname")

        if str(accession) != expected_ac:
            print("problem with " + accession)
            continue
        data['uniprot_ID'].append(str(entry_name))
        data['uniprot_RefName'].append(str(full_name))
    return data

In [None]:
data = extract_ID_refName(data, seed)

In [None]:
print(data["uniprot_ID"][:4])

In [None]:
print(data["uniprot_RefName"][:4])

In [6]:
def save_json(name, data):
    """Write ``data`` as JSON to the file ``name`` (overwriting it), then print "done."."""
    import json

    with open(name, "w") as handle:
        json.dump(data, handle)
    print("done.")
            
def load_json(name, data=None):
    """Read the file ``name`` and return its parsed JSON content.

    The ``data`` argument is accepted but its value is never used.
    """
    import json

    with open(name, "r") as handle:
        return json.load(handle)

## 3) Collect interaction data

### Collect all binary protein interactions

### Sources: [APID](http://cicblade.dep.usal.es:8080/APID/init.action) Level 2 human [interactome](https://en.wikipedia.org/wiki/Interactome), [BioGRID](https://wiki.thebiogrid.org/doku.php/biogridrest#list_of_parameters) Human v3.4.154, and [STRING](https://string-db.org/)

In [199]:
import requests  
from bs4 import BeautifulSoup as bs

In [200]:
# Read the BioGRID access key from a local file. strip() removes surrounding
# whitespace (including a trailing "\n" or "\r\n") without cutting off the
# last character of the key when the file has no trailing newline.
with open("bioGridKey.txt") as f:
    biogrid_key = f.read().strip()

In [205]:
def biogrid_routine(seed, start='0', key=None):
    """Fetch one page of BioGRID interactions for the given genes.

    Parameters
    ----------
    seed : iterable of str
        Gene names; they are joined with '|' into the ``geneList`` parameter.
    start : str
        Offset of the first result, passed as the ``start`` query parameter.
    key : str or None
        BioGRID access key. When None, the module-level ``biogrid_key`` is
        used.

    Returns
    -------
    The decoded JSON body when the interactions request answers with HTTP
    200; otherwise the unmodified ``requests`` response object.

    Notes
    -----
    The service version is requested first. The result is only used to
    print a confirmation on the first page; a different version does not
    stop the query.
    """
    human = '9606'
    url = 'http://webservice.thebiogrid.org/'
    if key is None:
        # The interactions query used to read the global key regardless of
        # this argument; keep that as the fallback only.
        key = biogrid_key

    query_version = url + 'version/?accessKey=' + key
    version = requests.get(query_version).text
    if version == '3.4.154' and start == '0':
        print("Right Database Version")

    geneList = '|'.join(seed)

    query_params = '&searchNames=true&start='+start+'&max=1000000&includeInteractors=true&includeInteractorInteractions=true&taxId='+human+'&format=json&accesskey='
    queryList = 'geneList=' + geneList
    # Use the key that was passed in, not the module-level one.
    query_biogrid = url + 'interactions/?' + queryList + query_params + key

    r = requests.get(query_biogrid)
    if r.status_code == 200:
        r = r.json()

    return r

In [213]:
def biogrid_api(seed, key=biogrid_key):
    """Collect all BioGRID interactions for ``seed``, page by page.

    Pages are requested through biogrid_routine() and merged into one
    dictionary. Another page is requested for as long as the previous one
    contained exactly 10000 entries, with a one-second pause between
    requests. The page counter is printed after each additional request.

    Parameters
    ----------
    seed : iterable of str
        Gene names forwarded to biogrid_routine().
    key : str
        BioGRID access key forwarded to every request.

    Returns
    -------
    dict
        Union of all pages returned by biogrid_routine().
    """
    # Imported locally because the notebook only imports ``time`` in a
    # later cell than this definition.
    import time

    page_size = 10000
    res = {}
    # Forward the caller's key; the global used to be passed here instead.
    page = biogrid_routine(seed, start="0", key=key)
    res.update(page)
    t = 0
    while len(page) == page_size:
        t += 1
        time.sleep(1)
        page = biogrid_routine(seed, start=str(page_size * t), key=key)
        print(t)
        res.update(page)
    return res

In [214]:
import time
res = biogrid_api(seed)

1
2
3
4
5
6


In [215]:
def bioGrid_process(res):
    """Pivot a mapping of interaction records into a dictionary of columns.

    ``res`` maps arbitrary keys to interaction dictionaries. For each
    interaction, the ten fields listed below are copied into the list of
    the same name, in iteration order of ``res``. Any other field of an
    interaction is ignored; a missing one raises KeyError.
    """
    fields = (
        'BIOGRID_ID_A', 'BIOGRID_ID_B', 'BIOGRID_INTERACTION_ID',
        'ENTREZ_GENE_A', 'ENTREZ_GENE_B', 'EXPERIMENTAL_SYSTEM',
        'OFFICIAL_SYMBOL_A', 'OFFICIAL_SYMBOL_B', 'THROUGHPUT',
        'EXPERIMENTAL_SYSTEM_TYPE',
    )
    bioGrid = {field: [] for field in fields}

    for interaction in res.values():
        for field in fields:
            bioGrid[field].append(interaction[field])
    return bioGrid

In [216]:
bioGrid = bioGrid_process(res)

### Identifier mapping for STRING

In [11]:
def buildStringRequest(seed, method, url="http://string-db.org/api", f="/json/"):
    """Assemble the request URL for one STRING API method.

    For the "resolve" method ``seed`` is a single identifier string and is
    inserted as-is. For any other method ``seed`` is iterated and every
    item is appended prefixed by "%0D" (so the list also starts with
    "%0D"). The species (9606) and the caller identity are appended as
    further query parameters.
    """
    if method == "resolve":
        identifiers = seed
    else:
        identifiers = "".join("%0D" + str(gene) for gene in seed)

    query = "?identifiers=" + identifiers
    query += "&species=" + "9606"
    query += "&caller_identity=" + "SapienzaBioInformatics"
    return url + f + method + query

In [12]:
def StringMapping(seed):
    """Resolve each gene to a STRING identifier and annotation.

    One "resolve" request is sent per gene. For an HTTP 200 answer, the
    'stringId' and 'annotation' of the first entry of the JSON body are
    collected; genes whose request returns another status are skipped, so
    the result lists can be shorter than ``seed``.

    Returns a dictionary with the keys "StringIDs" and "annotations".
    """
    StringIDs, annotations = [], []
    for gene in seed:
        response = requests.get(buildStringRequest(gene, "resolve"))
        if response.status_code != 200:
            continue
        first_entry = response.json()[0]
        StringIDs.append(first_entry["stringId"])
        annotations.append(first_entry["annotation"])

    return {"StringIDs": StringIDs, "annotations": annotations}

In [14]:
String_map = StringMapping(seed)

In [None]:
def StringEnrichment(StringIDs, method):
    """Call a STRING API method for a list of identifiers.

    The request URL is printed before it is sent. On HTTP 200 the first
    element of the JSON body is returned; for any other status the
    response object itself is returned.
    """
    request_url = buildStringRequest(StringIDs, method)
    print(request_url)
    response = requests.get(request_url)
    return response.json()[0] if response.status_code == 200 else response

In [None]:
enrichment = StringEnrichment(StringIDs, "enrichment")

In [None]:
ppi_enrichment = StringEnrichment(StringIDs, "ppi_enrichment")

### [Network Interactions in String](http://string-gamma.org/cgi/help.pl?subpage=api%23getting-the-string-network-interactions)

In [None]:
#http://string-db.org/api/json/network?identifiers=DRD1_HUMAN%0dDRD2_HUMAN&add_nodes=100000

### This query is too demanding (the server returns error 524), so we have to split it into smaller requests

In [23]:
from time import sleep

def StringInteraction(String, method):
    """Query STRING once per identifier and collect the accepted records.

    For every entry of String["StringIDs"] a separate request for
    ``method`` is built (with "&add_nodes=100000" appended), the
    identifier and URL are printed, and each record of an HTTP 200 answer
    is passed through check(), which decides whether it is stored.
    Requests with another status are ignored.

    Returns a dictionary of four parallel lists: "stringId_A",
    "stringId_B", "preferredName_A" and "preferredName_B".
    """
    columns = ("stringId_A", "stringId_B", "preferredName_A", "preferredName_B")
    d = {column: [] for column in columns}

    # For the moment only first-order interactions: the second order seems
    # to be big in size.
    for gene in String["StringIDs"]:
        request_url = buildStringRequest([gene], method) + "&add_nodes=100000"
        print(gene)
        print(request_url)
        response = requests.get(request_url)
        if response.status_code != 200:
            continue
        for record in response.json():
            d = check(d, record)
    return d

In [34]:
def StringInteractionAll(String, method):
    """Query STRING for all identifiers in a single request.

    Builds one request for ``method`` covering every entry of
    String["StringIDs"] (with "&add_nodes=100000" appended), prints the
    identifier list and the URL, and passes each record of an HTTP 200
    answer through check(). Another status yields empty lists.

    Returns a dictionary of four parallel lists: "stringId_A",
    "stringId_B", "preferredName_A" and "preferredName_B".
    """
    columns = ("stringId_A", "stringId_B", "preferredName_A", "preferredName_B")
    d = {column: [] for column in columns}
    genes = String["StringIDs"]

    request_url = buildStringRequest(genes, method) + "&add_nodes=100000"
    print(genes)
    print(request_url)
    response = requests.get(request_url)
    if response.status_code != 200:
        return d
    for record in response.json():
        d = check(d, record)
    return d

In [35]:
res_ip=StringInteractionAll(String, "interaction_partners")

['9606.ENSP00000324287', '9606.ENSP00000265846', '9606.ENSP00000387282', '9606.ENSP00000377833', '9606.ENSP00000302898', '9606.ENSP00000357748', '9606.ENSP00000355231', '9606.ENSP00000307713', '9606.ENSP00000369497', '9606.ENSP00000320866', '9606.ENSP00000339191', '9606.ENSP00000354947', '9606.ENSP00000314458', '9606.ENSP00000378699', '9606.ENSP00000269122', '9606.ENSP00000444856', '9606.ENSP00000255448', '9606.ENSP00000345997', '9606.ENSP00000406046', '9606.ENSP00000233057', '9606.ENSP00000355809', '9606.ENSP00000259008', '9606.ENSP00000311962', '9606.ENSP00000216410', '9606.ENSP00000264009', '9606.ENSP00000360876', '9606.ENSP00000404121', '9606.ENSP00000261023', '9606.ENSP00000368020', '9606.ENSP00000368982', '9606.ENSP00000357283', '9606.ENSP00000263056', '9606.ENSP00000271555', '9606.ENSP00000263025', '9606.ENSP00000262445', '9606.ENSP00000423673', '9606.ENSP00000378323', '9606.ENSP00000360683', '9606.ENSP00000284957', '9606.ENSP00000245304', '9606.ENSP00000337761', '9606.ENSP00000

In [41]:
len(set(res_ip["preferredName_B"]))

5609

In [24]:
def check(d, r):
    """Store record ``r`` in ``d`` unless its three scores are all zero.

    The record's 'ascore', 'escore' and 'dscore' are read. If at least
    one of them is non-zero, its two STRING ids and two preferred names are
    appended to the lists of the same name in ``d``. ``d`` is modified in
    place and also returned.
    """
    scores = (r["ascore"], r["escore"], r["dscore"])
    if not any(score != 0 for score in scores):
        return d
    for column in ("stringId_A", "stringId_B", "preferredName_A", "preferredName_B"):
        d[column].append(r[column])
    return d

In [26]:
res=StringInteraction(String, "interaction_partners")

9606.ENSP00000324287
http://string-db.org/api/json/interaction_partners?identifiers=%0D9606.ENSP00000324287&species=9606&caller_identity=SapienzaBioInformatics&add_nodes=100000
9606.ENSP00000265846
http://string-db.org/api/json/interaction_partners?identifiers=%0D9606.ENSP00000265846&species=9606&caller_identity=SapienzaBioInformatics&add_nodes=100000
9606.ENSP00000387282
http://string-db.org/api/json/interaction_partners?identifiers=%0D9606.ENSP00000387282&species=9606&caller_identity=SapienzaBioInformatics&add_nodes=100000
9606.ENSP00000377833
http://string-db.org/api/json/interaction_partners?identifiers=%0D9606.ENSP00000377833&species=9606&caller_identity=SapienzaBioInformatics&add_nodes=100000
9606.ENSP00000302898
http://string-db.org/api/json/interaction_partners?identifiers=%0D9606.ENSP00000302898&species=9606&caller_identity=SapienzaBioInformatics&add_nodes=100000
9606.ENSP00000357748
http://string-db.org/api/json/interaction_partners?identifiers=%0D9606.ENSP00000357748&species

In [28]:
len(res["preferredName_A"])

12071

In [40]:
len(set(res["preferredName_A"]))

54

In [29]:
l = list(set(res["preferredName_A"]).union(set(res["preferredName_B"])))

In [30]:
len(l)

5619

In [19]:
l = list(set(String["preferredName_A"]).union(set(String["preferredName_B"])))

In [None]:
temp = [i+"_HUMAN" for i in l]

In [24]:
# partially correct ---> use also HGNC
# mapping = u.mapping("ID", "ACC", query=temp )

# t=0
# for i in words:
#     t+=1
#     try:
#         dictio = hgnc.fetch('symbol', i)
#         new_dictio = dictio['response']['docs'][0]
#         mapping_string[i] = new_dictio['uniprot_ids'][0]
#     except:
#         print(i)
#         print(t)
#         data.append(i)

In [None]:
# def worker(genes):
#     string = []
#     n = len(genes)
#     #url="http://string-gamma.org/api"
#     method="network"
#     print("Worker")
#     k = 0
#     for binary in genes:
#         k +=1
#         request_url=buildStringRequest(binary, method)
#         request_url += "&add_nodes=100000" 
#         #print(request_url)
#         r=requests.get(request_url)
#         if r.status_code == 200:
#             r = r.json()
#             string.extend(r)
            
#         if k % 20 == 0:
#             print(k)
#     return string

In [None]:
# def generate_batch(String, n_pool):
#     Str = String["StringIDs"]
#     n = len(Str)
#     genes = []
#     Str_par = []
#     for i in range(n):
#         for j in range((i+1),n):
#             genes.append((Str[i], Str[j]))

#     ofs = math.ceil(len(genes)/n_pool)
#     for i in range(n_pool):
#         s = int( i * ofs )
#         e = int( (i+1) * ofs )
#         Str_par.append(genes[s:e])
#     return Str_par

In [None]:
# def parallel(String, worker, n_pool=4):
    
#     data = generate_batch(String, n_pool)
    
#     pool = ThreadPool(n_pool)
#     results = pool.map(worker, data)
#     pool.close() 
#     pool.join()
    
#     return results

In [None]:
# results = parallel(String, worker)

## [APID](http://cicblade.dep.usal.es:8080/APID/Interactions.action?protein=Q15057): queries use the UniProt accession (AC)

In [None]:
import requests
from bs4 import BeautifulSoup as bs

In [None]:
def APID_routine(query):
    """Scrape the UniProt accessions linked from one APID result page.

    The page at ``query`` is downloaded and its HTTP status code printed.
    On HTTP 200, every anchor whose href starts with
    "http://www.uniprot.org/uniprot/" contributes the text following
    "uniprot/" in that href.

    Returns the list of extracted strings, in page order (empty when the
    status is not 200 or no such link is present).
    """
    uniprot_prefix = "http://www.uniprot.org/uniprot/"
    data = []
    r = requests.get(query)
    print(r.status_code)
    if r.status_code == 200:
        soup = bs(r.text, "lxml")
        for element in soup.find_all("a"):
            href = element.get("href")
            # Anchors without an href attribute give None here; skip them
            # instead of failing on None.startswith().
            if href and href.startswith(uniprot_prefix):
                data.append(href.split("uniprot/")[1])
    return data

In [None]:
query = "http://cicblade.dep.usal.es:8080/APID/Interactions.action?protein=" + uniAC[0]

In [None]:
uniAC = load_json("./UniProt.json")["uniprot_AC"]

In [None]:
import time
def APID_api(seed):
    """Collect the APID interaction partners of every protein in ``seed``.

    For each entry the APID interactions page is scraped with
    APID_routine(); the entry and the query URL are printed, and the
    function sleeps two seconds after each request.

    Returns a dictionary mapping each entry of ``seed`` to the list
    returned by APID_routine() for it.
    """
    base_url = "http://cicblade.dep.usal.es:8080/APID/Interactions.action?protein="
    apid = {}
    for gene in seed:
        print(gene)
        query = base_url + gene
        apid[gene] = list(APID_routine(query))
        print(query)
        time.sleep(2)
    return apid

In [None]:
apid = APID_api(uniAC)

In [277]:
map_apid=load_json("../data/apid/apid_map_uniprotAC_sym.json")

In [281]:
map_string=load_json("../data/string/String_map_UniprotID_UniprotAC.json")