### Download sequence and structure data from UniProt

In [82]:
##import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import os

In [49]:
## Load the peptide table (CPid, UniProt accession(s), processed sequence);
## header=0 takes the column names from the first row of the sheet
data = pd.read_excel('seq.xlsx', header=0)
data.head()

Unnamed: 0,CPid,Uniprot,Origin_Seq
0,CP00295,P85421,IWGIGCNP
1,CP01056,P84645,GDPTFCGETCRVIPVCTYSAALGCTCDDRSDGLCKRN
2,CP02628,P01050,LTYTDCTESGQNLCLCEGSNVCGQGNKCILGSDGEKNQCVTGEGTP...
3,CP02629,P01263,CSNLSTCVLGKLSQELHKLQTYPRTNTGSGTP
4,CP02634,P01225,NSCELTNITIAIEKEECRFCISINTTWCAGYCYTRDLVYKDPARPK...


In [58]:
## Expand rows whose Uniprot cell holds several accessions joined by "##"
## into one row per accession (CPid and Origin_Seq are repeated for each)
CPids = []
Uniprot = []
Origin_Seq = []
# Iterate the columns in lockstep rather than via data.col[i]: indexing a
# Series with i is a label lookup and only works while the frame still has
# its default 0..n-1 index.
for CPid, seq, raw_ids in zip(data.CPid, data.Origin_Seq, data.Uniprot):
    for Uid in raw_ids.replace(" ", "").split("##"):
        if not Uid:
            # A stray or trailing "##" yields an empty token, which is not
            # an accession and would later map to the file "xmls/.xml".
            continue
        CPids.append(CPid)
        Uniprot.append(Uid)
        Origin_Seq.append(seq)
data2 = pd.DataFrame({
    'CPid':CPids,
    'Uniprot':Uniprot,
    'Origin_Seq':Origin_Seq
})
print(len(data2))
data2.head(15)

667


Unnamed: 0,CPid,Uniprot,Origin_Seq
0,CP00295,P85421,IWGIGCNP
1,CP01056,P84645,GDPTFCGETCRVIPVCTYSAALGCTCDDRSDGLCKRN
2,CP02628,P01050,LTYTDCTESGQNLCLCEGSNVCGQGNKCILGSDGEKNQCVTGEGTP...
3,CP02629,P01263,CSNLSTCVLGKLSQELHKLQTYPRTNTGSGTP
4,CP02634,P01225,NSCELTNITIAIEKEECRFCISINTTWCAGYCYTRDLVYKDPARPK...
5,CP02712,P01160,SLRRSSCFGGRMDRIGAQSGLGCNSFRY
6,CP02811,P22226,RLCRIVVIRVCR
7,CP02816,P56879,GVIPCGESCVFIPCISTLLGCSCKNKVCYRN
8,CP02817,P56872,SIPCGESCVFIPCTVTALLGCSCKSKVCYKN
9,CP02819,P82271,GFCRCLCRRGVCRCICTR


In [84]:
## Download the UniProt XML entry of every accession not yet cached in xmls/
# open(..., "wb") below fails when the cache directory is missing, so create
# it up front (no-op if it already exists).
os.makedirs("xmls", exist_ok=True)
# unique(): an accession shared by several peptides is requested at most once
# per run, even when its download fails and no cache file gets written.
for uid in data2.Uniprot.unique():
    xml_path = "xmls/"+uid+".xml"
    if not os.path.exists(xml_path):
        print(uid)
        # requests has no default timeout; without one a stalled connection
        # would block the cell forever. A timeout raises instead.
        response = requests.get('https://rest.uniprot.org/uniprotkb/'+uid+'.xml', timeout=30)

        if response.status_code == 200:
            with open(xml_path, "wb") as file:
                file.write(response.content)
        else:
            # Message reads: "Request failed! Status code: %d"
            print("请求失败！状态码：%d" % response.status_code)

In [63]:
## Length of every processed sequence; the 'None' placeholder string counts as 0
seq_len = [0 if seq == 'None' else len(seq) for seq in data2.Origin_Seq]
data2['len'] = seq_len

In [64]:
## Preview the first rows, now including the 'len' column
data2.head()

Unnamed: 0,CPid,Uniprot,Origin_Seq,len
0,CP00295,P85421,IWGIGCNP,8
1,CP01056,P84645,GDPTFCGETCRVIPVCTYSAALGCTCDDRSDGLCKRN,37
2,CP02628,P01050,LTYTDCTESGQNLCLCEGSNVCGQGNKCILGSDGEKNQCVTGEGTP...,65
3,CP02629,P01263,CSNLSTCVLGKLSQELHKLQTYPRTNTGSGTP,32
4,CP02634,P01225,NSCELTNITIAIEKEECRFCISINTTWCAGYCYTRDLVYKDPARPK...,111


In [117]:
def PDB_filter(tag):
    """Return True for a ``dbReference`` element of type ``PDB`` that has child nodes.

    Intended as a match function for BeautifulSoup's ``find``/``find_all``.

    Parameters
    ----------
    tag : element exposing ``name``, ``get(attr)`` and ``contents``
        (a BeautifulSoup ``Tag`` when used as a match function).

    Returns
    -------
    bool
        True only if the element is named ``dbReference``, its ``type``
        attribute equals ``'PDB'`` and its ``contents`` list is non-empty.
    """
    # .get() instead of tag['type']: an element without a type attribute
    # should simply not match rather than raise KeyError.
    return (
        tag.name == 'dbReference'
        and tag.get('type') == 'PDB'
        and bool(tag.contents)
    )
def seq_filter(tag):
    """Match a ``sequence`` element that carries a ``length`` attribute.

    Intended as a match function for BeautifulSoup's ``find``/``find_all``;
    ``sequence`` elements without ``length`` are rejected.
    """
    is_sequence = tag.name == 'sequence'
    return is_sequence and tag.has_attr('length')

def _property_value(db_ref, prop_type):
    """Return the ``value`` of the child ``property`` of the given type, or '' if absent."""
    prop = db_ref.find('property', attrs={"type": prop_type})
    return '' if prop is None else prop.attrs.get('value', '')

## Per-accession annotations, one entry per row of data2 (same order).
## Multiple values for one accession are joined with "##".
AF_id = []
U_seqlen = []
PDB_ids_list = []
PDB_method_list = []
PDB_chains_list = []

for uid in data2.Uniprot:
    # Binary mode lets the XML parser determine the encoding itself instead of
    # decoding with the platform default; the with-block closes the handle
    # (the original left one open file per accession).
    with open("xmls/"+uid+".xml", "rb") as xml_file:
        peptide_xml = BeautifulSoup(xml_file, 'xml')

    ##AlphaFold id
    AF_xml = peptide_xml.find_all('dbReference', attrs={"type":"AlphaFoldDB"})
    AF = [af.attrs['id'] for af in AF_xml]
    AF_id.append('##'.join(str(x) for x in AF))

    ##Sequence length
    # First <sequence> element that has a length attribute (see seq_filter)
    seq_length = peptide_xml.find(seq_filter).attrs['length']
    U_seqlen.append(int(seq_length))

    ##PDB ids
    PDB_ids = []
    PDB_method = []
    PDB_chains = []
    PDBs = peptide_xml.find_all(PDB_filter)
    for PDB in PDBs:
        PDB_ids.append(PDB.attrs['id'])
        # A missing method/chains property becomes an empty slot, so the
        # three "##"-joined strings stay aligned position by position.
        PDB_method.append(_property_value(PDB, "method"))
        PDB_chains.append(_property_value(PDB, "chains"))
    PDB_ids_list.append('##'.join(str(x) for x in PDB_ids))
    PDB_method_list.append('##'.join(str(x) for x in PDB_method))
    PDB_chains_list.append('##'.join(str(x) for x in PDB_chains))

In [118]:
## Attach the parsed annotation lists to the table, one column per list
parsed_columns = {
    'AF_id': AF_id,
    'U_seqlen': U_seqlen,
    'PDB_ids_list': PDB_ids_list,
    'PDB_method_list': PDB_method_list,
    'PDB_chains_list': PDB_chains_list,
}
for column_name, column_values in parsed_columns.items():
    data2[column_name] = column_values

In [109]:
## Preview the first rows with the AlphaFold / PDB annotation columns
data2.head()

Unnamed: 0,CPid,Uniprot,Origin_Seq,len,AF_id,U_seqlen,PDB_ids_list,PDB_method_list,PDB_chains_list
0,CP00295,P85421,IWGIGCNP,8,P85421,23,1K83##2VUM##3CQZ##6EXV,X-ray##X-ray##X-ray##EM,M=1-8##M=1-8##M=1-8##M=1-8
1,CP01056,P84645,GDPTFCGETCRVIPVCTYSAALGCTCDDRSDGLCKRN,37,P84645,37,1R1F,NMR,A=4-37
2,CP02628,P01050,LTYTDCTESGQNLCLCEGSNVCGQGNKCILGSDGEKNQCVTGEGTP...,65,P01050,65,1AD8##1AE8##1AFE##1AHT##1AI8##1AWF##1AY6##1BA8...,X-ray##X-ray##X-ray##X-ray##X-ray##X-ray##X-ra...,I=55-64##I=55-64##I=55-64##I=55-64##I=55-64##I...
3,CP02629,P01263,CSNLSTCVLGKLSQELHKLQTYPRTNTGSGTP,32,P01263,136,2GLG##2GLH##6PGQ,NMR##NMR##X-ray,A=83-114##A=83-114##B=104-114
4,CP02634,P01225,NSCELTNITIAIEKEECRFCISINTTWCAGYCYTRDLVYKDPARPK...,111,P01225,129,1FL7##1XWD##4AY9##4MQW##8I2G,X-ray##X-ray##X-ray##X-ray##EM,B/D=19-129##B/E=19-129##B/E/H=19-129##B/E/H=19...


In [119]:
## Save the annotated table; index=False leaves the row index out of the sheet,
## header=True writes the column names as the first row
data2.to_excel('seq_AF.xlsx', index=False, header=True)