In [1]:
import glob
import os
import re
import shutil

import numpy as np
import pandas as pd
# List the gzipped AlphaFold PDB models available in the reference directory.
afdir = '/home/adunford/alphafold_reference/UP000005640_9606_HUMAN_v3/'
# Keep only the file names. basename() is used instead of stripping `afdir` with str.replace,
# which only worked because afdir ends in '/' and glob collapses the doubled slash.
afpdblist = pd.Series([os.path.basename(path) for path in glob.glob(os.path.join(afdir, '*pdb.gz'))])
print(afpdblist)

0        AF-A6NMB9-F1-model_v3.pdb.gz
1        AF-Q15388-F1-model_v3.pdb.gz
2        AF-Q9UJH8-F1-model_v3.pdb.gz
3        AF-Q9UMZ2-F1-model_v3.pdb.gz
4        AF-B1AJZ9-F1-model_v3.pdb.gz
                     ...             
23386    AF-P19113-F1-model_v3.pdb.gz
23387    AF-Q9Y535-F1-model_v3.pdb.gz
23388    AF-Q96BR1-F1-model_v3.pdb.gz
23389    AF-Q99470-F1-model_v3.pdb.gz
23390    AF-P00748-F1-model_v3.pdb.gz
Length: 23391, dtype: object


In [2]:
# prot2AF mapping table - needed to loop over, scatter/gather all structures.
# The current version of clumps seems to depend on having a 4 character code in order to
# access pdb structures; a newer version may have a 1:1 uniprot:structure connection.
mapping_columns = [
    'u1',          # uniprot id
    'u2',          # also uniprot id (should be the same as u1 for AF)
    'struct',      # name of structure/chain - 1 chain per peptide in AF?
    'blat',        # not relevant for AF
    'AA:AA list',  # colon separated amino acid positions - should be 1:1, 2:2 etc. for AF? - revisit this
]
p2af = pd.DataFrame(columns=mapping_columns)

# The accession is the second '-'-separated field of each model file name.
p2af['u1'] = [fname.split('-')[1] for fname in afpdblist]
p2af['u2'] = p2af['u1']
p2af['blat'] = '-'
p2af

Unnamed: 0,u1,u2,struct,blat,AA:AA list
0,A6NMB9,A6NMB9,,-,
1,Q15388,Q15388,,-,
2,Q9UJH8,Q9UJH8,,-,
3,Q9UMZ2,Q9UMZ2,,-,
4,B1AJZ9,B1AJZ9,,-,
...,...,...,...,...,...
23386,P19113,P19113,,-,
23387,Q9Y535,Q9Y535,,-,
23388,Q96BR1,Q96BR1,,-,
23389,Q99470,Q99470,,-,


In [3]:
# Column names for the genome/proteome map; they are attached to the frame in a later cell.
gpmapheader = ['chr', 'gp1', 'gp2', 'exons', 'strand', 'uniprotid', 'AA_list', 'unknown']

# The file is tab separated and is read without a header row, so the columns come back
# labelled 0..7 here.
gpmap_path = '/mnt/nfs/ro_disks/canine-b7f9872002626d1a514fcdf5e295a8bd/gpmaps/genomeProteomeMaps.txt'
gpmap = pd.read_csv(gpmap_path, sep='\t', header=None)
gpmap

Unnamed: 0,0,1,2,3,4,5,6,7
0,chr3,196439406,196460770,"196439406-196439515,196443728-196443792,196449...",+,Q8TBF5,"1.0-37.1,37.1-58.2,58.2-106.0,106.0-177.1,177....",
1,chr12,42706871,42717904,"42706871-42707000,42707491-42707567,42707675-4...",-,Q8TBF4,"174.0-217.0,148.2-174.0,111.0-148.2,75.0-111.0...",
2,chr2,55777049,55844421,"55777049-55777142,55785910-55786104,55791448-5...",-,Q5MIZ7,"818.0-849.0,753.1-818.0,691.2-753.1,645.0-691....",
3,chr11,47640387,47664017,"47640387-47640471,47644252-47644328,47647225-4...",-,Q9Y6C9,"275.0-303.0,249.2-275.0,227.0-249.2,211.0-227....",
4,chr11,93212178,93212355,93212178-93212355,-,Q9NRQ5,0.0-59.0,
...,...,...,...,...,...,...,...,...
20186,chr10,70105527,70166813,"70105527-70105668,70105763-70105841,70115109-7...",-,Q8WXA3,"608.0-655.0,582.0-608.0,565.2-582.0,534.0-565....",
20187,chr11,125616199,125618625,"125616199-125616251,125616555-125616591,125617...",+,Q8WXA2,"0.0-17.1,17.1-29.1,29.1-41.1,41.1-82.1,82.1-126.0",
20188,chr8,64081436,64122261,"64081436-64081460,64081945-64081970,64087889-6...",+,Q7Z739,"0.0-8.0,8.0-16.1,16.1-45.0,45.0-53.0,54.0-578....",
20189,chr8,12580607,12612929,"12580607-12580763,12583235-12583388,12586409-1...",-,Q17RB8,"721.0-773.0,670.0-721.0,615.2-670.0,562.2-615....",


In [4]:
# Name the columns, then key the rows by uniprot id while keeping 'uniprotid' as a column too.
gpmap.columns = gpmapheader
gpmap = gpmap.set_index('uniprotid', drop=False)
gpmap

Unnamed: 0_level_0,chr,gp1,gp2,exons,strand,uniprotid,AA_list,unknown
uniprotid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Q8TBF5,chr3,196439406,196460770,"196439406-196439515,196443728-196443792,196449...",+,Q8TBF5,"1.0-37.1,37.1-58.2,58.2-106.0,106.0-177.1,177....",
Q8TBF4,chr12,42706871,42717904,"42706871-42707000,42707491-42707567,42707675-4...",-,Q8TBF4,"174.0-217.0,148.2-174.0,111.0-148.2,75.0-111.0...",
Q5MIZ7,chr2,55777049,55844421,"55777049-55777142,55785910-55786104,55791448-5...",-,Q5MIZ7,"818.0-849.0,753.1-818.0,691.2-753.1,645.0-691....",
Q9Y6C9,chr11,47640387,47664017,"47640387-47640471,47644252-47644328,47647225-4...",-,Q9Y6C9,"275.0-303.0,249.2-275.0,227.0-249.2,211.0-227....",
Q9NRQ5,chr11,93212178,93212355,93212178-93212355,-,Q9NRQ5,0.0-59.0,
...,...,...,...,...,...,...,...,...
Q8WXA3,chr10,70105527,70166813,"70105527-70105668,70105763-70105841,70115109-7...",-,Q8WXA3,"608.0-655.0,582.0-608.0,565.2-582.0,534.0-565....",
Q8WXA2,chr11,125616199,125618625,"125616199-125616251,125616555-125616591,125617...",+,Q8WXA2,"0.0-17.1,17.1-29.1,29.1-41.1,41.1-82.1,82.1-126.0",
Q7Z739,chr8,64081436,64122261,"64081436-64081460,64081945-64081970,64087889-6...",+,Q7Z739,"0.0-8.0,8.0-16.1,16.1-45.0,45.0-53.0,54.0-578....",
Q17RB8,chr8,12580607,12612929,"12580607-12580763,12583235-12583388,12586409-1...",-,Q17RB8,"721.0-773.0,670.0-721.0,615.2-670.0,562.2-615....",


In [5]:
# Load one chunk of the existing uniprot->pdb mapping (tab separated, no header row).
# NOTE(review): p2pdb is not used by the other cells shown here; presumably it is only a
# reference for the layout the AF mapping has to imitate - confirm.
p2pdb_chunk_path = '/mnt/nfs/ro_disks/canine-b7f9872002626d1a514fcdf5e295a8bd/prot2pdb_chunks/huniprot2pdb.run18_chunks/huniprot2pdb_chunk_00000.gz'
p2pdb = pd.read_csv(p2pdb_chunk_path, sep='\t', header=None)

In [6]:
# Show the uniprot->pdb mapping chunk read in the previous cell.
p2pdb

Unnamed: 0,0,1,2,3,4
0,P55008,P55008,2d58-A,-,17:17 18:18 19:19 20:20 21:21 22:22 23:23 24:2...
1,P55008,P55008,2g2b-A,-,2:5 3:6 4:7 5:8 6:9 7:10 8:11 9:12 10:13 11:14...
2,A0A0B4J1V1,P01764,4kfz-D,uhit_identity:86.32 evalue:2e-71 pdb_identity:...,16:-3 17:-2 18:-1 19:0 20:1 21:2 22:3 23:4 24:...
3,Q8NGK1,P02699,4x1h-A,uhit_identity:21.5 evalue:0.0002 pdb_identity:...,34:42 35:43 36:44 37:45 38:46 39:47 40:48 41:4...
4,Q99218,C9E3B4,3ipk-B,uhit_identity:25 evalue:9e-05 pdb_identity:22.22,86:866 87:867 88:868 89:869 90:870 91:871 92:8...
...,...,...,...,...,...
295,P23368,P23368,1pj2-D,-,21:3021 22:3022 23:3023 24:3024 25:3025 26:302...
296,P23368,P23368,1pj4-A,-,22:22 23:23 24:24 25:25 26:26 27:27 28:28 29:2...
297,P23368,P23368,1gz4-B,-,23:23 24:24 25:25 26:26 27:27 28:28 29:29 30:3...
298,P23368,P23368,1pj3-D,-,21:3021 22:3022 23:3023 24:3024 25:3025 26:302...


In [7]:
# Restrict the AF mapping to uniprot ids that are present in the genome/proteome map,
# printing the row count before and after the filter.
print(len(p2af.index))
in_gpmap = p2af['u1'].isin(gpmap['uniprotid'])
p2af = p2af.loc[in_gpmap]
print(len(p2af.index))


23391
22627


In [8]:
# Key the rows by uniprot id (keeping the 'u1' column) and order them by that id.
p2af = p2af.set_index('u1', drop=False).sort_index()


In [10]:
def _af_struct_id(row_number, base=0x1000):
    """Return the pseudo-PDB structure name for a row: hex(base + row_number) plus '-A'.

    With the default base the first row gets '1000-A', the next '1001-A', and so on.
    """
    return '%x-A' % (base + row_number)


def _identity_aa_map(aa_ranges):
    """Build the 'n:n ' residue mapping string from a gpmap AA_list entry.

    aa_ranges is a comma separated list of 'start-end' pairs such as '0.0-17.1,17.1-29.1'.
    Every number is truncated to an integer; the mapping then runs from min + 1 up to and
    including max, each position written as 'n:n' followed by a space (so the result keeps
    a trailing space). Example: '0.0-59.0' gives '1:1 2:2 ... 59:59 '.
    """
    bounds = [int(float(token)) for token in re.split('-|,', aa_ranges)]
    start = min(bounds) + 1  # adding 1 to make 1-based
    # TODO(review): range() excludes `end`, so the last position emitted is max(bounds);
    # check this if errors are caused - the +1 on the end may or may not be wanted.
    end = max(bounds) + 1
    return ''.join('%s:%s ' % (pos, pos) for pos in range(start, end))


# Give every row its own 4 character hex code, assigned by column name in one pass rather
# than writing cell by cell through a positional column index.
p2af['struct'] = [_af_struct_id(i) for i in range(len(p2af.index))]

# p2af is indexed by uniprot id, so .loc[g, ...] fills every row belonging to that id.
for g in p2af['u1'].unique():
    p2af.loc[g, 'AA:AA list'] = _identity_aa_map(gpmap.loc[g, 'AA_list'])
    

In [9]:
# Inspect the mapping table.
# NOTE(review): this cell has execution count 9 but sits after the cell with count 10, and
# its saved output still shows empty 'struct' and 'AA:AA list' columns - i.e. it was last run
# before those columns were filled. Re-run top to bottom to refresh the output.
p2af

Unnamed: 0_level_0,u1,u2,struct,blat,AA:AA list
u1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A0A087WTH1,A0A087WTH1,A0A087WTH1,,-,
A0A087WVF3,A0A087WVF3,A0A087WVF3,,-,
A0A087WXM9,A0A087WXM9,A0A087WXM9,,-,
A0A087WXS9,A0A087WXS9,A0A087WXS9,,-,
A0A087X179,A0A087X179,A0A087X179,,-,
...,...,...,...,...,...
Q9Y6Z4,Q9Y6Z4,Q9Y6Z4,,-,
Q9Y6Z5,Q9Y6Z5,Q9Y6Z5,,-,
Q9Y6Z7,Q9Y6Z7,Q9Y6Z7,,-,
S4R3P1,S4R3P1,S4R3P1,,-,


In [11]:
# Stage the AlphaFold models under pseudo-PDB names: for each uniprot id, fragment file F<k>
# is copied to <out_dir>/<2nd-3rd characters of the code>/pdb<code>.ent.gz, where <code> is
# the 4 character id stored (with a '-A' suffix) in the 'struct' column.
out_dir = 'af_dir'
os.makedirs(out_dir, exist_ok=True)

# Fail early with a clear message if the structure codes have not been assigned yet
# (e.g. the cells were run out of order).
if p2af['struct'].isna().any():
    raise ValueError("p2af['struct'] has missing values; assign the structure codes before copying")

# Exact per-id counts of AlphaFold files. Substring matching with str.contains would also
# count a different id that merely contains this one, and rescans every row per iteration.
af_counts = afpdblist.str.split('-').str[1].value_counts()

# Group on the values of the 'u1' column so each id is handled once, with its structure
# codes in table order. This does not depend on how the frame is indexed and always yields
# a Series, even for ids with a single row.
for uid, pstructs in p2af['struct'].groupby(p2af['u1'].values, sort=False):
    nuid = len(pstructs)
    auid = int(af_counts.get(uid, 0))
    if nuid != auid:
        msg = "%s entries of %s found in protein-af mapping, %s entries found in alphafold" % (nuid,uid,auid)
        print(msg)
    # The j-th row of an id is paired with fragment file F<j+1>.
    for frag, struct in enumerate(pstructs, start=1):
        afstruct = os.path.join(afdir, 'AF-%s-F%s-model_v3.pdb.gz' % (uid, frag))
        if not os.path.exists(afstruct):
            print('file not found: %s' % afstruct)
            continue
        pstruct = str(struct)[:-2]  # drop the trailing '-A'
        subdir = pstruct[1:3]
        dest_dir = os.path.join(out_dir, subdir)
        os.makedirs(dest_dir, exist_ok=True)
        # Plain file copy instead of shelling out to `cp`.
        shutil.copy(afstruct, os.path.join(dest_dir, 'pdb%s.ent.gz' % pstruct))

In [12]:
# Write the complete prot2AF mapping to a single tab separated file,
# without a header row and without the index column.
p2af.to_csv('uniprot2af_map.txt', index=False, header=False, sep='\t')

In [13]:
# Display the mapping table; the saved output shows 'struct' and 'AA:AA list' filled in.
p2af

Unnamed: 0_level_0,u1,u2,struct,blat,AA:AA list
u1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A0A087WTH1,A0A087WTH1,A0A087WTH1,1000-A,-,1:1 2:2 3:3 4:4 5:5 6:6 7:7 8:8 9:9 10:10 11:1...
A0A087WVF3,A0A087WVF3,A0A087WVF3,1001-A,-,1:1 2:2 3:3 4:4 5:5 6:6 7:7 8:8 9:9 10:10 11:1...
A0A087WXM9,A0A087WXM9,A0A087WXM9,1002-A,-,1:1 2:2 3:3 4:4 5:5 6:6 7:7 8:8 9:9 10:10 11:1...
A0A087WXS9,A0A087WXS9,A0A087WXS9,1003-A,-,1:1 2:2 3:3 4:4 5:5 6:6 7:7 8:8 9:9 10:10 11:1...
A0A087X179,A0A087X179,A0A087X179,1004-A,-,1:1 2:2 3:3 4:4 5:5 6:6 7:7 8:8 9:9 10:10 11:1...
...,...,...,...,...,...
Q9Y6Z4,Q9Y6Z4,Q9Y6Z4,685e-A,-,1:1 2:2 3:3 4:4 5:5 6:6 7:7 8:8 9:9 10:10 11:1...
Q9Y6Z5,Q9Y6Z5,Q9Y6Z5,685f-A,-,1:1 2:2 3:3 4:4 5:5 6:6 7:7 8:8 9:9 10:10 11:1...
Q9Y6Z7,Q9Y6Z7,Q9Y6Z7,6860-A,-,1:1 2:2 3:3 4:4 5:5 6:6 7:7 8:8 9:9 10:10 11:1...
S4R3P1,S4R3P1,S4R3P1,6861-A,-,1:1 2:2 3:3 4:4 5:5 6:6 7:7 8:8 9:9 10:10 11:1...


In [16]:
# Split the mapping table into consecutive row chunks and write each one to
# <out_dir>/prot2af_<5 digit chunk number>.gz (tab separated, no header, no index).
out_dir = 'prot2AF_chunks'
chunk_size = 300  # rows per chunk file
os.makedirs(out_dir, exist_ok=True)
gl = len(p2af.index)

# max(gl, 1) keeps the previous behaviour of still writing one (empty) chunk file when the
# table has no rows.
for i, start in enumerate(range(0, max(gl, 1), chunk_size)):
    # '%05d' reproduces the 5 digit zero padded numbering and stays well defined from chunk
    # 1000 onwards, where the previous if/elif chain left the number unset and a stale value
    # was reused.
    fn = os.path.join(out_dir, 'prot2af_%05d.gz' % i)
    # iloc clips at the end of the frame, so the last chunk simply holds the remainder.
    p2af.iloc[start:start + chunk_size].to_csv(fn,
            sep='\t',
            header=False,
            index=False)