In [1]:
import glob
import os
import re
import shutil

import numpy as np
import pandas as pd
#pdb structure list from Alphafold
afdir = '/home/adunford/afref/UP000005640_9606_HUMAN_v3/'
afpdblist = pd.Series([path.replace(afdir,'') for path in glob.glob('%s/*pdb.gz' % afdir)])


#prot2AF mapping - needed to loop over, scatter/gather all structures
#current version of clumps seems to depend on having the 4 character code in order to access pdb structures
#new version may have 1:1 uniprot:structure connection 
p2af = pd.DataFrame(columns=['u1', #uniprot id
                             'u2', #also uniprot iD (should be the same for AF, )
                             'struct', #name of structure/chain - 1 chain per peptide in AF?
                             'blat', #- not relevant for AF
                             'AA:AA list' #colon separated amino acid positions - should be 1:1, 2:2 etc. for AF? - revisit this
                            ])
p2af['u1'] = [x[1] for x in afpdblist.str.split('-')]
p2af['u2'] = p2af['u1']
p2af['blat'] = '-'
p2af

#p2af.index = p2af['u1']
p2af = p2af.sort_values('u1')


p2af.index = np.arange(0, len(p2af))
p2af

for i in np.arange(0,len(p2af.index)):
    afid = hex(int('1000',16)+i)[2:]
    g = p2af.index[i]
    p2af.iloc[i,2] = '%s-A' % afid

    

p2af

Unnamed: 0,u1,u2,struct,blat,AA:AA list
0,A0A024R1R8,A0A024R1R8,1000-A,-,
1,A0A024RBG1,A0A024RBG1,1001-A,-,
2,A0A024RCN7,A0A024RCN7,1002-A,-,
3,A0A075B6H5,A0A075B6H5,1003-A,-,
4,A0A075B6H7,A0A075B6H7,1004-A,-,
...,...,...,...,...,...
23386,U3KPV4,U3KPV4,6b5a-A,-,
23387,V9GZ13,V9GZ13,6b5b-A,-,
23388,W5XKT8,W5XKT8,6b5c-A,-,
23389,W6CW81,W6CW81,6b5d-A,-,


In [2]:
import prody
from prody import parsePDBHeader
out_dir = 'af_dir'
if not os.path.exists(out_dir):
    os.mkdir(out_dir)
for uid in np.unique(p2af['u1']):
    nuid = sum(p2af['u1'] == uid)
    auid = len(afpdblist[afpdblist.str.contains(uid)])
    if nuid!=auid:
        msg = "%s entries of %s found in protein-af mapping, %s entries found in alphafold" % (nuid,uid,auid)
        print(msg)
    pstructs = p2af.loc[p2af['u1'] == uid,'struct'].values
    ix = p2af.loc[p2af['u1'] == uid].index
    for j in np.arange(0,nuid):
        afstruct = '%sAF-%s-F%s-model_v3.pdb.gz' % (afdir,uid,j+1)


        try:
            afheader = parsePDBHeader(afstruct)
        except:
            print('failed to load pdb struct for %s' %uid)
            continue

        afstart,_,gstart = afheader['A'].dbrefs[0].first
        afend,_,gend = afheader['A'].dbrefs[0].last
        aaaa = ''
        afaa = np.arange(afstart,afend)
        gaa = np.arange(gstart,gend)
        if len(gaa) != len(afaa):
            print('length mismatch for %s.  AF struct is length %s, while genomic length is %s' % (uid, len(afaa),len(gaa)))
            continue
        for i in     np.arange(0,len(afaa)):
            aai = '%s:%s ' %(gaa[i],afaa[i])
            aaaa = aaaa + (aai)
        #must remove trailing space or postprocessing will fail
        aaaa = aaaa.strip()
        p2af.loc[ix[j],'AA:AA list'] = aaaa


        if not os.path.exists(afstruct):
            print('file not found: %s' %afstruct)
            continue
        pstruct = pstructs[j]
        pstruct = str(pstruct[:-2])
        subdir=str(pstruct[1:3])
        cmd = 'cp %s %s/%s/pdb%s.ent.gz' %(afstruct,out_dir,subdir,pstruct)
        if not os.path.exists('%s/%s' %(out_dir,subdir)):
            os.mkdir('%s/%s' %(out_dir,subdir))
        os.system(cmd)

failed to load pdb struct for A0A024R1R8
failed to load pdb struct for A0A024RBG1
failed to load pdb struct for A0A024RCN7
failed to load pdb struct for A0A075B6H5
failed to load pdb struct for A0A075B6H7
failed to load pdb struct for A0A075B6H8
failed to load pdb struct for A0A075B6H9
failed to load pdb struct for A0A075B6I0
failed to load pdb struct for A0A075B6I1
failed to load pdb struct for A0A075B6I3
failed to load pdb struct for A0A075B6I4
failed to load pdb struct for A0A075B6I6
failed to load pdb struct for A0A075B6I7
failed to load pdb struct for A0A075B6I9
failed to load pdb struct for A0A075B6J1
failed to load pdb struct for A0A075B6J2
failed to load pdb struct for A0A075B6J6
failed to load pdb struct for A0A075B6J9
failed to load pdb struct for A0A075B6K0
failed to load pdb struct for A0A075B6K2
failed to load pdb struct for A0A075B6K4
failed to load pdb struct for A0A075B6K5
failed to load pdb struct for A0A075B6K6
failed to load pdb struct for A0A075B6L2
failed to load p

failed to load pdb struct for A0A0B4J280
failed to load pdb struct for A0A0B4J2A2
failed to load pdb struct for A0A0B4J2B6
failed to load pdb struct for A0A0B4J2B8
failed to load pdb struct for A0A0B4J2D5
failed to load pdb struct for A0A0B4J2D9
failed to load pdb struct for A0A0B4J2E0
failed to load pdb struct for A0A0B4J2E5
failed to load pdb struct for A0A0B4J2F0
failed to load pdb struct for A0A0B4J2F2
failed to load pdb struct for A0A0B4J2H0
failed to load pdb struct for A0A0C4DGP1
failed to load pdb struct for A0A0C4DH24
failed to load pdb struct for A0A0C4DH25
failed to load pdb struct for A0A0C4DH26
failed to load pdb struct for A0A0C4DH27
failed to load pdb struct for A0A0C4DH28
failed to load pdb struct for A0A0C4DH29
failed to load pdb struct for A0A0C4DH30
failed to load pdb struct for A0A0C4DH31
failed to load pdb struct for A0A0C4DH32
failed to load pdb struct for A0A0C4DH33
failed to load pdb struct for A0A0C4DH34
failed to load pdb struct for A0A0C4DH35
failed to load p

failed to load pdb struct for A0A1B0GWB2
failed to load pdb struct for A0A1B0GWG4
failed to load pdb struct for A0A1B0GWH4
failed to load pdb struct for A0A1B0GWH6
failed to load pdb struct for A0A1B0GWI6
failed to load pdb struct for A0A1B0GWK0
failed to load pdb struct for A0A1B0GX31
failed to load pdb struct for A0A1B0GX49
failed to load pdb struct for A0A1B0GX51
failed to load pdb struct for A0A1B0GX56
failed to load pdb struct for A0A1B0GX68
failed to load pdb struct for A0A1B0GX78
failed to load pdb struct for A0A1B0GX95
failed to load pdb struct for A0A1B0GXF2
failed to load pdb struct for A0A1W2PN81
failed to load pdb struct for A0A1W2PNU3
failed to load pdb struct for A0A1W2PP81
failed to load pdb struct for A0A1W2PP97
failed to load pdb struct for A0A1W2PPD8
failed to load pdb struct for A0A1W2PPE2
failed to load pdb struct for A0A1W2PPE3
failed to load pdb struct for A0A1W2PPF3
failed to load pdb struct for A0A1W2PPG7
failed to load pdb struct for A0A1W2PPH5
failed to load p

length mismatch for Q8WZ42.  AF struct is length 1399, while genomic length is 11399
length mismatch for Q8WZ42.  AF struct is length 1399, while genomic length is 11399
length mismatch for Q8WZ42.  AF struct is length 1399, while genomic length is 11399
length mismatch for Q8WZ42.  AF struct is length 1399, while genomic length is 11399
length mismatch for Q8WZ42.  AF struct is length 1399, while genomic length is 11399
length mismatch for Q8WZ42.  AF struct is length 1399, while genomic length is 11399
length mismatch for Q8WZ42.  AF struct is length 1399, while genomic length is 11399
length mismatch for Q8WZ42.  AF struct is length 1399, while genomic length is 11399
length mismatch for Q8WZ42.  AF struct is length 1399, while genomic length is 11399
length mismatch for Q8WZ42.  AF struct is length 1399, while genomic length is 11399
length mismatch for Q8WZ42.  AF struct is length 1399, while genomic length is 11399
length mismatch for Q8WZ42.  AF struct is length 1399, while geno

In [6]:
p2af = p2af.loc[~p2af['AA:AA list'].isna()]

In [7]:
#print out list of prot2AF mapping
p2af.to_csv('uniprot2af_map.txt',
            sep='\t',
            header=False,
            index=False)

In [8]:
out_dir = 'prot2AF_chunks'
chunk_length= 75
if not os.path.exists(out_dir):
    os.mkdir(out_dir)
gl = len(p2af.index)
start = 0
end = chunk_length
i = 0
while end < gl:
    if i < 10:
        num = '0000%s' %i
    elif i < 100:
        num = '000%s' %i
    elif i < 1000:
        num = '00%s' %i
    fn = 'prot2AF_chunks/prot2af_%s.gz' %num
    p2af.iloc[start:end].to_csv(fn,
            sep='\t',
            header=False,
            index=False)
    i = i + 1
    start = end
    end = end+chunk_length
end = gl


if i < 10:
    num = '0000%s' %i
elif i < 100:
    num = '000%s' %i
elif i < 1000:
    num = '00%s' %i
fn = 'prot2AF_chunks/prot2af_%s.gz' %num

p2af.iloc[start:end].to_csv(fn,
        sep='\t',
        header=False,
        index=False)

In [4]:
#may not need gpmap in this script; moving to the end
#Do consider what work needs to be done to generate new gpmap; may have slightly different mapping for
#genome proteome AA positions
gpmapheader =[
    'chr',
    'gp1',
    'gp2',
    'exons',
    'strand',
    'uniprotid',
    'AA_list',
    'unknown']

gpmap = pd.read_csv('/mnt/nfs/ro_disks/canine-b7f9872002626d1a514fcdf5e295a8bd/gpmaps/genomeProteomeMaps.txt',
                   header =None,
                   sep = '\t')
gpmap

gpmap.columns = gpmapheader
gpmap.index = gpmap['uniprotid']
gpmap

Unnamed: 0,0,1,2,3,4,5,6,7
0,chr3,196439406,196460770,"196439406-196439515,196443728-196443792,196449...",+,Q8TBF5,"1.0-37.1,37.1-58.2,58.2-106.0,106.0-177.1,177....",
1,chr12,42706871,42717904,"42706871-42707000,42707491-42707567,42707675-4...",-,Q8TBF4,"174.0-217.0,148.2-174.0,111.0-148.2,75.0-111.0...",
2,chr2,55777049,55844421,"55777049-55777142,55785910-55786104,55791448-5...",-,Q5MIZ7,"818.0-849.0,753.1-818.0,691.2-753.1,645.0-691....",
3,chr11,47640387,47664017,"47640387-47640471,47644252-47644328,47647225-4...",-,Q9Y6C9,"275.0-303.0,249.2-275.0,227.0-249.2,211.0-227....",
4,chr11,93212178,93212355,93212178-93212355,-,Q9NRQ5,0.0-59.0,
...,...,...,...,...,...,...,...,...
20186,chr10,70105527,70166813,"70105527-70105668,70105763-70105841,70115109-7...",-,Q8WXA3,"608.0-655.0,582.0-608.0,565.2-582.0,534.0-565....",
20187,chr11,125616199,125618625,"125616199-125616251,125616555-125616591,125617...",+,Q8WXA2,"0.0-17.1,17.1-29.1,29.1-41.1,41.1-82.1,82.1-126.0",
20188,chr8,64081436,64122261,"64081436-64081460,64081945-64081970,64087889-6...",+,Q7Z739,"0.0-8.0,8.0-16.1,16.1-45.0,45.0-53.0,54.0-578....",
20189,chr8,12580607,12612929,"12580607-12580763,12583235-12583388,12586409-1...",-,Q17RB8,"721.0-773.0,670.0-721.0,615.2-670.0,562.2-615....",


In [6]:
p2pdb = pd.read_csv('/mnt/nfs/ro_disks/canine-b7f9872002626d1a514fcdf5e295a8bd/prot2pdb_chunks/huniprot2pdb.run18_chunks/huniprot2pdb_chunk_00000.gz',
                    header =None,
                    sep='\t')

p2pdb

In [8]:
print(len(p2af.index))
p2af = p2af.loc[p2af['u1'].isin(gpmap['uniprotid'])]
print(len(p2af.index))


23391
22627


In [None]:
    if(nuid ==1):
        afstruct = '%sAF-%s-F%s-model_v3.pdb.gz' % (afdir,uid,1)
        try:
            afheader = parsePDBHeader(afstruct)
        except:
            print('failed to load pdb struct for %s' %uid)
            continue
        afstart,_,gstart = afheader['A'].dbrefs[0].first
        afend,_,gend = afheader['A'].dbrefs[0].last
        aaaa = ''
        afaa = np.arange(afstart,afend)
        gaa = np.arange(gstart,gend)
        if len(gaa) != len(afaa):
            print('length mismatch for %s.  AF struct is length %s, while genomic length is %' % (uid, len(afaa),len(gaa)))
            continue
        for i in     np.arange(0,len(afaa)):
            aai = '%s:%s ' %(gaa[i],afaa[i])
            aaaa = aaaa + (aai)
        #must remove trailing space or postprocessing will fail
        aaaa = aaaa.strip()
        p2af.loc[p2af['u1'] == uid,'AA:AA list'] = aaaa
        pstruct = str(pstructs[:-2])
        subdir=str(pstruct[1:3])
        cmd = 'cp %s %s/%s/pdb%s.ent.gz' %(afstruct,out_dir,subdir,pstruct)
        #print(cmd)
        if not os.path.exists('%s/%s' %(out_dir,subdir)):
            os.mkdir('%s/%s' %(out_dir,subdir))
        os.system(cmd)
    else:
        ix = p2af.loc[p2af['u1'] == uid].index
        for j in np.arange(0,nuid):
            afstruct = '%sAF-%s-F%s-model_v3.pdb.gz' % (afdir,uid,j+1)
            
            
            try:
                afheader = parsePDBHeader(afstruct)
            except:
                print('failed to load pdb struct for %s' %uid)
                continue

            afstart,_,gstart = afheader['A'].dbrefs[0].first
            afend,_,gend = afheader['A'].dbrefs[0].last
            aaaa = ''
            afaa = np.arange(afstart,afend)
            gaa = np.arange(gstart,gend)
            if len(gaa) != len(afaa):
                print('length mismatch for %s.  AF struct is length %s, while genomic length is %s' % (uid, len(afaa),len(gaa)))
                continue
            for i in     np.arange(0,len(afaa)):
                aai = '%s:%s ' %(gaa[i],afaa[i])
                aaaa = aaaa + (aai)
            #must remove trailing space or postprocessing will fail
            aaaa = aaaa.strip()
            p2af.loc[ix[j],'AA:AA list'] = aaaa

            
            if not os.path.exists(afstruct):
                print('file not found: %s' %afstruct)
                continue
            pstruct = pstructs[j]
            pstruct = str(pstruct[:-2])
            subdir=str(pstruct[1:3])
            cmd = 'cp %s %s/%s/pdb%s.ent.gz' %(afstruct,out_dir,subdir,pstruct)
            if not os.path.exists('%s/%s' %(out_dir,subdir)):
                os.mkdir('%s/%s' %(out_dir,subdir))
            os.system(cmd)