In [1]:
import glob
import itertools
import os
import shutil

import numpy as np
import pandas as pd
from prody import parsePDBStream, confProDy,parsePDBHeader

from clumps.utils import gunzipper, AMINO_ACID_MAP


# Directory holding the AlphaFold structure files (gzipped PDB).
# The trailing slash matters: file paths are later built as afdir + file name.
afdir = '/home/adunford/afref/UP000005640_9606_HUMAN_v3/'

# Bare file names of every '*pdb.gz' entry found in afdir (directory prefix removed).
afpdblist = pd.Series([
    found.replace(afdir, '')
    for found in glob.glob(afdir + '/*pdb.gz')
])

# A residue counts as good quality only when its B-factor column value exceeds this
# (presumably the AlphaFold per-residue confidence score -- confirm).
minconf = 50
# Runs of low-quality residues shorter than this are kept anyway.
bl_length = 20
# Structures with this many usable residues or fewer are skipped.
minlength = 10

# prot2AF mapping table: one row per AlphaFold structure file. It is needed to
# loop over, scatter and gather all structures.
# The current version of clumps seems to depend on a 4-character structure code
# to access pdb structures, so a synthetic code is generated for each row below.
# A newer version may allow a direct 1:1 uniprot:structure connection instead.
p2af = pd.DataFrame(columns=['u1',  # uniprot id
                             'u2',  # also uniprot id (should be the same for AF)
                             'struct',  # name of structure/chain - 1 chain per peptide in AF?
                             'blat',  # not relevant for AF
                             'AA:AA list'  # amino acid position pairs - should be 1:1, 2:2 etc. for AF? - revisit this
                             ])
# The uniprot id is the second '-'-separated field of the file name, which has
# the format AF-{Uniprotid}-F#-model_v3.pdb.gz
p2af['u1'] = [x[1] for x in afpdblist.str.split('-')]
p2af['u2'] = p2af['u1']
p2af['blat'] = '-'

# Order rows by uniprot id and renumber them 0..n-1 so that the row position
# can be used to derive the structure code.
p2af = p2af.sort_values('u1')
p2af.index = np.arange(0, len(p2af))

# Synthetic structure names: hexadecimal counter starting at 0x1000, followed by
# the chain suffix '-A' (e.g. '1000-A', '1001-A', ...).
# NOTE: the code grows to 5 hex digits once there are more than 0xF000 (61440) rows.
p2af['struct'] = ['%x-A' % (0x1000 + row) for row in range(len(p2af))]



# Set ProDy's 'verbosity' configuration option to 'none'.
confProDy(verbosity='none')

# Root directory that receives the renamed structure copies; create it on first use.
out_dir = 'af_dir'
needs_creating = not os.path.exists(out_dir)
if needs_creating:
    os.mkdir(out_dir)
    
def _usable_resnums(resnums, resnames, betas, min_conf, max_gap):
    """Return the sorted residue numbers that are usable for the mapping.

    A residue is of good quality when its residue name is a key of
    AMINO_ACID_MAP and its B-factor column value is above ``min_conf``
    (when a residue number occurs several times, the last occurrence wins).
    Low-quality residues are grouped into runs (a new run starts whenever the
    residue number jumps by more than 1); runs shorter than ``max_gap`` are
    kept as well, longer runs are dropped.

    Parameters
    ----------
    resnums, resnames, betas : equal-length sequences of residue numbers,
        residue names and B-factor column values (as returned by ProDy's
        getResnums / getResnames / getBetas).
    min_conf : quality threshold, exclusive.
    max_gap : minimum length at which a low-quality run is discarded.
    """
    passes = {}
    for resnum, resname, beta in zip(resnums, resnames, betas):
        passes[int(resnum)] = (resname in AMINO_ACID_MAP) and beta > min_conf

    kept = [resnum for resnum, ok in passes.items() if ok]
    # Plain numpy array: np.split on a pandas Series triggers a FutureWarning.
    rejected = np.array([resnum for resnum, ok in passes.items() if not ok],
                        dtype=int)
    for run in np.split(rejected, np.where(np.diff(rejected) > 1)[0] + 1):
        if len(run) < max_gap:
            kept.extend(run.tolist())
    return sorted(kept)


# For every uniprot id: parse each of its AlphaFold fragment files, work out
# which residues are usable, store the resulting position pairs in
# p2af['AA:AA list'], and copy the structure file into out_dir under its
# synthetic structure code.
for uid in np.unique(p2af['u1']):
    uid_rows = p2af['u1'] == uid
    nuid = int(uid_rows.sum())
    # Literal, dash-delimited match so that an id which is a substring of
    # another id (or contains regex characters) is not miscounted.
    auid = int(afpdblist.str.contains('-%s-' % uid, regex=False).sum())
    if nuid != auid:
        msg = "%s entries of %s found in protein-af mapping, %s entries found in alphafold" % (nuid,uid,auid)
        print(msg)
    pstructs = p2af.loc[uid_rows, 'struct'].values
    ix = p2af.loc[uid_rows].index

    # Fragment files are numbered F1..Fn; the j-th row of this uniprot id is
    # paired with fragment j+1.
    for j in range(nuid):
        affile = '%sAF-%s-F%s-model_v3.pdb.gz' % (afdir, uid, j + 1)
        # Check for the file before trying to open it.
        if not os.path.exists(affile):
            print('file not found: %s' % affile)
            continue

        try:
            with gunzipper(affile) as afstream:
                aa = parsePDBStream(afstream)
            afheader = parsePDBHeader(affile)
        except Exception as err:
            # Report why parsing failed instead of silently hiding the cause.
            print('failed to load pdb struct %s for %s (%s: %s)'
                  % (affile, uid, type(err).__name__, err))
            continue
        if aa is None:
            # NOTE(review): ProDy appears to return None when no atoms were
            # parsed -- confirm against the ProDy documentation.
            print('failed to load pdb struct %s for %s' % (affile, uid))
            continue

        # First DBREF record of chain A: (structure position, _, database position).
        afstart, _, gstart = afheader['A'].dbrefs[0].first
        afend, _, gend = afheader['A'].dbrefs[0].last
        afaa = np.arange(afstart, afend + 1)
        gaa = np.arange(gstart, gend + 1)
        # Do this check before anything else. For some reason the start/end
        # values in the header sometimes get truncated, e.g. 1400 gets read as
        # 14. Example structure with the issue: AF-A0A087WUL8-F8-model_v3.pdb.gz
        if len(gaa) != len(afaa):
            print('length mismatch for %s.  AF struct is length %s, while genomic length is %s' % (uid, len(afaa),len(gaa)))
            continue

        pdb_resids = _usable_resnums(aa.getResnums(), aa.getResnames(),
                                     aa.getBetas(), minconf, bl_length)
        if len(pdb_resids) <= minlength:
            print ('no amino acid sequences longer than %s of sufficient quality for %s' % (minlength ,uid))
            continue
        # Residue numbers are used as 1-based positions into afaa/gaa; skip the
        # structure rather than crash (or silently wrap around on index <= 0)
        # when they fall outside the range announced by the header.
        if pdb_resids[0] < 1 or pdb_resids[-1] > len(afaa):
            print('residue numbers out of range for %s (%s)' % (uid, affile))
            continue

        # Subtract 1 from each residue number to turn it into an array index.
        offsets = [resnum - 1 for resnum in pdb_resids]
        # Space-separated 'database:structure' position pairs; there must be no
        # trailing space or postprocessing will fail.
        aaaa = ' '.join('%s:%s' % pair
                        for pair in zip(gaa[offsets], afaa[offsets]))
        p2af.loc[ix[j], 'AA:AA list'] = aaaa

        # Copy the structure to <out_dir>/<chars 2-3 of code>/pdb<code>.ent.gz,
        # where <code> is the structure name without its '-A' chain suffix.
        pstruct = str(pstructs[j][:-2])
        dest_dir = os.path.join(out_dir, pstruct[1:3])
        os.makedirs(dest_dir, exist_ok=True)
        # shutil raises on failure, unlike the exit status of a shell 'cp'.
        shutil.copyfile(affile,
                        os.path.join(dest_dir, 'pdb%s.ent.gz' % pstruct))

# Drop every row that never received an 'AA:AA list' value.
has_mapping = p2af['AA:AA list'].notna()
p2af = p2af.loc[has_mapping]

# Write the complete prot2AF mapping as tab-separated text, without a header
# row and without the index column.
p2af.to_csv('uniprot2af_map.txt', sep='\t', header=False, index=False)
# Split the mapping table into consecutive pieces of at most chunk_length rows
# and write each piece to prot2AF_chunks/prot2af_<NNNNN>.gz as headerless,
# index-free TSV (pandas infers gzip compression from the '.gz' suffix).
out_dir = 'prot2AF_chunks'
chunk_length = 75
os.makedirs(out_dir, exist_ok=True)

gl = len(p2af.index)
# max(gl, 1) keeps the previous behaviour of still writing a single (empty)
# chunk file when the table has no rows.
for i, start in enumerate(range(0, max(gl, 1), chunk_length)):
    # Zero-pad the chunk number to 5 digits; unlike a hand-written if/elif
    # ladder this keeps working for 1000 chunks and more.
    fn = os.path.join(out_dir, 'prot2af_%05d.gz' % i)
    # Slicing past the end is safe: the last chunk simply comes out shorter.
    p2af.iloc[start:start + chunk_length].to_csv(fn,
                                                 sep='\t',
                                                 header=False,
                                                 index=False)

failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A024R1R8-F1-model_v3.pdb.gz for A0A024R1R8
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A024RBG1-F1-model_v3.pdb.gz for A0A024RBG1
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A024RCN7-F1-model_v3.pdb.gz for A0A024RCN7
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A075B6H5-F1-model_v3.pdb.gz for A0A075B6H5
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A075B6H7-F1-model_v3.pdb.gz for A0A075B6H7
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A075B6H8-F1-model_v3.pdb.gz for A0A075B6H8
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A075B6H9-F1-model_v3.pdb.gz for A0A075B6H9
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A075B6I0-F1-model_v3.pdb.gz for A0A075B6I0
failed to load pdb struct /home/

failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A087WT02-F1-model_v3.pdb.gz for A0A087WT02
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A087WT03-F1-model_v3.pdb.gz for A0A087WT03
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A087WTH1-F1-model_v3.pdb.gz for A0A087WTH1
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A087WTH5-F1-model_v3.pdb.gz for A0A087WTH5
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A087WUL8-F1-model_v3.pdb.gz for A0A087WUL8
length mismatch for A0A087WUL8.  AF struct is length 1400, while genomic length is 15
length mismatch for A0A087WUL8.  AF struct is length 1400, while genomic length is 15
length mismatch for A0A087WUL8.  AF struct is length 1400, while genomic length is 15
length mismatch for A0A087WUL8.  AF struct is length 1400, while genomic length is 15
length mismatch for A0A087WUL8.  AF struct is lengt

failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0A6YYG3-F1-model_v3.pdb.gz for A0A0A6YYG3
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0A6YYJ7-F1-model_v3.pdb.gz for A0A0A6YYJ7
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0A6YYK1-F1-model_v3.pdb.gz for A0A0A6YYK1
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0A6YYK4-F1-model_v3.pdb.gz for A0A0A6YYK4
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0A6YYK6-F1-model_v3.pdb.gz for A0A0A6YYK6
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0A6YYK7-F1-model_v3.pdb.gz for A0A0A6YYK7
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0A6YYL3-F1-model_v3.pdb.gz for A0A0A6YYL3
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0B4J1T7-F1-model_v3.pdb.gz for A0A0B4J1T7
failed to load pdb struct /home/

failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0C4DH35-F1-model_v3.pdb.gz for A0A0C4DH35
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0C4DH36-F1-model_v3.pdb.gz for A0A0C4DH36
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0C4DH38-F1-model_v3.pdb.gz for A0A0C4DH38
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0C4DH39-F1-model_v3.pdb.gz for A0A0C4DH39
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0C4DH41-F1-model_v3.pdb.gz for A0A0C4DH41
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0C4DH42-F1-model_v3.pdb.gz for A0A0C4DH42
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0C4DH43-F1-model_v3.pdb.gz for A0A0C4DH43
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0C4DH55-F1-model_v3.pdb.gz for A0A0C4DH55
failed to load pdb struct /home/

failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0J9YY54-F1-model_v3.pdb.gz for A0A0J9YY54
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0J9YY99-F1-model_v3.pdb.gz for A0A0J9YY99
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0K0K1A3-F1-model_v3.pdb.gz for A0A0K0K1A3
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0K0K1A5-F1-model_v3.pdb.gz for A0A0K0K1A5
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0K0K1B3-F1-model_v3.pdb.gz for A0A0K0K1B3
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0K0K1C0-F1-model_v3.pdb.gz for A0A0K0K1C0
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0K0K1C4-F1-model_v3.pdb.gz for A0A0K0K1C4
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A0K0K1D8-F1-model_v3.pdb.gz for A0A0K0K1D8
failed to load pdb struct /home/

  sub_arys.append(_nx.swapaxes(sary[st:end], axis, 0))


failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A191URJ7-F1-model_v3.pdb.gz for A0A191URJ7
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A1B0GTC6-F1-model_v3.pdb.gz for A0A1B0GTC6
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A1B0GTD5-F1-model_v3.pdb.gz for A0A1B0GTD5
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A1B0GTE1-F1-model_v3.pdb.gz for A0A1B0GTE1
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A1B0GTG8-F1-model_v3.pdb.gz for A0A1B0GTG8
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A1B0GTH6-F1-model_v3.pdb.gz for A0A1B0GTH6
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A1B0GTH9-F1-model_v3.pdb.gz for A0A1B0GTH9
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A1B0GTI1-F1-model_v3.pdb.gz for A0A1B0GTI1
failed to load pdb struct /home/

failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A1B0GVY4-F1-model_v3.pdb.gz for A0A1B0GVY4
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A1B0GVZ2-F1-model_v3.pdb.gz for A0A1B0GVZ2
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A1B0GVZ6-F1-model_v3.pdb.gz for A0A1B0GVZ6
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A1B0GVZ9-F1-model_v3.pdb.gz for A0A1B0GVZ9
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A1B0GW15-F1-model_v3.pdb.gz for A0A1B0GW15
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A1B0GW35-F1-model_v3.pdb.gz for A0A1B0GW35
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A1B0GW54-F1-model_v3.pdb.gz for A0A1B0GW54
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A1B0GW64-F1-model_v3.pdb.gz for A0A1B0GW64
failed to load pdb struct /home/

failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A286YF58-F1-model_v3.pdb.gz for A0A286YF58
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A286YF60-F1-model_v3.pdb.gz for A0A286YF60
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A286YF77-F1-model_v3.pdb.gz for A0A286YF77
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A286YFB4-F1-model_v3.pdb.gz for A0A286YFB4
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A286YFG1-F1-model_v3.pdb.gz for A0A286YFG1
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A286YFK9-F1-model_v3.pdb.gz for A0A286YFK9
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A2R8Y2Y2-F1-model_v3.pdb.gz for A0A2R8Y2Y2
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A2R8Y422-F1-model_v3.pdb.gz for A0A2R8Y422
failed to load pdb struct /home/

failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A5F9ZHI8-F1-model_v3.pdb.gz for A0A5F9ZHI8
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A5F9ZHS0-F1-model_v3.pdb.gz for A0A5F9ZHS0
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A5F9ZHS7-F1-model_v3.pdb.gz for A0A5F9ZHS7
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A5F9ZHU2-F1-model_v3.pdb.gz for A0A5F9ZHU2
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A5K1VDZ0-F1-model_v3.pdb.gz for A0A5K1VDZ0
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A5S8K742-F1-model_v3.pdb.gz for A0A5S8K742
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A669KAW2-F1-model_v3.pdb.gz for A0A669KAW2
failed to load pdb struct /home/adunford/afref/UP000005640_9606_HUMAN_v3/AF-A0A669KB60-F1-model_v3.pdb.gz for A0A669KB60
failed to load pdb struct /home/

no amino acid sequences longer than 10 of sufficient quality for Q8IUC2
no amino acid sequences longer than 10 of sufficient quality for Q8IUC3
no amino acid sequences longer than 10 of sufficient quality for Q8IVF2
no amino acid sequences longer than 10 of sufficient quality for Q8IVF2
no amino acid sequences longer than 10 of sufficient quality for Q8IVF2
no amino acid sequences longer than 10 of sufficient quality for Q8IVF2
no amino acid sequences longer than 10 of sufficient quality for Q8IVF2
no amino acid sequences longer than 10 of sufficient quality for Q8IVF2
no amino acid sequences longer than 10 of sufficient quality for Q8IVF2
no amino acid sequences longer than 10 of sufficient quality for Q8IVF2
no amino acid sequences longer than 10 of sufficient quality for Q8IVF2
no amino acid sequences longer than 10 of sufficient quality for Q8IVF2
no amino acid sequences longer than 10 of sufficient quality for Q8IWN6
no amino acid sequences longer than 10 of sufficient quality for

length mismatch for Q8WZ42.  AF struct is length 1400, while genomic length is 11400
length mismatch for Q8WZ42.  AF struct is length 1400, while genomic length is 11400
length mismatch for Q8WZ42.  AF struct is length 1400, while genomic length is 11400
length mismatch for Q8WZ42.  AF struct is length 1400, while genomic length is 11400
length mismatch for Q8WZ42.  AF struct is length 1400, while genomic length is 11400
length mismatch for Q8WZ42.  AF struct is length 1400, while genomic length is 11400
length mismatch for Q8WZ42.  AF struct is length 1400, while genomic length is 11400
length mismatch for Q8WZ42.  AF struct is length 1400, while genomic length is 11400
length mismatch for Q8WZ42.  AF struct is length 1400, while genomic length is 11400
length mismatch for Q8WZ42.  AF struct is length 1400, while genomic length is 11400
length mismatch for Q8WZ42.  AF struct is length 1400, while genomic length is 11400
length mismatch for Q8WZ42.  AF struct is length 1400, while geno

no amino acid sequences longer than 10 of sufficient quality for Q9BYQ7
no amino acid sequences longer than 10 of sufficient quality for Q9BYQ9
no amino acid sequences longer than 10 of sufficient quality for Q9BYR0
no amino acid sequences longer than 10 of sufficient quality for Q9BYR9
no amino acid sequences longer than 10 of sufficient quality for Q9BYT5
no amino acid sequences longer than 10 of sufficient quality for Q9BYU5
no amino acid sequences longer than 10 of sufficient quality for Q9GZL8
no amino acid sequences longer than 10 of sufficient quality for Q9H0A3
no amino acid sequences longer than 10 of sufficient quality for Q9H3A6
no amino acid sequences longer than 10 of sufficient quality for Q9H5L9
no amino acid sequences longer than 10 of sufficient quality for Q9H8Q6
no amino acid sequences longer than 10 of sufficient quality for Q9H8X3
no amino acid sequences longer than 10 of sufficient quality for Q9P1J3
no amino acid sequences longer than 10 of sufficient quality for