In [1]:
# 1. Reads the output from 1_pdb_resContacts_byDistance.pl on 5f1b
# 2. Extracts the interfacial residues based on a distance cutoff provided by the user
# 3. Reads the MSA
# 4. For each sequence in the MSA, it reports the amino acid seq (fasta format) using only interfacial residues

In [2]:
import os
import re

import pandas as pd
from Bio import AlignIO

In [3]:
# Input params
# Interface partner to process. The file-selection cell below only has branches
# for 'virus' and 'host'; 'hosthomog' is listed as an option but no branch for
# it is visible in this notebook.
query = 'host'
# Distance cutoff applied to the 'distance' column of the interface summary;
# the final model uses cutoffs of 7 and 15 ang.
cutoff = 7
# Root of the seq_to_feat working tree -- change accordingly (keep the trailing '/').
rootdir = '/Users/gorkalasso/Documents/GitHub/filo_GP-bat_NPC1/scr/seq_to_feat/'
# Cutoff-specific output directory.
outdir = f'{rootdir}output/2_interface_2_fasta/{cutoff}/'

In [4]:
# Files based on user-defined input
# Excel workbook holding the 5f1b interface summary (one sheet per partner).
interf_fl = rootdir + 'output/1_5f1b_interface_summary.xlsx'

# Per-query settings:
#   (sheet name in interf_fl,
#    MSA file relative to rootdir,
#    number of first residue in MSA,
#    pdb accession used as reference record while reading the MSA)
_query_settings = {
    'virus': ('up_to_1000_gp1', 'input/1_gp1.aln', 31, '5F1B_A'),
    'host': ('up_to_1000_npc1', 'input/1_npc1.aln', 373, '5F1B_C'),
}

# Fail immediately on an unsupported query instead of carrying empty file
# names and an empty reference id into the cells below.
if query not in _query_settings:
    raise ValueError(
        f"Unsupported query {query!r}; expected one of {sorted(_query_settings)}"
    )

print('query is ' + query)
interf_fl_sheet, _msa_rel_path, res_in_msa, ref_rec = _query_settings[query]
msa_fl = rootdir + _msa_rel_path  # MSA file
outfile = outdir + 'interface_' + query + '_' + str(cutoff) + '.txt'

query is host


In [5]:
# Interfacial residues
#   Getting the 3-letter to 1-letter code (table indexed by '3-letter code')
df_aa = pd.read_excel(rootdir + 'input/0_amino_acid_table.xlsx').set_index('3-letter code')

#   Extracting interfacial residues based on distance cutoff
df_t = pd.read_excel(interf_fl, sheet_name=interf_fl_sheet)
df_interf = df_t[df_t['distance'] < cutoff].copy()

#   Residue names are looked up capitalised (e.g. 'ILE' -> 'Ile'), which is the
#   form the original lookup against the amino-acid table index used.
res_names = df_interf['res_id'].astype(str).str.capitalize()

#   Report every unknown residue name at once rather than failing on the first.
unknown_res = sorted(set(res_names) - set(df_aa.index))
if unknown_res:
    raise KeyError(f'Residue names missing from the amino acid table: {unknown_res}')

#   Vectorised translation; the column is created even when no row passes the cutoff.
df_interf['1_letter_code'] = res_names.map(df_aa['1-letter code'].to_dict())
display(df_interf)

Unnamed: 0,Chain,res_id,res_number,contact,distance,1_letter_code
29,C,ILE,419,"sidechain,backbone",5.4,I
30,C,TYR,420,"sidechain,backbone",3.5,Y
31,C,GLN,421,"sidechain,backbone",3.1,Q
32,C,PRO,422,"backbone,sidechain",3.7,P
33,C,TYR,423,"backbone,sidechain",3.4,Y
34,C,PRO,424,"backbone,sidechain",2.8,P
35,C,SER,425,"backbone,sidechain",3.5,S
36,C,GLY,426,backbone,3.3,G
37,C,ALA,427,"backbone,sidechain",5.3,A
38,C,ASP,428,"backbone,sidechain",4.3,D


In [6]:
# Write the interfacial residue numbers as one comma-separated line, each
# number prefixed with the first letter of the query (e.g. 'h419' for 'host').
res_numb_ls = df_interf.res_number.values.tolist()
s = ', '.join(query[0] + str(item) for item in res_numb_ls)

# outdir is cutoff-specific, so it may not exist yet for a new cutoff value.
os.makedirs(outdir, exist_ok=True)

# The context manager closes the file even if writing fails.
with open(outdir + 'interfpos_' + query + '_' + str(cutoff) + '.txt', 'w') as fh:
    print(s, file=fh)

In [7]:
# Read the Clustal-format MSA selected above and echo it in Clustal format.
ali = AlignIO.read(msa_fl, 'clustal')
ali_clustal_text = format(ali, 'clustal')
print(ali_clustal_text)

CLUSTAL X (1.81) multiple sequence alignment


5F1B_C/1-249                        MTTNPVDLWSAPSSQARLEKEYFDQHFGPFFRTEQLIIRAPLTDKHIYQP
NPC1_human/3-251                    VTTNPVDLWSAPSSQARLEKEYFDQHFGPFFRTEQLIIRAPLTDKHIYQP
Nycteris_hispida/3-251              ITTNPVDLWSAPSSQARLEKEYFDTHFGPFFRTEQLIIRAPNTDVHIYQP
Rhinopoma_microphyllum/3-251        ITTNPVDLWSAPSSQARLEKEYFDTHFGPFFRTEQLIIRAPHTDVHIYQP
Pteronotus_quadridens/3-251         VTTNPVDLWSAPNSQARLEKEYFDTHFGPFFRTEQLIIQAPNTHVHIYQP
Cynopterus_sphinx/3-251             VTTNPVDLWSAPSSQARLEKEYFDTHFGPFFRTEQLIIRAPHTATHTYQP
Coleura_afra/3-251                  VTTNPVDLWSAPSSQAHLEKEYFDTHFGPFFRTEQLIIQAPHTDIHTYEP
Diaemus_youngi/3-251                VTTNPVDLWSAPSSQARLEKEYFDTHFGPFFRTEQLIIRAPHTHTHTYQP
Hipposideros_armiger/3-251          VTTNPVDLWSAPSSQARLEKEYFDTHFGPFFRTEQLIIRAPHTDIHTYQP
Desmodus_rotundus/3-251             VTTNPVDLWSAPSSQARLEKEYFDTHFGPFFRTEQLIIRAPHTPTHTYQP
Triaenops_persicus/3-251            VTTNPVDLWSAPSSQARLEKEYFDIHFGPFFRTEQLIIRAPHTDIHT

In [8]:
# get reference sequence and initialize interface dictionary
# Records whose id contains ref_rec provide the reference sequence (if several
# match, the last one wins); every other record gets an empty entry in
# dc_interf, keyed by its record id.
ref_seq = ''
dc_interf = {}
ref_found = False
for record in ali:
    if ref_rec in record.id:
        ref_seq = record.seq
        ref_found = True
        print('Extracted as reference sequence: ' + record.id)
    else:
        dc_interf[record.id] = ''

# Without a reference the following cells would silently produce empty
# interface sequences, so stop here with an explicit error.
if not ref_found:
    raise ValueError(f'Reference record {ref_rec!r} not found in MSA file {msa_fl!r}')
print(ref_seq)

Extracted as reference sequence: 5F1B_C/1-249
MTTNPVDLWSAPSSQARLEKEYFDQHFGPFFRTEQLIIRAPLTDKHIYQPYPSGADVPFGPPLDIQILHQVLDLQIAIENITASYDNETVTLQDICLAPLSPYNTNCTILSVLNYFQNSHSVLDHKKGDDFFVYADYHTHFLYCVRAPASLNDTSLLHDPCLGTFGGPVFPWLVLGGYDDQNYNNATALVITFPVNNYYNDTEKLQRAQAWEKEFINFVKNYKNPNLTISFTAERSIEDELNRESDSDL


In [9]:
# get inferred interfacial residues
# Walk the reference sequence column by column, numbering its non-gap residues
# from res_in_msa. For every reference residue whose number is listed in
# df_interf, append the residue found in that alignment column to each
# non-reference record's entry in dc_interf.

# Start from empty strings on every run so that re-executing this cell does not
# append a second copy of the interface to previously collected sequences.
other_records = [record for record in ali if ref_rec not in record.id]
dc_interf = {record.id: '' for record in other_records}

# First dataframe index for each interfacial residue number, built once instead
# of rescanning the res_number column for every alignment column.
interf_idx_by_resnumb = {}
for df_idx, num in zip(df_interf.index, df_interf['res_number'].tolist()):
    interf_idx_by_resnumb.setdefault(num, df_idx)

matched_resnumbs = set()
resnumb = res_in_msa-1
for i,c in enumerate(ref_seq):
    if c == '-':
        continue  # gap in the reference: the residue counter does not advance
    resnumb += 1
    if resnumb not in interf_idx_by_resnumb:
        continue
    idx = interf_idx_by_resnumb[resnumb]
    interfRes = df_interf.at[idx,'1_letter_code']
    print(i, resnumb, c, '->', 'Interfacial', 'index: ' + str(idx), 'resid in df: ' + interfRes, sep='\t')
    if interfRes != c:
        # Raise instead of calling quit(): an exception stops the cell with a
        # traceback and leaves the kernel state available for inspection.
        raise ValueError(
            f'Fatal error, residues dont match: residue {resnumb} is {interfRes!r} '
            f'in the interface table but {c!r} at MSA column {i}'
        )
    matched_resnumbs.add(resnumb)
    for record in other_records:
        dc_interf[record.id] += record.seq[i]

# Interfacial residues that the reference sequence never reached would
# otherwise be dropped from the output without any notice.
missing_resnumbs = sorted(set(interf_idx_by_resnumb) - matched_resnumbs)
if missing_resnumbs:
    print('Warning: interfacial residues not covered by the MSA reference:', missing_resnumbs)

46	419	I	->	Interfacial	index: 29	resid in df: I
47	420	Y	->	Interfacial	index: 30	resid in df: Y
48	421	Q	->	Interfacial	index: 31	resid in df: Q
49	422	P	->	Interfacial	index: 32	resid in df: P
50	423	Y	->	Interfacial	index: 33	resid in df: Y
51	424	P	->	Interfacial	index: 34	resid in df: P
52	425	S	->	Interfacial	index: 35	resid in df: S
53	426	G	->	Interfacial	index: 36	resid in df: G
54	427	A	->	Interfacial	index: 37	resid in df: A
55	428	D	->	Interfacial	index: 38	resid in df: D
125	498	K	->	Interfacial	index: 108	resid in df: K
126	499	K	->	Interfacial	index: 109	resid in df: K
127	500	G	->	Interfacial	index: 110	resid in df: G
128	501	D	->	Interfacial	index: 111	resid in df: D
129	502	D	->	Interfacial	index: 112	resid in df: D
130	503	F	->	Interfacial	index: 113	resid in df: F
131	504	F	->	Interfacial	index: 114	resid in df: F
132	505	V	->	Interfacial	index: 115	resid in df: V
133	506	Y	->	Interfacial	index: 116	resid in df: Y
135	508	D	->	Interfacial	index: 118	resid in df: D


In [10]:
# Show the interface-only sequence collected for each non-reference MSA record (keyed by record id)
print(dc_interf)

{'NPC1_human/3-251': 'IYQPYPSGADKKGDDFFVYDL', 'Nycteris_hispida/3-251': 'IYQPYPSGSDKIGDDFFVYDV', 'Rhinopoma_microphyllum/3-251': 'IYQPYPSGSDKIGDDFFVYDV', 'Pteronotus_quadridens/3-251': 'IYQPYPSGSDKVGDYFFVYDV', 'Cynopterus_sphinx/3-251': 'TYQPYPAGSDKIGDDFFVYDL', 'Coleura_afra/3-251': 'TYEPYPSGSSKVGDDFYVYDL', 'Diaemus_youngi/3-251': 'TYQPYPSGSDKIGDPFYVYDV', 'Hipposideros_armiger/3-251': 'TYQPYPAGSDTIGDDFYVYDL', 'Desmodus_rotundus/3-251': 'TYQPYPSGSDKIGDAFYVYDV', 'Triaenops_persicus/3-251': 'TYQPYPAGSDKIGDDFYVYDL', 'Miniopterus_inflatus/3-251': 'TYEPYPSGSDKIGDDFYVYDL', 'Hipposideros_jonesi/3-251': 'TYQPYPAGSDEIGDDFYVYDL', 'Brachyphylla_cavernarum/3-251': 'TYQPYPSGSDKIGDPFYVYDV', 'Myonycteris_angolensis/3-251': 'TYQPYPAGSDKIGDDFFVYDL', 'Pteronotus_parnellii/3-251': 'IYKPYPSGSDKVGDYFYVYDV', 'Natalus_stramineus/3-251': 'TYQPYPSGSDKIGDDFFVYDM', 'Hipposideros_ruber/3-251': 'TYQPYPAGSDKIGDDFYVYDL', 'Vampyressa_bidens/3-251': 'IYQPYPSGSDKIGDPFYVYDV', 'Rhinolophus_shortridgei/3-251': 'TYEPYPSGSDK

In [11]:
# Editing the headers in dictionary
# Strip everything from the first '/' onwards (e.g. 'NPC1_human/3-251' ->
# 'NPC1_human') so the keys can be used as FASTA headers.
dc_interf_edit = {}
for key, interf_seq in dc_interf.items():
    # Raw string: '\/' in a normal string literal is an invalid escape sequence.
    newkey = re.sub(r'/.*', '', key)
    print(key, newkey)
    if newkey in dc_interf_edit:
        # Two record ids collapsing to the same header would silently replace
        # the earlier sequence; keep last-wins behaviour but make it visible.
        print(f'Warning: duplicated header {newkey!r}; the earlier sequence is overwritten')
    dc_interf_edit[newkey] = interf_seq

NPC1_human/3-251 NPC1_human
Nycteris_hispida/3-251 Nycteris_hispida
Rhinopoma_microphyllum/3-251 Rhinopoma_microphyllum
Pteronotus_quadridens/3-251 Pteronotus_quadridens
Cynopterus_sphinx/3-251 Cynopterus_sphinx
Coleura_afra/3-251 Coleura_afra
Diaemus_youngi/3-251 Diaemus_youngi
Hipposideros_armiger/3-251 Hipposideros_armiger
Desmodus_rotundus/3-251 Desmodus_rotundus
Triaenops_persicus/3-251 Triaenops_persicus
Miniopterus_inflatus/3-251 Miniopterus_inflatus
Hipposideros_jonesi/3-251 Hipposideros_jonesi
Brachyphylla_cavernarum/3-251 Brachyphylla_cavernarum
Myonycteris_angolensis/3-251 Myonycteris_angolensis
Pteronotus_parnellii/3-251 Pteronotus_parnellii
Natalus_stramineus/3-251 Natalus_stramineus
Hipposideros_ruber/3-251 Hipposideros_ruber
Vampyressa_bidens/3-251 Vampyressa_bidens
Rhinolophus_shortridgei/3-251 Rhinolophus_shortridgei
Balantiopteryx_plicata/3-251 Balantiopteryx_plicata
Hipposideros_larvatus/3-251 Hipposideros_larvatus
Mormoops_megalophylla/3-251 Mormoops_megalophylla
Mo

In [12]:
# Write the interface-only sequences in FASTA format: a '>header' line followed
# by the sequence line for each entry of dc_interf_edit.
# The context manager closes the file even if a write fails part-way through.
with open(outfile, 'w') as fl:
    for key, val in dc_interf_edit.items():
        print(f'>{key}', file=fl)
        print(val, file=fl)

In [13]:
# Signal that the last cell of the notebook has been executed
print('Finito')

Finito
