In [1]:
import os
import sys
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
from Bio.SeqUtils import seq3, seq1

import difflib
import Levenshtein

sys.path.append('../common')
import data_io_utils
import paths
import constants
import utils
import plot_style_utils

sys.path.append('../A008_analyze_chip_1/')
import A008_common

%reload_ext autoreload
%autoreload 2

In [2]:
# Show the wild-type sequence that the PDB chain sequences are compared against below.
print(constants.BETA_LAC_AA_SEQ) # WT beta lactamase aa seq

MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW


In [3]:
# Length of the WT sequence, for comparison with the lengths of the PDB chain sequences.
len(constants.BETA_LAC_AA_SEQ)

286

In [4]:
# Sanity check that the chip-1 log-fold-enrichment table loads (it is not used further here).
# The CSV was generated in the mlpe-gfp-pilot repository.
RELEVANT_COLS = ['lfe_250', 'lfe_1000', 'lfe_2500', 'seq', 'model', 'ntrain', 'rep', 'special_case']

lfe_data_file = os.path.join(
    data_io_utils.S3_DATA_ROOT,
    'chip_1',
    'A052e_BLAC_log_fold_enrichment.csv',
)

# Presumably mirrors this single S3 object to the same local path -- confirm in data_io_utils.
data_io_utils.sync_s3_path_to_local(lfe_data_file, is_single_file=True)

df = pd.read_csv(lfe_data_file)

# Expand every sequence id into its own feature columns, then attach them by row index.
q = df['id'].apply(
    lambda seq_id: pd.Series(A008_common.split_seq_id_into_features(seq_id))
)
df = df.merge(q, left_index=True, right_index=True)

df_blac = df[RELEVANT_COLS]

print(df_blac.shape)
display(df_blac.head())

(9679, 8)


Unnamed: 0,lfe_250,lfe_1000,lfe_2500,seq,model,ntrain,rep,special_case
0,-1.003346,-2.423684,-2.469612,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,ET_Random_Init_1,96,3,
1,-1.207588,-2.195691,-2.400481,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,OneHot,24,2,
2,-1.449553,-1.769109,-2.618223,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,OneHot,96,0,
3,-1.268493,-2.465868,-2.812826,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,OneHot,96,3,
4,-1.417045,-1.590053,-2.227124,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,ET_Random_Init_1,96,3,


In [5]:
# Two candidate structures to start with: 1zg4 and 5hvi. The FASTA files must be
# downloaded manually into this directory; each is then checked for similarity to our WT.
# NOTE: the variable is called `pdb_17g4`, but it holds the lines of 1zg4.fasta.
pdb_17g4 = !head -n100 1zg4.fasta
pdb_17g4

['>1ZG4:A|PDBID|CHAIN|SEQUENCE',
 'MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLS',
 'RIDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRL',
 'DRWEPELNEAIPNDERDTTMPVAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGS',
 'RGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW']

In [6]:
# Drop the FASTA header line and concatenate the wrapped sequence lines into one string.
# The isinstance guard makes the cell safe to re-run: without it a second execution would
# slice the already-joined string and silently drop its first residue.
if not isinstance(pdb_17g4, str):
    pdb_17g4 = "".join(pdb_17g4[1:])
pdb_17g4

'MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRIDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPVAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW'

In [7]:
# Length of the 1ZG4 chain sequence (compare with the WT length).
len(pdb_17g4)

286

In [8]:
# List the edit operations that turn the WT sequence into the 1ZG4 sequence.
Levenshtein.editops(constants.BETA_LAC_AA_SEQ, pdb_17g4)

[('replace', 81, 81), ('replace', 181, 181)]

In [9]:
# OK, that's not bad. Let's try the next candidate (5hvi).
pdb_5hvi = !head -n100 5hvi.fasta
pdb_5hvi

['>5HVI:A|PDBID|CHAIN|SEQUENCE',
 'HPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEY',
 'SPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTTPAA',
 'MATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTG',
 'SQATMDERNRQIAEIGASLIKHW',
 '>5HVI:B|PDBID|CHAIN|SEQUENCE',
 'HPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEY',
 'SPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTTPAA',
 'MATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTG',
 'SQATMDERNRQIAEIGASLIKHW',
 '>5HVI:C|PDBID|CHAIN|SEQUENCE',
 'HPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEY',
 'SPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTTPAA',
 'MATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTG',
 'SQATMDERNRQIAEIGASLIKHW',
 '>5HVI:D|PDBID|CHAIN|SEQUENCE',
 'HPETLVKVKDAEDQLGA

In [10]:
# Drop the first header line, concatenate the remaining lines, and keep only the text
# before the next '>' header, i.e. the first record in the file.
# The isinstance guard makes the cell safe to re-run: without it a second execution would
# slice the already-joined string and silently drop its first residue.
if not isinstance(pdb_5hvi, str):
    pdb_5hvi = "".join(pdb_5hvi[1:]).split('>')[0]
pdb_5hvi

'HPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTTPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW'

In [11]:
# Length of the first 5HVI record (compare with the WT length).
len(pdb_5hvi)

263

In [12]:
# List the edit operations that turn the WT sequence into the first 5HVI record.
Levenshtein.editops(constants.BETA_LAC_AA_SEQ, pdb_5hvi)

[('delete', 0, 0),
 ('delete', 1, 0),
 ('delete', 2, 0),
 ('delete', 3, 0),
 ('delete', 4, 0),
 ('delete', 5, 0),
 ('delete', 6, 0),
 ('delete', 7, 0),
 ('delete', 8, 0),
 ('delete', 9, 0),
 ('delete', 10, 0),
 ('delete', 11, 0),
 ('delete', 12, 0),
 ('delete', 13, 0),
 ('delete', 14, 0),
 ('delete', 15, 0),
 ('delete', 16, 0),
 ('delete', 17, 0),
 ('delete', 18, 0),
 ('delete', 19, 0),
 ('delete', 20, 0),
 ('delete', 21, 0),
 ('delete', 22, 0),
 ('replace', 179, 156)]

In [13]:
# Ok, we're using 1zg4. Download the structure
!wget https://files.rcsb.org/download/1ZG4.pdb

--2019-12-16 09:54:46--  https://files.rcsb.org/download/1ZG4.pdb
Resolving files.rcsb.org (files.rcsb.org)... 128.6.244.12
Connecting to files.rcsb.org (files.rcsb.org)|128.6.244.12|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/octet-stream]
Saving to: ‘1ZG4.pdb.3’

1ZG4.pdb.3              [ <=>                  ] 204.87K  --.-KB/s   in 0.1s   

2019-12-16 09:54:46 (1.60 MB/s) - ‘1ZG4.pdb.3’ saved [209790]



In [14]:
# Parse the downloaded PDB file and keep its first model; `model` is passed to DSSP later.
p = PDBParser()
structure = p.get_structure("1ZG4", "./1ZG4.pdb")
model = structure[0]

In [15]:
# Text representation of every residue in the first chain of the first model.
all_residues = [
    repr(residue)
    for residue in list(structure[0].get_chains())[0]
]
all_residues

['<Residue HIS het=  resseq=26 icode= >',
 '<Residue PRO het=  resseq=27 icode= >',
 '<Residue GLU het=  resseq=28 icode= >',
 '<Residue THR het=  resseq=29 icode= >',
 '<Residue LEU het=  resseq=30 icode= >',
 '<Residue VAL het=  resseq=31 icode= >',
 '<Residue LYS het=  resseq=32 icode= >',
 '<Residue VAL het=  resseq=33 icode= >',
 '<Residue LYS het=  resseq=34 icode= >',
 '<Residue ASP het=  resseq=35 icode= >',
 '<Residue ALA het=  resseq=36 icode= >',
 '<Residue GLU het=  resseq=37 icode= >',
 '<Residue ASP het=  resseq=38 icode= >',
 '<Residue GLN het=  resseq=39 icode= >',
 '<Residue LEU het=  resseq=40 icode= >',
 '<Residue GLY het=  resseq=41 icode= >',
 '<Residue ALA het=  resseq=42 icode= >',
 '<Residue ARG het=  resseq=43 icode= >',
 '<Residue VAL het=  resseq=44 icode= >',
 '<Residue GLY het=  resseq=45 icode= >',
 '<Residue TYR het=  resseq=46 icode= >',
 '<Residue ILE het=  resseq=47 icode= >',
 '<Residue GLU het=  resseq=48 icode= >',
 '<Residue LEU het=  resseq=49 ico

In [16]:
# Split each residue repr on single spaces and lay the tokens out one residue per row.
# zip(*...) stops at the shortest token list, so every row ends up with the same number
# of columns; only columns 1 and 2 (residue name, het flag) are used afterwards.
all_residues = pd.DataFrame(zip(*[ a.split(' ') for a in all_residues])).T

In [17]:
# Preview only the first rows instead of rendering the whole frame.
all_residues.head(10)

Unnamed: 0,0,1,2,3,4,5
0,<Residue,HIS,het=,,resseq=26,icode=
1,<Residue,PRO,het=,,resseq=27,icode=
2,<Residue,GLU,het=,,resseq=28,icode=
3,<Residue,THR,het=,,resseq=29,icode=
4,<Residue,LEU,het=,,resseq=30,icode=
5,<Residue,VAL,het=,,resseq=31,icode=
6,<Residue,LYS,het=,,resseq=32,icode=
7,<Residue,VAL,het=,,resseq=33,icode=
8,<Residue,LYS,het=,,resseq=34,icode=
9,<Residue,ASP,het=,,resseq=35,icode=


In [18]:
# Keep only the residue-name and het-flag tokens. `.copy()` makes this an independent
# frame, so adding the 'obj' column below no longer triggers SettingWithCopyWarning.
all_residues = all_residues.iloc[:, [1,2]].copy()
# Attach the Residue objects of the first chain, in the same order as the rows.
all_residues['obj'] = list(list(structure[0].get_chains())[0].get_residues())
all_residues

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,1,2,obj
0,HIS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
1,PRO,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
2,GLU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
3,THR,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
4,LEU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
5,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
6,LYS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
7,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
8,LYS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
9,ASP,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."


In [19]:
# Count the het-flag tokens; rows flagged 'het=W' (presumably waters) are dropped in the next step.
all_residues[2].value_counts()

het=     263
het=W    194
Name: 2, dtype: int64

In [20]:
# Drop every row whose het flag is 'het=W'. `.copy()` detaches the result from the
# original frame so that the column assignments in later cells do not raise
# SettingWithCopyWarning.
all_residues = all_residues[all_residues.iloc[:, 1] != 'het=W'].copy()
all_residues

Unnamed: 0,1,2,obj
0,HIS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
1,PRO,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
2,GLU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
3,THR,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
4,LEU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
5,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
6,LYS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
7,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
8,LYS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
9,ASP,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."


In [21]:
# Convert the three-letter residue names (first column) to one-letter codes with seq1.
all_residues['aa'] = all_residues.iloc[:,0].map(seq1)
all_residues

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,1,2,obj,aa
0,HIS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",H
1,PRO,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",P
2,GLU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",E
3,THR,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",T
4,LEU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",L
5,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",V
6,LYS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",K
7,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",V
8,LYS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",K
9,ASP,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",D


In [22]:
# find the catalytic serine S70 coords
# Look the residue up by its PDB residue number instead of a hard-coded offset from the
# first residue (previously `70 - 26`). This keeps the lookup correct if the chain starts
# at another number or the numbering has gaps.
CATALYTIC_SER_RESSEQ = 70
# Residue.get_id() returns (hetero flag, residue number, insertion code).
resseq = all_residues['obj'].map(lambda residue: residue.get_id()[1])
matches = resseq[resseq == CATALYTIC_SER_RESSEQ].index
assert len(matches) == 1, (
    "expected exactly one residue numbered %d, found %d"
    % (CATALYTIC_SER_RESSEQ, len(matches)))
serine_index = matches[0]
assert all_residues.loc[serine_index, 'obj'].get_resname() == 'SER', \
    "residue %d is not a serine" % CATALYTIC_SER_RESSEQ
all_residues.loc[serine_index,:]

1                                                    SER
2                                                   het=
obj    (<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...
aa                                                     S
Name: 44, dtype: object

In [23]:
# Print every atom of the selected serine together with its coordinates.
for a in all_residues.loc[serine_index,'obj'].get_atoms():
    print(a)
    print(a.get_coord())

<Atom N>
[10.113  4.607 35.264]
<Atom CA>
[ 8.811  4.71  35.903]
<Atom C>
[ 7.694  4.002 35.177]
<Atom O>
[ 6.556  3.871 35.663]
<Atom CB>
[ 8.463  6.202 36.058]
<Atom OG>
[ 9.293  6.718 37.108]


In [24]:
# Atom 'OG' is the side chain oxygen (catalytic).
# Its coordinates are the reference point for the distances computed further down.
catalytic_coord = all_residues.loc[serine_index,'obj']['OG'].get_coord()
catalytic_coord

array([ 9.293,  6.718, 37.108], dtype=float32)

In [25]:
def get_coord(x):
    """Return the coordinates of the 'CA' atom of ``x``, or NaN when there is none.

    ``x`` is indexed by atom name (``x['CA']``). If that lookup raises
    ``KeyError``, a short diagnostic and ``x`` itself are printed and
    ``np.nan`` is returned instead of propagating the error.
    """
    try:
        return x['CA'].get_coord()
    except KeyError:
        pass

    # No 'CA' entry: say whether x holds several atoms or just a single one.
    message = "There is a non-CA atom" if len(x) > 1 else "Found one degenerate atom"
    print(message)
    print(x)
    return np.nan

In [26]:
# Set coord to the CA coord (get_coord returns NaN for a residue without a CA atom).
all_residues['coord'] = all_residues['obj'].map(get_coord)
all_residues

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,1,2,obj,aa,coord
0,HIS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",H,"[3.076, 1.906, 8.748]"
1,PRO,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",P,"[6.665, 1.888, 7.407]"
2,GLU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",E,"[6.739, 5.638, 7.272]"
3,THR,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",T,"[6.61, 5.515, 11.036]"
4,LEU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",L,"[9.726, 3.395, 10.977]"
5,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",V,"[11.311, 6.072, 8.894]"
6,LYS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",K,"[10.697, 8.501, 11.764]"
7,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",V,"[11.927, 6.064, 14.356]"
8,LYS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",K,"[15.168, 5.591, 12.48]"
9,ASP,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",D,"[15.503, 9.325, 12.099]"


In [27]:
# Does any column contain a missing value? 'coord' would be True if get_coord returned NaN.
all_residues.isnull().any()

1        False
2        False
obj      False
aa       False
coord    False
dtype: bool

In [28]:
# Nothing degenerate. Confirm that the catalytic serine also got assigned its alpha-carbon coordinate.
all_residues.loc[serine_index,:]

1                                                      SER
2                                                     het=
obj      (<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...
aa                                                       S
coord                                [8.811, 4.71, 35.903]
Name: 44, dtype: object

In [29]:
# now get distance from serine
# Euclidean distance from each residue's 'coord' to catalytic_coord.
all_residues['d_from_active'] = all_residues.coord.map(lambda c: np.linalg.norm(c - catalytic_coord))
all_residues

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,1,2,obj,aa,coord,d_from_active
0,HIS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",H,"[3.076, 1.906, 8.748]",29.429510
1,PRO,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",P,"[6.665, 1.888, 7.407]",30.205709
2,GLU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",E,"[6.739, 5.638, 7.272]",29.964584
3,THR,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",T,"[6.61, 5.515, 11.036]",26.237284
4,LEU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",L,"[9.726, 3.395, 10.977]",26.344999
5,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",V,"[11.311, 6.072, 8.894]",28.293453
6,LYS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",K,"[10.697, 8.501, 11.764]",25.445408
7,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",V,"[11.927, 6.064, 14.356]",22.913300
8,LYS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",K,"[15.168, 5.591, 12.48]",25.344118
9,ASP,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",D,"[15.503, 9.325, 12.099]",25.900015


In [30]:
# Smallest distance to the catalytic coordinate.
all_residues.d_from_active.min()

2.3909029960632324

In [31]:
# Largest distance to the catalytic coordinate.
all_residues.d_from_active.max()

30.20570945739746

In [32]:
# Compute DSSP properties for the first model from the same PDB file.
# NOTE(review): presumably requires the external DSSP executable to be installed -- confirm.
dssp = DSSP(model, "./1ZG4.pdb")

In [33]:
# Iterating the DSSP result yields one tuple per residue; transpose them into
# per-field tuples and keep the first four fields by name.
index, aa, ss, rel_ASA, *rest = zip(*dssp)

In [34]:
# Peek at the first ten residue letters reported by DSSP.
aa[:10]

('H', 'P', 'E', 'T', 'L', 'V', 'K', 'V', 'K', 'D')

In [35]:
# One row per DSSP residue: DSSP index, one-letter residue, secondary-structure code
# and relative accessible surface area.
blac_structure_data = pd.DataFrame(
    dict(dssp_index=index, aa=aa, ss=ss, rel_ASA=rel_ASA)
)

In [36]:
# Preview only the first rows instead of rendering the whole frame.
blac_structure_data.head(10)

Unnamed: 0,dssp_index,aa,ss,rel_ASA
0,1,H,-,0.461957
1,2,P,H,0.558824
2,3,E,H,0.520619
3,4,T,H,0.007042
4,5,L,H,0.280488
5,6,V,H,0.640845
6,7,K,H,0.156098
7,8,V,H,0.000000
8,9,K,H,0.521951
9,10,D,H,0.349693


In [37]:
# Re-key DSSP's per-residue maximum-accessibility table from three-letter to
# one-letter residue codes.
max_ASA = {
    seq1(three_letter): max_acc
    for three_letter, max_acc in dssp.residue_max_acc.items()
}
max_ASA

{'A': 106.0,
 'R': 248.0,
 'N': 157.0,
 'D': 163.0,
 'C': 135.0,
 'Q': 198.0,
 'E': 194.0,
 'G': 84.0,
 'H': 184.0,
 'I': 169.0,
 'L': 164.0,
 'K': 205.0,
 'M': 188.0,
 'F': 197.0,
 'P': 136.0,
 'S': 130.0,
 'T': 142.0,
 'W': 227.0,
 'Y': 222.0,
 'V': 142.0}

In [38]:
# Absolute ASA = relative ASA * the maximum accessibility of that residue type.
# A residue letter missing from max_ASA raises KeyError.
ASA = [
    rel * max_ASA[letter]
    for rel, letter in zip(blac_structure_data['rel_ASA'], blac_structure_data['aa'])
]

ASA[:10]

[85.0,
 76.0,
 101.00000000000001,
 1.0,
 46.00000000000001,
 91.0,
 32.0,
 0.0,
 107.0,
 57.0]

In [39]:
# Store the absolute ASA values (relative ASA times the per-residue maximum) as a column.
blac_structure_data['ASA'] = ASA

In [40]:
# Preview only the first rows instead of rendering the whole frame.
blac_structure_data.head(10)

Unnamed: 0,dssp_index,aa,ss,rel_ASA,ASA
0,1,H,-,0.461957,85.0
1,2,P,H,0.558824,76.0
2,3,E,H,0.520619,101.0
3,4,T,H,0.007042,1.0
4,5,L,H,0.280488,46.0
5,6,V,H,0.640845,91.0
6,7,K,H,0.156098,32.0
7,8,V,H,0.000000,0.0
8,9,K,H,0.521951,107.0
9,10,D,H,0.349693,57.0


In [41]:
# now merge with coords
# Both tables are compared row by row, so first require the same number of rows
# (otherwise a shorter DSSP table would pass the per-row check unnoticed), then
# test that the amino acids are identical at every position.
assert len(blac_structure_data) == len(all_residues), (
    "DSSP returned %d residues but the PDB residue table has %d"
    % (len(blac_structure_data), len(all_residues)))
assert (blac_structure_data['aa'].values == all_residues['aa'].values).all(), \
    "amino acid mismatch between the DSSP table and the PDB residue table"

In [43]:
# Attach the per-residue coordinates and distances by row position, matching the
# positional residue check made before this step. `.values` bypasses index-label
# alignment, so a gap in all_residues' index cannot silently shift values or
# introduce NaNs; a row-count mismatch raises instead.
blac_structure_data['coord'] = all_residues['coord'].values
blac_structure_data['d_from_active'] = all_residues['d_from_active'].values
blac_structure_data

Unnamed: 0,dssp_index,aa,ss,rel_ASA,ASA,coord,d_from_active
0,1,H,-,0.461957,85.0,"[3.076, 1.906, 8.748]",29.429510
1,2,P,H,0.558824,76.0,"[6.665, 1.888, 7.407]",30.205709
2,3,E,H,0.520619,101.0,"[6.739, 5.638, 7.272]",29.964584
3,4,T,H,0.007042,1.0,"[6.61, 5.515, 11.036]",26.237284
4,5,L,H,0.280488,46.0,"[9.726, 3.395, 10.977]",26.344999
5,6,V,H,0.640845,91.0,"[11.311, 6.072, 8.894]",28.293453
6,7,K,H,0.156098,32.0,"[10.697, 8.501, 11.764]",25.445408
7,8,V,H,0.000000,0.0,"[11.927, 6.064, 14.356]",22.913300
8,9,K,H,0.521951,107.0,"[15.168, 5.591, 12.48]",25.344118
9,10,D,H,0.349693,57.0,"[15.503, 9.325, 12.099]",25.900015


In [44]:
# Write the combined table next to the notebook.
# NOTE(review): 'coord' holds array objects, which CSV can only store as text, and the
# default settings also write the row index as an unnamed first column -- confirm that
# downstream readers expect this.
blac_structure_data.to_csv('blac_structure_info.csv')

In [45]:
# Final sanity check: report, per column, whether any value is missing.
# NOTE(review): this check runs after the CSV has already been written.
blac_structure_data.isnull().any(axis=0)

dssp_index       False
aa               False
ss               False
rel_ASA          False
ASA              False
coord            False
d_from_active    False
dtype: bool