In [1]:
import os
import sys
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
from Bio.SeqUtils import seq3, seq1

import difflib
import Levenshtein

sys.path.append('../common')
import data_io_utils
import paths
import constants
import utils
import plot_style_utils

sys.path.append('../A008_analyze_chip_1/')
import A008_common

%reload_ext autoreload
%autoreload 2

In [2]:
# Smoke test for the shared `constants` module: print the reference sequence
# that the structure sequences below are compared against.
print(constants.AVGFP_AA_SEQ) ## WT avGFP aa seq

MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK


In [3]:
# Path of the chip-1 inferred-brightness table under the shared data root.
inf_brightness_data_file = os.path.join(data_io_utils.S3_DATA_ROOT, 'chip_1', 'A052b_Chip_1_inferred_brightness_v2.csv')

# NOTE(review): presumably makes the S3 object available at the local path — confirm in data_io_utils.
data_io_utils.sync_s3_path_to_local(inf_brightness_data_file, is_single_file=True)

df = pd.read_csv(inf_brightness_data_file)

# Expand each sequence id into one column per feature, then join those columns
# back onto the original rows by index.
q = df['id'].apply(lambda s: pd.Series(A008_common.split_seq_id_into_features(s)))
df = df.merge(q, left_index=True, right_index=True)

# Restrict the table to the columns of interest.
RELEVANT_COLS = ['id', 'qfunc', 'seq', 'model', 'ntrain', 'rep', 'special_case']
df_gfp = df[RELEVANT_COLS]

print(df_gfp.shape)
display(df_gfp.head())

(14306, 7)


Unnamed: 0,id,qfunc,seq,model,ntrain,rep,special_case
0,GFP_SimAnneal-ET_Global_Init_2-0024-04-36079a2...,2.205245,MSKGEELFTGVVPILVELDGDVNGHKFSVKTEGPRDATYGKMTPKF...,ET_Global_Init_2,24.0,4.0,
1,GFP_SimAnneal-OneHot-0024-04-4279eeb-seq_idx_2...,1.945739,MSKGEELFTGVVPILVELDGDVNGHKFSVKTEGEGDATYGKLSLKF...,OneHot,24.0,4.0,
2,GFP_SimAnneal-ET_Random_Init_1-0024-04-1989c09...,2.079981,MSKGEELFTGVVPILVELDGDVNGHKFSVKMEGEGNPTGGKLIQKF...,ET_Random_Init_1,24.0,4.0,
3,GFP_SimAnneal-ET_Global_Init_1-0024-04-4502d3-...,2.180828,MSKGEELFTGVVPILVELDGDVNGHKFSVKGIGEGDATMGKLTIRF...,ET_Global_Init_1,24.0,4.0,
4,GFP_SimAnneal-ET_Global_Init_1-0024-04-4502d3-...,2.670093,MSKGEELFTGVVPILVELDGDVNGHKFSVKGQGEGEATYGKLTLKF...,ET_Global_Init_1,24.0,4.0,


In [4]:
df_gfp[df_gfp['seq'] == constants.AVGFP_AA_SEQ]

Unnamed: 0,id,qfunc,seq,model,ntrain,rep,special_case
6492,GFP_SimAnneal-ET_Global_Init_1-0024-00-3e72164...,3.410765,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,ET_Global_Init_1,24.0,0.0,
6493,GFP_SimAnneal-ET_Global_Init_1-0024-03-3764e94...,3.410765,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,ET_Global_Init_1,24.0,3.0,
6494,GFP_SimAnneal-ET_Global_Init_1-0024-04-4502d3-...,3.410765,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,ET_Global_Init_1,24.0,4.0,
6495,GFP_SimAnneal-ET_Global_Init_1-0096-00-3ea5f6e...,3.410765,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,ET_Global_Init_1,96.0,0.0,
6496,GFP_SimAnneal-ET_Global_Init_1-0096-01-2db7371...,3.410765,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,ET_Global_Init_1,96.0,1.0,
6497,GFP_SimAnneal-ET_Global_Init_1-0096-01-2db7371...,3.410765,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,ET_Global_Init_1,96.0,1.0,
6498,GFP_SimAnneal-ET_Global_Init_1-0096-01-2db7371...,3.410765,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,ET_Global_Init_1,96.0,1.0,
6499,GFP_SimAnneal-ET_Global_Init_1-0096-01-2db7371...,3.410765,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,ET_Global_Init_1,96.0,1.0,
6500,GFP_SimAnneal-ET_Global_Init_1-0096-02-341cf5c...,3.410765,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,ET_Global_Init_1,96.0,2.0,
6501,GFP_SimAnneal-ET_Global_Init_1-0096-03-12da09f...,3.410765,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,ET_Global_Init_1,96.0,3.0,


In [5]:
# Download GFP
!wget https://files.rcsb.org/download/1EMA.pdb

--2019-12-16 12:55:30--  https://files.rcsb.org/download/1EMA.pdb
Resolving files.rcsb.org (files.rcsb.org)... 128.6.244.12
Connecting to files.rcsb.org (files.rcsb.org)|128.6.244.12|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/octet-stream]
Saving to: ‘1EMA.pdb.7’

1EMA.pdb.7              [ <=>                  ] 186.92K  --.-KB/s   in 0.07s  

2019-12-16 12:55:30 (2.56 MB/s) - ‘1EMA.pdb.7’ saved [191403]



In [6]:
# Load the PDB FASTA for 1EMA to confirm that its sequence matches ours.
# wget did not work for this URL, so the file was downloaded by hand from:
# https://www.rcsb.org/pdb/download/downloadFastaFiles.do?structureIdList=1EMA&compressionType=uncompressed
# NOTE: `head` returns at most the first 10 lines; the output below shows this file has 4.
pdb_seq = !head 1ema.fasta
pdb_seq


['>1EMA:A|PDBID|CHAIN|SEQUENCE',
 'MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFTYGVQCFSRYPDHMKR',
 'HDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNG',
 'IKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK']

In [7]:
# Skip the FASTA header (first element) and concatenate the wrapped sequence lines.
pdb_seq = "".join(pdb_seq[1:])
pdb_seq

'MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFTYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK'

In [8]:
constants.AVGFP_AA_SEQ == pdb_seq

False

In [9]:
len(constants.AVGFP_AA_SEQ )

238

In [10]:
len(pdb_seq)

238

In [11]:
Levenshtein.editops(constants.AVGFP_AA_SEQ, pdb_seq)

[('replace', 63, 63), ('replace', 64, 64), ('replace', 79, 79)]

In [12]:
# Show the residues behind each difference reported by editops. The positions are read
# from the edit script itself instead of being copied by hand from the previous output,
# so this cell stays correct if either sequence changes.
for edit_op, ours_idx, theirs_idx in Levenshtein.editops(constants.AVGFP_AA_SEQ, pdb_seq):
    if edit_op == 'replace':
        print(f"at position {ours_idx}, ours is {constants.AVGFP_AA_SEQ[ours_idx]} and theirs is {pdb_seq[theirs_idx]}")
    else:
        # Insertions/deletions have no residue on one side, so report the raw operation only.
        print(f"{edit_op} at ours={ours_idx}, theirs={theirs_idx}")

at position 63, ours is L and theirs is F
at position 64, ours is S and theirs is T
at position 79, ours is Q and theirs is R


In [13]:
# Let's see whether any of the other structures is a closer match (1GFL here).
pdb_seq2 = !head 1gfl.fasta
pdb_seq2


['>1GFL:A|PDBID|CHAIN|SEQUENCE',
 'ASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKR',
 'HDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNG',
 'IKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK',
 '>1GFL:B|PDBID|CHAIN|SEQUENCE',
 'ASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKR',
 'HDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNG',
 'IKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK']

In [14]:
# Drop the first header and join the remaining lines, then cut at the next '>' so that
# only chain A is kept (the joined text also contains the chain B header and sequence).
pdb_seq2 = "".join(pdb_seq2[1:]).split('>')[0]
pdb_seq2

'ASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK'

In [15]:
len(pdb_seq2)

238

In [16]:
Levenshtein.editops(constants.AVGFP_AA_SEQ, pdb_seq2)

[('replace', 0, 0), ('replace', 63, 63), ('replace', 79, 79)]

In [17]:
# Check one final candidate structure (2WUR).
pdb_seq3 = !head 2wur.fasta
pdb_seq3

['>2WUR:A|PDBID|CHAIN|SEQUENCE',
 'MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKR',
 'HDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNG',
 'IKVNFKTRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYN']

In [18]:
# Skip the FASTA header (first element) and concatenate the wrapped sequence lines.
pdb_seq3 = "".join(pdb_seq3[1:])
pdb_seq3

'MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKTRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYN'

In [19]:
len(pdb_seq3)

238

In [20]:
Levenshtein.editops(constants.AVGFP_AA_SEQ, pdb_seq3)

[('replace', 79, 79), ('replace', 166, 166), ('replace', 237, 237)]

In [21]:
# Show the residues behind each difference reported by editops. The positions are read
# from the edit script itself instead of being copied by hand from the previous output,
# so this cell stays correct if either sequence changes.
for edit_op, ours_idx, theirs_idx in Levenshtein.editops(constants.AVGFP_AA_SEQ, pdb_seq3):
    if edit_op == 'replace':
        print(f"at position {ours_idx}, ours is {constants.AVGFP_AA_SEQ[ours_idx]} and theirs is {pdb_seq3[theirs_idx]}")
    else:
        # Insertions/deletions have no residue on one side, so report the raw operation only.
        print(f"{edit_op} at ours={ours_idx}, theirs={theirs_idx}")

at position 79, ours is Q and theirs is R
at position 166, ours is I and theirs is T
at position 237, ours is K and theirs is N


In [22]:
# None of the deposited structures checked above matches our reference exactly: each of
# 1EMA, 1GFL and 2WUR differs from constants.AVGFP_AA_SEQ at three positions.
# Proceed on the assumption that no closer structure is available.
#
# Use 2wur because it is the highest resolution (original author's note; resolution is of
# limited value when the sequence itself differs from ours).
p = PDBParser()
structure = p.get_structure("2wur", "./2wur.pdb")


In [23]:
# Peek at the final entry of the first chain of model 0 to see what the chain ends with.
repr(list(list(structure[0].get_chains())[0])[-1])

'<Residue HOH het=W resseq=2320 icode= >'

In [24]:
list(structure[0].get_residues())[0]['CA'].get_coord()

array([27.02 ,  8.242, 15.493], dtype=float32)

In [25]:
model = structure[0]

In [26]:
dssp = DSSP(model, "./2wur.pdb")

In [27]:
# The DSSP result is an awkward list of tuples, one tuple per residue.
# Transpose it into per-field sequences: the first four fields are kept by name and
# every remaining field is collected in `rest`.
index, aa, ss, rel_ASA, *rest = zip(*list(dssp))

In [28]:
aa

('K',
 'G',
 'E',
 'E',
 'L',
 'F',
 'T',
 'G',
 'V',
 'V',
 'P',
 'I',
 'L',
 'V',
 'E',
 'L',
 'D',
 'G',
 'D',
 'V',
 'N',
 'G',
 'H',
 'K',
 'F',
 'S',
 'V',
 'S',
 'G',
 'E',
 'G',
 'E',
 'G',
 'D',
 'A',
 'T',
 'Y',
 'G',
 'K',
 'L',
 'T',
 'L',
 'K',
 'F',
 'I',
 'C',
 'T',
 'T',
 'G',
 'K',
 'L',
 'P',
 'V',
 'P',
 'W',
 'P',
 'T',
 'L',
 'V',
 'T',
 'T',
 'L',
 'V',
 'Q',
 'C',
 'F',
 'S',
 'R',
 'Y',
 'P',
 'D',
 'H',
 'M',
 'K',
 'R',
 'H',
 'D',
 'F',
 'F',
 'K',
 'S',
 'A',
 'M',
 'P',
 'E',
 'G',
 'Y',
 'V',
 'Q',
 'E',
 'R',
 'T',
 'I',
 'F',
 'F',
 'K',
 'D',
 'D',
 'G',
 'N',
 'Y',
 'K',
 'T',
 'R',
 'A',
 'E',
 'V',
 'K',
 'F',
 'E',
 'G',
 'D',
 'T',
 'L',
 'V',
 'N',
 'R',
 'I',
 'E',
 'L',
 'K',
 'G',
 'I',
 'D',
 'F',
 'K',
 'E',
 'D',
 'G',
 'N',
 'I',
 'L',
 'G',
 'H',
 'K',
 'L',
 'E',
 'Y',
 'N',
 'Y',
 'N',
 'S',
 'H',
 'N',
 'V',
 'Y',
 'I',
 'M',
 'A',
 'D',
 'K',
 'Q',
 'K',
 'N',
 'G',
 'I',
 'K',
 'V',
 'N',
 'F',
 'K',
 'T',
 'R',
 'H',
 'N',
 'I',
 'E'

In [29]:


# Assemble the named DSSP fields into a table with one row per residue.
gfp_structure_data = pd.DataFrame({'dssp_index': index, 'aa': aa, 'ss': ss, 'rel_ASA': rel_ASA})

In [30]:
gfp_structure_data

Unnamed: 0,dssp_index,aa,ss,rel_ASA
0,1,K,-,0.687805
1,2,G,G,0.059524
2,3,E,G,0.247423
3,4,E,G,0.711340
4,5,L,G,0.469512
5,6,F,G,0.020305
6,7,T,S,0.711268
7,8,G,S,0.392857
8,9,V,-,0.528169
9,10,V,E,0.070423


In [31]:
# The relative accessibility is the absolute accessibility divided by the
# maximum accessibility observed for that residue in a G-X-G peptide. See this discussion:
# https://www.researchgate.net/post/Does_class_DSSP_of_biopython_gives_the_relative_solvent_accessibility_value_of_amino_acids
# We want to keep the absolute accessibility as well, just in case, which means
# multiplying the relative value by the per-residue maximum shown here.
dssp.residue_max_acc

{'ALA': 106.0,
 'ARG': 248.0,
 'ASN': 157.0,
 'ASP': 163.0,
 'CYS': 135.0,
 'GLN': 198.0,
 'GLU': 194.0,
 'GLY': 84.0,
 'HIS': 184.0,
 'ILE': 169.0,
 'LEU': 164.0,
 'LYS': 205.0,
 'MET': 188.0,
 'PHE': 197.0,
 'PRO': 136.0,
 'SER': 130.0,
 'THR': 142.0,
 'TRP': 227.0,
 'TYR': 222.0,
 'VAL': 142.0}

In [32]:
# Re-key the maximum-accessibility table by one-letter amino-acid code so it can be
# looked up with the `aa` column of gfp_structure_data.
max_ASA = dict(
    (seq1(three_letter), max_acc)
    for three_letter, max_acc in dssp.residue_max_acc.items()
)
max_ASA

{'A': 106.0,
 'R': 248.0,
 'N': 157.0,
 'D': 163.0,
 'C': 135.0,
 'Q': 198.0,
 'E': 194.0,
 'G': 84.0,
 'H': 184.0,
 'I': 169.0,
 'L': 164.0,
 'K': 205.0,
 'M': 188.0,
 'F': 197.0,
 'P': 136.0,
 'S': 130.0,
 'T': 142.0,
 'W': 227.0,
 'Y': 222.0,
 'V': 142.0}

In [33]:
# Recover absolute accessibility per residue: relative value times the maximum
# accessibility for that residue's amino acid.
ASA = [
    rel_value * max_ASA[residue_letter]
    for rel_value, residue_letter in zip(gfp_structure_data['rel_ASA'], gfp_structure_data['aa'])
]

ASA[:10]

[141.0,
 5.0,
 48.0,
 138.0,
 77.0,
 3.9999999999999996,
 101.00000000000001,
 33.0,
 75.0,
 10.0]

In [34]:
gfp_structure_data['ASA'] = ASA

In [35]:
gfp_structure_data

Unnamed: 0,dssp_index,aa,ss,rel_ASA,ASA
0,1,K,-,0.687805,141.0
1,2,G,G,0.059524,5.0
2,3,E,G,0.247423,48.0
3,4,E,G,0.711340,138.0
4,5,L,G,0.469512,77.0
5,6,F,G,0.020305,4.0
6,7,T,S,0.711268,101.0
7,8,G,S,0.392857,33.0
8,9,V,-,0.528169,75.0
9,10,V,E,0.070423,10.0


In [36]:
# Text representation of every entry in the first chain of model 0
# (residue name, het flag, sequence number, insertion code).
all_residues = [repr(chain_entry) for chain_entry in list(structure[0].get_chains())[0]]
all_residues

['<Residue LYS het=  resseq=3 icode= >',
 '<Residue GLY het=  resseq=4 icode= >',
 '<Residue GLU het=  resseq=5 icode= >',
 '<Residue GLU het=  resseq=6 icode= >',
 '<Residue LEU het=  resseq=7 icode= >',
 '<Residue PHE het=  resseq=8 icode= >',
 '<Residue THR het=  resseq=9 icode= >',
 '<Residue GLY het=  resseq=10 icode= >',
 '<Residue VAL het=  resseq=11 icode= >',
 '<Residue VAL het=  resseq=12 icode= >',
 '<Residue PRO het=  resseq=13 icode= >',
 '<Residue ILE het=  resseq=14 icode= >',
 '<Residue LEU het=  resseq=15 icode= >',
 '<Residue VAL het=  resseq=16 icode= >',
 '<Residue GLU het=  resseq=17 icode= >',
 '<Residue LEU het=  resseq=18 icode= >',
 '<Residue ASP het=  resseq=19 icode= >',
 '<Residue GLY het=  resseq=20 icode= >',
 '<Residue ASP het=  resseq=21 icode= >',
 '<Residue VAL het=  resseq=22 icode= >',
 '<Residue ASN het=  resseq=23 icode= >',
 '<Residue GLY het=  resseq=24 icode= >',
 '<Residue HIS het=  resseq=25 icode= >',
 '<Residue LYS het=  resseq=26 icode= >',

In [37]:
# Split every repr string on single spaces and tabulate the tokens.
# zip(*...) transposes the token lists (stopping at the shortest list) and .T turns the
# result back into one row per residue. Column 1 holds the residue name, column 2 the het flag.
all_residues = pd.DataFrame(zip(*[ a.split(' ') for a in all_residues])).T

In [38]:
# Keep only the residue-name and het-flag columns, then attach the residue objects of the
# first chain so that atom coordinates can be read from them later.
all_residues = all_residues.iloc[:, [1,2]]
all_residues['obj'] = list(list(structure[0].get_chains())[0].get_residues())
all_residues

Unnamed: 0,1,2,obj
0,LYS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
1,GLY,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
2,GLU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
3,GLU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
4,LEU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
5,PHE,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
6,THR,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
7,GLY,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
8,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
9,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."


In [39]:
# Remove non-protein hetero entries from the residue table. 'het=W' marks waters (HOH);
# H_EOH and H_IPA are presumably solvent molecules — TODO confirm against the PDB file.
NON_PROTEIN_HET_FLAGS = ['het=W', 'het=H_EOH', 'het=H_IPA']

# .copy() makes the filtered table an independent frame, so adding columns to it in the
# following cells no longer raises SettingWithCopyWarning.
all_residues = all_residues[~all_residues.iloc[:, 1].isin(NON_PROTEIN_HET_FLAGS)].copy()
all_residues

Unnamed: 0,1,2,obj
0,LYS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
1,GLY,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
2,GLU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
3,GLU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
4,LEU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
5,PHE,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
6,THR,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
7,GLY,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
8,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."
9,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato..."


In [40]:
# Add the one-letter amino-acid code for every residue name. assign() returns a new frame
# rather than writing into a filtered slice, which avoids SettingWithCopyWarning.
all_residues = all_residues.assign(aa=all_residues.iloc[:, 0].map(seq1))

# Show the entries that are still flagged as hetero residues after the solvent filter.
all_residues[~(all_residues.iloc[:,1] == 'het=')]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,1,2,obj,aa
62,GYS,het=H_GYS,"(<Atom N>, <Atom OG1>, <Atom CB1>, <Atom CA1>,...",X


In [41]:
all_residues.loc[0,'obj']['CA'].get_coord()

array([27.02 ,  8.242, 15.493], dtype=float32)

In [42]:
# Pick the chromophore residue object: the first remaining entry whose het flag is not
# the plain 'het=' of a standard residue. Its centroid is computed in the next cells.
chrom = all_residues.loc[all_residues.iloc[:, 1] != 'het=', 'obj'].values[0]
chrom

<Residue GYS het=H_GYS resseq=66 icode= >

In [43]:
# Collect the coordinates of every atom in the chromophore residue, one row per atom.
chrom_coords = np.array([a.get_coord() for a in chrom.get_atoms()])
chrom_coords

array([[ 2.5403e+01,  2.6208e+01, -2.5900e-01],
       [ 2.8385e+01,  2.4079e+01, -6.5300e-01],
       [ 2.7437e+01,  2.5077e+01, -9.8200e-01],
       [ 2.6471e+01,  2.5277e+01,  1.5000e-01],
       [ 2.7133e+01,  2.5821e+01,  1.3540e+00],
       [ 2.8296e+01,  2.6384e+01,  1.3730e+00],
       [ 2.6517e+01,  2.5828e+01,  2.6120e+00],
       [ 2.7359e+01,  2.6454e+01,  3.4960e+00],
       [ 2.7074e+01,  2.6669e+01,  4.6990e+00],
       [ 2.8524e+01,  2.6818e+01,  2.7050e+00],
       [ 2.5195e+01,  2.5430e+01,  2.9570e+00],
       [ 2.9600e+01,  2.7548e+01,  3.2020e+00],
       [ 3.0755e+01,  2.8041e+01,  2.5560e+00],
       [ 3.1063e+01,  2.7844e+01,  1.1770e+00],
       [ 3.1639e+01,  2.8809e+01,  3.2810e+00],
       [ 3.2189e+01,  2.8334e+01,  6.3000e-01],
       [ 3.2768e+01,  2.9371e+01,  2.7080e+00],
       [ 3.3097e+01,  2.9125e+01,  1.3680e+00],
       [ 3.4211e+01,  2.9594e+01,  8.2700e-01],
       [ 2.5090e+01,  2.4076e+01,  3.6480e+00],
       [ 2.4124e+01,  2.3864e+01,  4.427

In [44]:
# Unweighted mean over the atoms (axis 0) gives the chromophore centroid.
chrom_centroid = np.mean(chrom_coords, axis=0)
chrom_centroid

array([28.781826 , 26.768589 ,  1.6456766], dtype=float32)

In [45]:
def get_coord(x):
    """Return a representative coordinate for residue ``x``.

    If ``x`` has a 'CA' entry, its coordinate is returned. Otherwise (KeyError on
    the 'CA' lookup) the result depends on the number of entries in ``x``:

    * more than one entry: a message is printed and the module-level
      ``chrom_centroid`` is returned;
    * one entry or fewer: a message is printed and ``np.nan`` is returned.
    """
    try:
        ca_coord = x['CA'].get_coord()
    except KeyError:
        has_several_atoms = len(x) > 1
        if not has_several_atoms:
            print("Found one degenerate atom")
            return np.nan
        print("Setting one to the centroid")
        return chrom_centroid
    else:
        return ca_coord

In [46]:
# Give every residue a representative coordinate via get_coord: the CA coordinate when the
# residue has one, the chromophore centroid for a multi-atom residue without CA, and NaN
# for a single-atom (degenerate) residue.
# assign() returns a new frame rather than writing into a filtered slice, which avoids
# SettingWithCopyWarning.
all_residues = all_residues.assign(coord=all_residues['obj'].map(get_coord))
all_residues

Setting one to the centroid
Found one degenerate atom


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,1,2,obj,aa,coord
0,LYS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",K,"[27.02, 8.242, 15.493]"
1,GLY,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",G,"[26.167, 10.97, 12.931]"
2,GLU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",E,"[29.631, 10.956, 11.452]"
3,GLU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",E,"[29.066, 7.359, 10.277]"
4,LEU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",L,"[26.287, 8.517, 7.884]"
5,PHE,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",F,"[28.984, 10.302, 5.854]"
6,THR,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",T,"[31.488, 7.478, 5.112]"
7,GLY,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",G,"[30.325, 7.175, 1.539]"
8,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",V,"[28.861, 9.284, -1.231]"
9,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",V,"[25.613, 10.895, -0.073]"


In [47]:
gfp_structure_data

Unnamed: 0,dssp_index,aa,ss,rel_ASA,ASA
0,1,K,-,0.687805,141.0
1,2,G,G,0.059524,5.0
2,3,E,G,0.247423,48.0
3,4,E,G,0.711340,138.0
4,5,L,G,0.469512,77.0
5,6,F,G,0.020305,4.0
6,7,T,S,0.711268,101.0
7,8,G,S,0.392857,33.0
8,9,V,-,0.528169,75.0
9,10,V,E,0.070423,10.0


In [48]:
# The last glycine is degenerate: get_coord found no usable coordinate for it and stored NaN.
# Drop rows by that NaN marker instead of by position, so the cell does not depend on the
# degenerate residue being the final row and does not remove a valid residue when re-run.
# .copy() keeps later column assignments from raising SettingWithCopyWarning.
all_residues = all_residues[all_residues['coord'].notnull()].copy()
all_residues

Unnamed: 0,1,2,obj,aa,coord
0,LYS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",K,"[27.02, 8.242, 15.493]"
1,GLY,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",G,"[26.167, 10.97, 12.931]"
2,GLU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",E,"[29.631, 10.956, 11.452]"
3,GLU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",E,"[29.066, 7.359, 10.277]"
4,LEU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",L,"[26.287, 8.517, 7.884]"
5,PHE,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",F,"[28.984, 10.302, 5.854]"
6,THR,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",T,"[31.488, 7.478, 5.112]"
7,GLY,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",G,"[30.325, 7.175, 1.539]"
8,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",V,"[28.861, 9.284, -1.231]"
9,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",V,"[25.613, 10.895, -0.073]"


In [49]:
all_residues[all_residues.isnull().any(axis=1)]

Unnamed: 0,1,2,obj,aa,coord


In [50]:
# Euclidean distance from each residue's representative coordinate to the chromophore centroid.
# assign() returns a new frame rather than writing into a filtered slice, which avoids
# SettingWithCopyWarning.
all_residues = all_residues.assign(
    d_from_active=all_residues.coord.map(lambda x: np.linalg.norm(chrom_centroid - x))
)
all_residues

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,1,2,obj,aa,coord,d_from_active
0,LYS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",K,"[27.02, 8.242, 15.493]",23.196699
1,GLY,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",G,"[26.167, 10.97, 12.931]",19.590591
2,GLU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",E,"[29.631, 10.956, 11.452]",18.625870
3,GLU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",E,"[29.066, 7.359, 10.277]",21.244120
4,LEU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",L,"[26.287, 8.517, 7.884]",19.448942
5,PHE,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",F,"[28.984, 10.302, 5.854]",16.997042
6,THR,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",T,"[31.488, 7.478, 5.112]",19.785490
7,GLY,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",G,"[30.325, 7.175, 1.539]",19.654554
8,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",V,"[28.861, 9.284, -1.231]",17.719830
9,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",V,"[25.613, 10.895, -0.073]",16.277781


In [51]:
all_residues[all_residues.isnull().any(axis=1)]

Unnamed: 0,1,2,obj,aa,coord,d_from_active


In [52]:
all_residues.iloc[55:68,:]

Unnamed: 0,1,2,obj,aa,coord,d_from_active
55,PRO,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",P,"[28.458, 33.815, -6.377]",10.6827
56,THR,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",T,"[26.373, 35.692, -3.814]",10.734887
57,LEU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",L,"[24.399, 32.48, -3.129]",8.638686
58,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",V,"[27.235, 29.969, -2.8]",5.692042
59,THR,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",T,"[27.489, 30.07, 0.977]",3.608024
60,THR,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",T,"[23.729, 29.902, 1.428]",5.949512
61,LEU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",L,"[23.215, 26.996, -0.937]",6.140967
62,GYS,het=H_GYS,"(<Atom N>, <Atom OG1>, <Atom CB1>, <Atom CA1>,...",X,"[28.781826, 26.768589, 1.6456766]",0.0
63,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",V,"[25.95, 21.793, 3.819]",6.123648
64,GLN,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",Q,"[29.01, 22.067, 6.038]",6.438129


In [53]:
len(gfp_structure_data)

226

In [54]:
len(all_residues)

227

In [55]:
# there is one difference. Find it.
Levenshtein.editops("".join(gfp_structure_data.aa), "".join(all_residues.aa))

[('insert', 62, 62)]

In [56]:
gfp_structure_data.iloc[60:65,:]

Unnamed: 0,dssp_index,aa,ss,rel_ASA,ASA
60,61,T,T,0.028169,4.0
61,62,L,-,0.042683,7.0
62,64,V,-,0.133803,19.0
63,65,Q,G,0.126263,25.0
64,66,C,G,0.0,0.0


In [57]:
all_residues.iloc[60:65,:]

Unnamed: 0,1,2,obj,aa,coord,d_from_active
60,THR,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",T,"[23.729, 29.902, 1.428]",5.949512
61,LEU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",L,"[23.215, 26.996, -0.937]",6.140967
62,GYS,het=H_GYS,"(<Atom N>, <Atom OG1>, <Atom CB1>, <Atom CA1>,...",X,"[28.781826, 26.768589, 1.6456766]",0.0
63,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",V,"[25.95, 21.793, 3.819]",6.123648
64,GLN,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",Q,"[29.01, 22.067, 6.038]",6.438129


In [58]:
# Delete the chromophore so that all_residues lines up with the DSSP table, which has no
# entry for it. The row is selected by its het flag (standard residues carry the plain
# 'het=' flag) instead of the hard-coded index label 62, so re-running the cell is a no-op
# rather than a KeyError and the cell does not depend on the exact row numbering.
all_residues = all_residues[all_residues.iloc[:, 1] == 'het='].copy()
all_residues

Unnamed: 0,1,2,obj,aa,coord,d_from_active
0,LYS,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",K,"[27.02, 8.242, 15.493]",23.196699
1,GLY,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",G,"[26.167, 10.97, 12.931]",19.590591
2,GLU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",E,"[29.631, 10.956, 11.452]",18.625870
3,GLU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",E,"[29.066, 7.359, 10.277]",21.244120
4,LEU,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",L,"[26.287, 8.517, 7.884]",19.448942
5,PHE,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",F,"[28.984, 10.302, 5.854]",16.997042
6,THR,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",T,"[31.488, 7.478, 5.112]",19.785490
7,GLY,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",G,"[30.325, 7.175, 1.539]",19.654554
8,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",V,"[28.861, 9.284, -1.231]",17.719830
9,VAL,het=,"(<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Ato...",V,"[25.613, 10.895, -0.073]",16.277781


In [59]:
all_residues[all_residues.isnull().any(axis=1)]

Unnamed: 0,1,2,obj,aa,coord,d_from_active


In [60]:
# Re-check after removing the chromophore row: an empty edit list means the DSSP residue
# sequence and the all_residues sequence are now identical.
Levenshtein.editops("".join(gfp_structure_data.aa), "".join(all_residues.aa))

[]

In [61]:
gfp_structure_data.aa.values == all_residues.aa.values

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [62]:
gfp_structure_data[gfp_structure_data.isnull().any(axis=1)]

Unnamed: 0,dssp_index,aa,ss,rel_ASA,ASA


In [63]:
# Copy the geometry columns across by position (raw values, not index alignment);
# the two tables were checked above to list the same residues in the same order.
for geometry_col in ('d_from_active', 'coord'):
    gfp_structure_data[geometry_col] = all_residues[geometry_col].values
gfp_structure_data

Unnamed: 0,dssp_index,aa,ss,rel_ASA,ASA,d_from_active,coord
0,1,K,-,0.687805,141.0,23.196699,"[27.02, 8.242, 15.493]"
1,2,G,G,0.059524,5.0,19.590591,"[26.167, 10.97, 12.931]"
2,3,E,G,0.247423,48.0,18.625870,"[29.631, 10.956, 11.452]"
3,4,E,G,0.711340,138.0,21.244120,"[29.066, 7.359, 10.277]"
4,5,L,G,0.469512,77.0,19.448942,"[26.287, 8.517, 7.884]"
5,6,F,G,0.020305,4.0,16.997042,"[28.984, 10.302, 5.854]"
6,7,T,S,0.711268,101.0,19.785490,"[31.488, 7.478, 5.112]"
7,8,G,S,0.392857,33.0,19.654554,"[30.325, 7.175, 1.539]"
8,9,V,-,0.528169,75.0,17.719830,"[28.861, 9.284, -1.231]"
9,10,V,E,0.070423,10.0,16.277781,"[25.613, 10.895, -0.073]"


In [64]:
all_residues[all_residues.isnull().any(axis=1)]

Unnamed: 0,1,2,obj,aa,coord,d_from_active


In [65]:
gfp_structure_data[gfp_structure_data.isnull().any(axis=1)]

Unnamed: 0,dssp_index,aa,ss,rel_ASA,ASA,d_from_active,coord


In [66]:
# Persist the per-residue structure table to the working directory.
# NOTE(review): the `coord` column holds numpy arrays, so it is written as their text form
# and will not read back as numeric data — consider separate x/y/z columns if it is needed downstream.
gfp_structure_data.to_csv('gfp_structure_info.csv')

In [67]:
# Print each row position followed by its stored coordinate, one value per line.
for i, c in enumerate(gfp_structure_data.coord):
    print(i, c, sep="\n")

0
[27.02   8.242 15.493]
1
[26.167 10.97  12.931]
2
[29.631 10.956 11.452]
3
[29.066  7.359 10.277]
4
[26.287  8.517  7.884]
5
[28.984 10.302  5.854]
6
[31.488  7.478  5.112]
7
[30.325  7.175  1.539]
8
[28.861  9.284 -1.231]
9
[25.613 10.895 -0.073]
10
[23.024 12.477 -2.403]
11
[22.266 16.095 -1.78 ]
12
[19.1   18.152 -2.512]
13
[18.948 21.902 -1.995]
14
[15.797 24.043 -2.238]
15
[15.814 27.799 -1.804]
16
[13.085 30.426 -2.001]
17
[14.293 34.004 -2.053]
18
[13.302 37.621 -2.198]
19
[16.05  40.226 -2.822]
20
[14.774 43.806 -3.012]
21
[11.4   42.298 -4.045]
22
[12.818 40.116 -6.798]
23
[11.34  36.674 -6.165]
24
[13.281 33.572 -7.171]
25
[13.727 29.925 -6.407]
26
[16.733 27.646 -6.78 ]
27
[16.873 23.858 -6.784]
28
[20.13  21.954 -6.461]
29
[21.13  18.35  -6.827]
30
[24.472 16.586 -6.437]
31
[26.543 14.519 -4.098]
32
[28.92  14.826 -1.211]
33
[31.764 12.703  0.034]
34
[32.87  13.986  3.42 ]
35
[35.71  11.444  3.5  ]
36
[37.386 13.791  0.948]
37
[35.761 17.004  2.105]
38
[34.088 17.126 -1.2

In [68]:
gfp_structure_data[gfp_structure_data.isnull().any(axis=1)]

Unnamed: 0,dssp_index,aa,ss,rel_ASA,ASA,d_from_active,coord
