# Table of Contents
 <p><div class="lev1"><a href="#Summary"><span class="toc-item-num">1&nbsp;&nbsp;</span>Summary</a></div><div class="lev2"><a href="#References"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>References</a></div><div class="lev1"><a href="#Imports"><span class="toc-item-num">2&nbsp;&nbsp;</span>Imports</a></div><div class="lev1"><a href="#Download"><span class="toc-item-num">3&nbsp;&nbsp;</span>Download</a></div><div class="lev1"><a href="#Load-data"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load data</a></div><div class="lev2"><a href="#PDB-chain-/-mutation-(DF1)"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>PDB chain / mutation (DF1)</a></div><div class="lev2"><a href="#UniProt-info-(DF2)"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>UniProt info (DF2)</a></div><div class="lev2"><a href="#Pfam-clan-(DF3)"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Pfam clan (DF3)</a></div><div class="lev2"><a href="#Partner-chain-(DF4)"><span class="toc-item-num">4.4&nbsp;&nbsp;</span>Partner chain (DF4)</a></div><div class="lev2"><a href="#Summary"><span class="toc-item-num">4.5&nbsp;&nbsp;</span>Summary</a></div><div class="lev1"><a href="#Save-to-database"><span class="toc-item-num">5&nbsp;&nbsp;</span>Save to database</a></div>

# Summary

- `partner_pdb_chain` is not provided by the input file and is therefore set to null for every row (this affects all **114 rows**); as a consequence `partner_uniprot_id` is null for every row as well. This may have to be adjusted in ELASPIC...

## References

*Predicting free energy changes using structural ensembles*. Alexander Benedix, Caroline M Becker, Bert L de Groot, Amedeo Caflisch & Rainer A Böckmann. Nature Methods 6, 3 - 4 (2009) 
doi: [10.1038/nmeth0109-3](http://doi.org/10.1038/nmeth0109-3)


----

# Imports

In [1]:
%run imports.ipynb

2016-08-23 19:23:18.482217


In [2]:
%run mysqld.ipynb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
2016-08-23 19:23:18.684688


Starting MySQL database...


In [3]:
# Identifier for this notebook; reused below as the working-directory name
# (see INPUT_FILE) and as the MySQL table name passed to `db.import_df`.
NOTEBOOK_NAME = 'benedix_et_al'
# Also publish the name through the process environment so that child
# processes inherit it (which consumer reads it is not visible here).
os.environ['NOTEBOOK_NAME'] = NOTEBOOK_NAME
# Create the working directory; `exist_ok=True` keeps re-runs from failing.
os.makedirs(NOTEBOOK_NAME, exist_ok=True)

# Download

In [4]:
# Remote location of the mutation table that this notebook downloads and parses.
INPUT_URL = (
    "http://bleoberis.bioc.cam.ac.uk/mcsm_ab/static/datasets/blind_test_228mutations.csv"
)
# Local copy of that file, stored inside the notebook's working directory.
INPUT_FILE = op.join(NOTEBOOK_NAME, 'blind_test_228mutations.csv')

In [5]:
%%bash -s $INPUT_URL $INPUT_FILE
# Fetch the input table once and add it to git.
#   $1 - URL to download from
#   $2 - destination path of the local copy
set -ev
INPUT_URL="$1"
INPUT_FILE="$2"

# Test with `-s` (exists AND is non-empty) rather than `-e`, so that an empty
# file left behind by an earlier failed attempt triggers a fresh download.
if [[ ! -s "$INPUT_FILE" ]] ; then
    # `wget -O` creates the output file before any data arrives. Download to a
    # temporary name and rename only after wget succeeded (`set -e` aborts the
    # cell otherwise), so the final path never holds a partial or empty file.
    wget --no-verbose "$INPUT_URL" -O "$INPUT_FILE.part"
    mv "$INPUT_FILE.part" "$INPUT_FILE"
    # git lfs track "./$INPUT_FILE"
    git add -f "./$INPUT_FILE"
fi

INPUT_URL="$1"
INPUT_FILE="$2"

if [[ ! -e "$INPUT_FILE" ]] ; then
    wget -c --no-verbose "$INPUT_URL" -O "$INPUT_FILE"
    # git lfs track "./$INPUT_FILE"
    git add -f "./$INPUT_FILE" 
fi
2016-08-23 19:24:56 URL:http://bleoberis.bioc.cam.ac.uk/mcsm_ab/static/datasets/blind_test_228mutations.csv [9591/9591] -> "benedix_et_al/blind_test_228mutations.csv" [1]


# Load data

In [43]:
# NOTE: despite its .csv extension the file is parsed as tab-separated.
DF = pd.read_csv(INPUT_FILE, sep='\t')

In [44]:
DF.head()

Unnamed: 0,Mutation type,PDB,Mutation,Chain,Predicted,Experimental
0,Original,1JRH.pdb,E27A,L,-2.021,-0.54
1,Original,1JRH.pdb,D28A,L,-1.228,-0.44
2,Original,1JRH.pdb,Y30A,L,-1.8,-1.1
3,Original,1JRH.pdb,K47A,I,-0.309,-3.6
4,Original,1JRH.pdb,Y32A,H,0.322,-1.4


In [45]:
# Drop the rows labelled 'Reverse' in the 'Mutation type' column.
# `.copy()` makes the result an independent frame: the following cells add
# columns to DF, which would otherwise be an assignment on a slice of the raw
# table (SettingWithCopyWarning / undefined view-vs-copy behaviour).
DF = DF[DF['Mutation type'] != 'Reverse'].copy()

In [46]:
DF[['PDB', 'Mutation', 'Chain']].drop_duplicates().shape

(114, 3)

## PDB chain / mutation (DF1)

In [47]:
# Derive the normalised "core" columns from the raw columns of the input file.

# '1JRH.pdb' -> '1jrh'. `regex=False` makes the replacement a literal one; when
# the pattern is interpreted as a regular expression the leading '.' matches
# ANY character, so e.g. 'Xpdb' inside a name would be removed as well.
DF['pdb_id'] = DF['PDB'].str.replace('.pdb', '', regex=False).str.lower()
DF['pdb_chain'] = DF['Chain']
# The input file has no partner-chain column, so the field is left empty.
DF['partner_pdb_chain'] = np.nan
# Mutations are written as '<chain>_<mutation>', e.g. 'L_E27A'.
DF['pdb_mutation'] = DF['Chain'] + '_' + DF['Mutation']
DF['ddg_exp'] = DF['Experimental']

In [48]:
DF.head()
DF.shape[0]

114

In [49]:
DF1_bak = DF.copy()

## UniProt info (DF2)

In [50]:
DF = DF1_bak.copy()

In [51]:
# Distinct PDB IDs referenced by the mutation table, in ascending order,
# followed by how many there are.
pdb_ids = list(set(DF['pdb_id']))
pdb_ids.sort()
print(pdb_ids, len(pdb_ids), sep='\n')

['1ahw', '1dvf', '1jrh', '1vfb', '3hfm']
5


In [52]:
# Build a dict {pdb_id: SIFTS table}, calling the SIFTS helper once per PDB entry.
# NOTE(review): this cell goes through `ascommon`, while the mutation-mapping cell
# below uses `kmtools.pdb_tools.sifts` -- presumably the same package under an
# older name; confirm that both are importable on a fresh kernel.
sifts_dfs = {
    pdb_id: ascommon.pdb_tools.sifts.get_sifts_data(pdb_id)
    for pdb_id in pdb_ids
}

In [53]:
sifts_dfs['1jrh'].head()

Unnamed: 0,comments,is_observed,pdb_aa,pdb_chain,pdb_id,pfam_id,resnum,uniprot_aa,uniprot_id,uniprot_position,residx
0,"T,loop",True,S,L,1jrh,,1,,,,1
1,"T,loop",True,V,L,1jrh,,2,,,,2
2,"T,loop",True,E,L,1jrh,,3,,,,3
3,"E,strand",True,M,L,1jrh,,4,,,,4
4,"E,strand",True,T,L,1jrh,,5,,,,5


In [54]:
# Sanity check: a SIFTS table was obtained for every PDB entry.
assert all(df is not None for df in sifts_dfs.values())

In [55]:
# Development aid: re-import `kmtools` and its SIFTS submodule (presumably so
# that edits to the package source are picked up without restarting the kernel).
# NOTE(review): not needed for a clean top-to-bottom run -- consider removing.
reload(kmtools)
reload(kmtools.pdb_tools.sifts)

<module 'kmtools.pdb_tools.sifts' from '/home/kimlab1/strokach/working/kmtools/kmtools/pdb_tools/sifts.py'>

In [56]:
# Add info to core
def get_sifts_data(pdb_id, pdb_chain, pdb_mutation):
    """Map one PDB mutation to UniProt using the cached SIFTS table.

    The SIFTS table for ``pdb_id`` is taken from the notebook-level
    ``sifts_dfs`` dict and passed, together with the three arguments, to
    ``kmtools.pdb_tools.sifts.convert_pdb_mutations_to_uniprot``.

    Returns the converter's result. If the converter raises ``SIFTSError``
    the error is logged through ``logger`` and an empty dict is returned,
    so every input still yields exactly one record.
    """
    mapping_table = sifts_dfs[pdb_id]
    sifts_module = kmtools.pdb_tools.sifts
    try:
        return sifts_module.convert_pdb_mutations_to_uniprot(
            pdb_id, pdb_chain, pdb_mutation, mapping_table)
    except sifts_module.SIFTSError as error:
        logger.error(error)
        return {}

# Map every row that has complete PDB coordinates (id, chain, mutation).
_mappable = DF[['pdb_id', 'pdb_chain', 'pdb_mutation']].dropna()
results = [get_sifts_data(*x) for x in _mappable.values]
# Give each result the index label of the DF row it was computed from. The
# results are later attached to DF with an index join; with a default
# 0..n-1 index they would be attached to the wrong rows whenever DF's index
# is not exactly 0..n-1 (DF was filtered earlier) or `.dropna()` removed rows.
results_df = pd.DataFrame(results, index=_mappable.index)

No mutation mapping available! (1jrh, L_E27A, L, E27A):
    [{'pdb_chain': 'L', 'uniprot_id': nan, 'pfam_id': nan, 'uniprot_pos': nan, 'uniprot_aa': nan}]
No mutation mapping available! (1jrh, L_D28A, L, D28A):
    [{'pdb_chain': 'L', 'uniprot_id': nan, 'pfam_id': nan, 'uniprot_pos': nan, 'uniprot_aa': nan}]
No mutation mapping available! (1jrh, L_Y30A, L, Y30A):
    [{'pdb_chain': 'L', 'uniprot_id': nan, 'pfam_id': nan, 'uniprot_pos': nan, 'uniprot_aa': nan}]
No mutation mapping available! (1jrh, H_Y32A, H, Y32A):
    [{'pdb_chain': 'H', 'uniprot_id': nan, 'pfam_id': nan, 'uniprot_pos': nan, 'uniprot_aa': nan}]
No mutation mapping available! (1jrh, H_W53A, H, W53A):
    [{'pdb_chain': 'H', 'uniprot_id': nan, 'pfam_id': nan, 'uniprot_pos': nan, 'uniprot_aa': nan}]
No mutation mapping available! (1jrh, H_D54A, H, D54A):
    [{'pdb_chain': 'H', 'uniprot_id': nan, 'pfam_id': nan, 'uniprot_pos': nan, 'uniprot_aa': nan}]
No mutation mapping available! (1jrh, H_D56A, H, D56A):
    [{'pdb_cha

In [57]:
def _to_core_column(column):
    """Drop everything from '_sifts' onwards and singularise 'mutations'."""
    return column.partition('_sifts')[0].replace('mutations', 'mutation')


# Attach the SIFTS mapping columns to the core table (index join) and make
# sure the join neither added nor removed rows.
_before = DF.shape[0]
DF = DF.join(results_df.rename(columns=_to_core_column))
assert _before == DF.shape[0]

In [58]:
DF.head(5)
DF.shape[0]

114

In [59]:
DF2_bak = DF.copy()

## Pfam clan (DF3)

In [60]:
DF = DF2_bak.copy()

In [61]:
# Load the `pfam.pfam_a_clans` table from the remote database into a DataFrame.
pfam_a_clans = pd.read_sql_table('pfam_a_clans', db_remote.engine, schema='pfam')

In [62]:
pfam_a_clans.head()

Unnamed: 0,pfam_id,clan_id,clan_name,pfam_name,pfam_description
0,PF00389,CL0325,Form_Glyc_dh,2-Hacid_dh,"D-isomer specific 2-hydroxyacid dehydrogenase,..."
1,PF00198,CL0149,CoA-acyltrans,2-oxoacid_dh,2-oxoacid dehydrogenases acyltransferase (cata...
2,PF04029,,,2-ph_phosp,2-phosphosulpholactate phosphatase
3,PF03171,CL0029,Cupin,2OG-FeII_Oxy,2OG-Fe(II) oxygenase superfamily
4,PF01073,CL0063,NADP_Rossmann,3Beta_HSD,3-beta hydroxysteroid dehydrogenase/isomerase ...


In [63]:
# Look-up Series indexed by `pfam_id` whose values are the matching `clan_id`.
_pfam_id_to_clan = pfam_a_clans.set_index('pfam_id')['clan_id']
# Rows whose `pfam_id` has no entry in the look-up end up with a null clan.
DF['pfam_clan'] = DF['pfam_id'].map(_pfam_id_to_clan)

In [64]:
DF['pfam_clan'].notnull().sum()

69

In [65]:
DF3_bak = DF.copy()

## Partner chain (DF4)

In [66]:
DF = DF3_bak.copy()

In [67]:
def get_partner_uniprot(partner_chian, sifts_df):
    """Return the UniProt ID that SIFTS assigns to one chain of a structure.

    Args:
        partner_chian: Chain identifier to look up in the ``pdb_chain`` column.
        sifts_df: SIFTS table with ``pdb_chain`` and ``uniprot_id`` columns.

    Returns:
        The single distinct non-null ``uniprot_id`` found for the chain, or
        ``np.nan`` when the chain has none (including when no row matches).

    Raises:
        Exception: If the chain maps to more than one distinct UniProt ID; the
            list of IDs is the exception's argument.
    """
    on_partner_chain = sifts_df['pdb_chain'] == partner_chian
    candidates = (
        sifts_df['uniprot_id'][on_partner_chain]
        .dropna()
        .drop_duplicates()
        .tolist()
    )
    if not candidates:
        return np.nan
    if len(candidates) > 1:
        raise Exception(candidates)
    return candidates[0]

In [69]:
sifts_dfs['1jrh'].head()

Unnamed: 0,comments,is_observed,pdb_aa,pdb_chain,pdb_id,pfam_id,resnum,uniprot_aa,uniprot_id,uniprot_position,residx
0,"T,loop",True,S,L,1jrh,,1,,,,1
1,"T,loop",True,V,L,1jrh,,2,,,,2
2,"T,loop",True,E,L,1jrh,,3,,,,3
3,"E,strand",True,M,L,1jrh,,4,,,,4
4,"E,strand",True,T,L,1jrh,,5,,,,5


In [73]:
get_partner_uniprot('L', sifts_dfs['1jrh'])

'P01837'

In [74]:
# Resolve the partner chain of every row to a UniProt ID, using the SIFTS
# table of that row's PDB entry. Rows with a null partner chain yield NaN.
DF['partner_uniprot_id'] = [
    get_partner_uniprot(chain, sifts_dfs[entry])
    for entry, chain in zip(DF['pdb_id'], DF['partner_pdb_chain'])
]

In [75]:
DF4_bak = DF.copy()

## Summary

In [76]:
# Report how many rows lack each piece of mapping information.
# The null mask is computed once and reused for every count.
_is_missing = DF.isnull()
_rule = '-' * 80

print2("Number of rows:", DF.shape[0])
print(_rule)

# Mutated protein: UniProt ID, UniProt mutation, and rows lacking either one.
print2("Number of missing uniprots:", _is_missing['uniprot_id'].sum())
print2("Number of missing mutations:", _is_missing['uniprot_mutation'].sum())
print2("Number of missing uniprots mutations:",
       _is_missing[['uniprot_id', 'uniprot_mutation']].any(axis=1).sum())
print(_rule)

# Partner protein: rows lacking the partner ID, and rows lacking any of the
# three fields needed to describe the mutation in its interaction.
print2("Number of missing partner uniprots:", _is_missing['partner_uniprot_id'].sum())
print2("Number of missing partner uniprot mutations:",
       _is_missing[['uniprot_id', 'partner_uniprot_id', 'uniprot_mutation']].any(axis=1).sum())
print(_rule)

print2("Number of missing pfams:", _is_missing['pfam_id'].sum())

Number of rows:                                             114
--------------------------------------------------------------------------------
Number of missing uniprots:                                 40
Number of missing mutations:                                40
Number of missing uniprots mutations:                       40
--------------------------------------------------------------------------------
Number of missing partner uniprots:                         114
Number of missing partner uniprot mutations:                114
--------------------------------------------------------------------------------
Number of missing pfams:                                    45


# Save to database

In [77]:
DF = DF4_bak.copy()

In [78]:
DF.head()

Unnamed: 0,Mutation type,PDB,Mutation,Chain,Predicted,Experimental,pdb_id,pdb_chain,partner_pdb_chain,pdb_mutation,ddg_exp,pfam_id,uniprot_id,uniprot_mutation,pfam_clan,partner_uniprot_id
0,Original,1JRH.pdb,E27A,L,-2.021,-0.54,1jrh,L,,L_E27A,-0.54,,,,,
1,Original,1JRH.pdb,D28A,L,-1.228,-0.44,1jrh,L,,L_D28A,-0.44,,,,,
2,Original,1JRH.pdb,Y30A,L,-1.8,-1.1,1jrh,L,,L_Y30A,-1.1,,,,,
3,Original,1JRH.pdb,K47A,I,-0.309,-3.6,1jrh,I,,I_K47A,-3.6,PF01108,P15260,K64A,CL0159,
4,Original,1JRH.pdb,Y32A,H,0.322,-1.4,1jrh,H,,H_Y32A,-1.4,,,,,


In [79]:
# Columns to preview: UniProt-level identifiers first, then PDB-level ones.
columns = (
    ['uniprot_id', 'partner_uniprot_id', 'uniprot_mutation']
    + ['pdb_id', 'pdb_chain', 'partner_pdb_chain', 'pdb_mutation']
)

In [80]:
DF[columns].head()

Unnamed: 0,uniprot_id,partner_uniprot_id,uniprot_mutation,pdb_id,pdb_chain,partner_pdb_chain,pdb_mutation
0,,,,1jrh,L,,L_E27A
1,,,,1jrh,L,,L_D28A
2,,,,1jrh,L,,L_Y30A
3,P15260,,K64A,1jrh,I,,I_K47A
4,,,,1jrh,H,,H_Y32A


In [90]:
DF[DF['partner_pdb_chain'].isnull()].shape

(114, 16)

In [103]:
# Column roles used when collapsing duplicate mutations:
#   groupby_columns - key that identifies one mutation
#   extra_columns   - annotations carried along with the key
#   data_columns    - measurements that are averaged per key
groupby_columns = ['pdb_id', 'pdb_chain', 'pdb_mutation']
extra_columns = ['partner_pdb_chain'] + [
    'uniprot_id', 'partner_uniprot_id', 'uniprot_mutation', 'pfam_id', 'pfam_clan',
]
data_columns = ['ddg_exp']

In [104]:
def _first_value(values):
    """Return the first element of a group, null or not."""
    return values.iloc[0]


# Average over duplicate mutations: one output row per (pdb_id, pdb_chain,
# pdb_mutation); annotation columns keep the group's first value, measurement
# columns are averaged.
# Rows are deliberately NOT filtered with
# .dropna(subset=['pdb_id', 'pdb_chain', 'partner_pdb_chain', 'pdb_mutation'])
# beforehand.
_aggregations = {column: _first_value for column in extra_columns}
_aggregations.update({column: 'mean' for column in data_columns})
df = DF.groupby(groupby_columns).agg(_aggregations).reset_index()

In [106]:
print2('Unique mutations affecting only 1 chain:', df.shape[0])

Unique mutations affecting only 1 chain:                    114


In [107]:
# Store the de-duplicated table in the database under the notebook's name;
# columns are written as key columns, then annotations, then measurements.
_stored_columns = groupby_columns + extra_columns + data_columns
t = db.import_df(df[_stored_columns], NOTEBOOK_NAME)

Loading data into MySQL table: 'benedix_et_al'...
Running locally
Command ran successfully:



In [108]:
# Create two indexes on the imported table; each entry is (columns, flag).
# NOTE(review): the flag presumably selects a UNIQUE index -- confirm against
# the `create_indexes` implementation.
# NOTE(review): `partner_pdb_chain` is null in every row of this table, so if the
# first index is UNIQUE it may not actually enforce uniqueness (MySQL allows
# repeated NULLs in unique indexes) -- confirm this is intended.
t.create_indexes([
    (['pdb_id', 'pdb_chain', 'partner_pdb_chain', 'pdb_mutation'], True),
    (['uniprot_id', 'partner_uniprot_id', 'uniprot_mutation'], False),
])

In [109]:
t.add_idx_column()

114

In [110]:
!ls -lh /home/kimlab1/database_data/biodb/recipes/protein_interaction_energy/notebooks/mysqld/protein_interaction_energy/ab_bind.*

-rw-rw---- 1 strokach kimlab 2.9K Aug 21 20:49 /home/kimlab1/database_data/biodb/recipes/protein_interaction_energy/notebooks/mysqld/protein_interaction_energy/ab_bind.frm
-rw-rw---- 1 strokach kimlab  18K Aug 21 20:49 /home/kimlab1/database_data/biodb/recipes/protein_interaction_energy/notebooks/mysqld/protein_interaction_energy/ab_bind.MYD
-rw-rw---- 1 strokach kimlab  31K Aug 21 20:49 /home/kimlab1/database_data/biodb/recipes/protein_interaction_energy/notebooks/mysqld/protein_interaction_energy/ab_bind.MYI


In [111]:
t.compress()

system_command: 'myisampack --no-defaults '/home/kimlab1/database_data/biodb/recipes/protein_interaction_energy/notebooks/mysqld/protein_interaction_energy/benedix_et_al.MYI''
Remember to run myisamchk -rq on compressed tables
system_command: 'myisamchk -rq '/home/kimlab1/database_data/biodb/recipes/protein_interaction_energy/notebooks/mysqld/protein_interaction_energy/benedix_et_al.MYI''
- check record delete-chain
- recovering (with sort) MyISAM-table '/home/kimlab1/database_data/biodb/recipes/protein_interaction_energy/notebooks/mysqld/protein_interaction_energy/benedix_et_al.MYI'
Data records: 114
- Fixing index 1
- Fixing index 2
- Fixing index 3
File size before: 0.01 MB
File size after: 0.00 MB
File size savings: 0.00 MB (53.92 %)


(CompletedProcess(args=['myisampack', '--no-defaults', '/home/kimlab1/database_data/biodb/recipes/protein_interaction_energy/notebooks/mysqld/protein_interaction_energy/benedix_et_al.MYI'], returncode=0, stdout='Remember to run myisamchk -rq on compressed tables\n', stderr=''),
 CompletedProcess(args=['myisamchk', '-rq', '/home/kimlab1/database_data/biodb/recipes/protein_interaction_energy/notebooks/mysqld/protein_interaction_energy/benedix_et_al.MYI'], returncode=0, stdout="- check record delete-chain\n- recovering (with sort) MyISAM-table '/home/kimlab1/database_data/biodb/recipes/protein_interaction_energy/notebooks/mysqld/protein_interaction_energy/benedix_et_al.MYI'\nData records: 114\n- Fixing index 1\n- Fixing index 2\n- Fixing index 3\n", stderr=''))

In [112]:
!ls -lh /home/kimlab1/database_data/biodb/recipes/protein_interaction_energy/notebooks/mysqld/protein_interaction_energy/ab_bind.*

-rw-rw---- 1 strokach kimlab 2.9K Aug 21 20:49 /home/kimlab1/database_data/biodb/recipes/protein_interaction_energy/notebooks/mysqld/protein_interaction_energy/ab_bind.frm
-rw-rw---- 1 strokach kimlab  18K Aug 21 20:49 /home/kimlab1/database_data/biodb/recipes/protein_interaction_energy/notebooks/mysqld/protein_interaction_energy/ab_bind.MYD
-rw-rw---- 1 strokach kimlab  31K Aug 21 20:49 /home/kimlab1/database_data/biodb/recipes/protein_interaction_energy/notebooks/mysqld/protein_interaction_energy/ab_bind.MYI


In [113]:
print(datetime.datetime.now())

2016-08-23 19:45:01.315830
