# Table of Contents
 <p><div class="lev1"><a href="#Summary"><span class="toc-item-num">1&nbsp;&nbsp;</span>Summary</a></div><div class="lev1"><a href="#Imports"><span class="toc-item-num">2&nbsp;&nbsp;</span>Imports</a></div><div class="lev1"><a href="#Download"><span class="toc-item-num">3&nbsp;&nbsp;</span>Download</a></div><div class="lev1"><a href="#Load-data"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load data</a></div><div class="lev2"><a href="#PDB-chain-/-mutation-(DF1)"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>PDB chain / mutation (DF1)</a></div><div class="lev2"><a href="#UniProt-info-(DF2)"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>UniProt info (DF2)</a></div><div class="lev2"><a href="#Pfam-clan-(DF3)"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Pfam clan (DF3)</a></div><div class="lev2"><a href="#Partner-chain-(DF4)"><span class="toc-item-num">4.4&nbsp;&nbsp;</span>Partner chain (DF4)</a></div><div class="lev2"><a href="#Summary"><span class="toc-item-num">4.5&nbsp;&nbsp;</span>Summary</a></div><div class="lev1"><a href="#Save-to-database"><span class="toc-item-num">5&nbsp;&nbsp;</span>Save to database</a></div>

# Summary


[Mechanism of Neutralization by the Broadly Neutralizing HIV-1 Monoclonal Antibody VRC01](http://doi.org/10.1128/JVI.00754-11)

----

# Imports

In [1]:
%run imports.ipynb

2016-08-24 02:29:46.255756


In [2]:
%run mysqld.ipynb

MySQL database already running...


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
2016-08-24 02:29:46.422118


In [3]:
NOTEBOOK_NAME = 'hiv_escape_mutations'
os.environ['NOTEBOOK_NAME'] = NOTEBOOK_NAME
os.makedirs(NOTEBOOK_NAME, exist_ok=True)

# Download

Convert figures found [here](http://jvi.asm.org/content/85/17/8954/F1.expansion.html) to CSV files using [Online OCR](http://www.onlineocr.net/).

In [4]:
INPUT_FILE = op.abspath(op.join(NOTEBOOK_NAME, 'HIV_escape_mutations_from_pictures.csv'))

# Load data

In [5]:
DF = pd.read_csv(INPUT_FILE, sep='\t', names=['mutation', 'score'], na_values=['ND'])

In [6]:
DF.head()

Unnamed: 0,mutation,score
0,E87A,98.0
1,M95A,82.0
2,K97A,88.0
3,E102A,100.0
4,W112A,105.0


In [7]:
DF['mutation'].drop_duplicates().shape

(119,)

## PDB chain / mutation (DF1)

In [8]:
def fix_mutation(mutation):
    mutation_pos = int(mutation[1:-1])
    # if mutation_pos > 301 and mutation_pos < 350:
    mutation_pos = mutation_pos
    return mutation[0] + str(mutation_pos) + mutation[-1]

In [9]:
DF['pdb_id'] = '3ngb'
DF['pdb_chain'] = 'G'
DF['partner_pdb_chain'] = np.nan
# The provided mutations are refersed, don't know why...
DF['pdb_mutation'] = DF['pdb_chain'] + '_' + DF['mutation'].apply(fix_mutation)
DF['ddg_exp'] = np.log10(DF['score'])

In [10]:
display(DF.head())
print(DF.shape[0])

Unnamed: 0,mutation,score,pdb_id,pdb_chain,partner_pdb_chain,pdb_mutation,ddg_exp
0,E87A,98.0,3ngb,G,,G_E87A,1.991226
1,M95A,82.0,3ngb,G,,G_M95A,1.913814
2,K97A,88.0,3ngb,G,,G_K97A,1.944483
3,E102A,100.0,3ngb,G,,G_E102A,2.0
4,W112A,105.0,3ngb,G,,G_W112A,2.021189


119


In [11]:
DF1_bak = DF.copy()

## UniProt info (DF2)

In [12]:
DF = DF1_bak.copy()

In [13]:
pdb_id = '3ngb'

In [14]:
sifts_df = ascommon.pdb_tools.sifts.get_sifts_data(pdb_id)

In [15]:
sifts_df.head()

Unnamed: 0,comments,is_observed,pdb_aa,pdb_chain,pdb_id,pfam_id,resnum,uniprot_aa,uniprot_id,uniprot_position,residx
0,"T,loop",True,V,G,3ngb,PF00516,44,V,Q0ED31,43,1
1,"E,strand",True,W,G,3ngb,PF00516,45,W,Q0ED31,44,2
2,"E,strand",True,K,G,3ngb,PF00516,46,K,Q0ED31,45,3
3,"E,strand",True,D,G,3ngb,PF00516,47,D,Q0ED31,46,4
4,"T,loop",True,A,G,3ngb,PF00516,48,A,Q0ED31,47,5


In [16]:
sifts_df.dtypes

comments            object
is_observed           bool
pdb_aa              object
pdb_chain           object
pdb_id              object
pfam_id             object
resnum              object
uniprot_aa          object
uniprot_id          object
uniprot_position    object
residx               int64
dtype: object

In [17]:
def get_sifts_data(pdb_chain_mutations):
    pdb_chain, pdb_mutation = pdb_chain_mutations.split('_')
    row = (
        sifts_df[
            (sifts_df['pdb_chain'] == pdb_chain) &
            (sifts_df['pdb_aa'] == pdb_mutation[0]) &
            (sifts_df['resnum'] == pdb_mutation[1:-1])
        ]
    )
    if row.shape[0] == 0:
        print("Could not convert '{}'".format(pdb_chain_mutations))
        return np.nan, np.nan, np.nan
    elif row.shape[0] > 1:
        print("Too many rows returned!")
        print(row)
    row = row.iloc[0]
    if row['pdb_aa'] != row['uniprot_aa']:
        print("Warning! PDB and UniProt do not match!")
        print(row)
    uniprot_id = row['uniprot_id']
    uniprot_mutation = row['uniprot_aa'] + row['uniprot_position'] + pdb_mutation[-1]
    pfam_id = row['pfam_id']
    return uniprot_id, uniprot_mutation, pfam_id


# get_sifts_data('G_E87A')
assert get_sifts_data('G_E87A') == ('Q0ED31', 'E86A', 'PF00516')

In [18]:
DF.shape

(119, 7)

In [19]:
DF['uniprot_id'], DF['uniprot_mutation'], DF['pfam_id'] = list(zip(*(
    DF['pdb_mutation'].apply(get_sifts_data)
)))

Could not convert 'G_L125A'
Could not convert 'G_V127A'
Could not convert 'G_N156A'
Could not convert 'G_N160K'
Could not convert 'G_T162A'
Could not convert 'G_I165K'
Could not convert 'G_I165A'
Could not convert 'G_R166A'
Could not convert 'G_D167N'
Could not convert 'G_K171A'
Could not convert 'G_E172A'
Could not convert 'G_F176A'
Could not convert 'G_Y177A'
Could not convert 'G_L179A'
Could not convert 'G_D180A'
Could not convert 'G_V182A'
Could not convert 'G_I184A'
Could not convert 'G_D185A'
Could not convert 'G_T190A'
Could not convert 'G_N197A'
Could not convert 'G_N197K'
Could not convert 'G_N197T'
Could not convert 'G_T198A'
Could not convert 'G_T202A'
Could not convert 'G_R252A'
Could not convert 'G_R253A'
Could not convert 'G_D279A'
Could not convert 'G_N302A'
Could not convert 'G_R304A'
Could not convert 'G_K305A'
Could not convert 'G_S306A'
Could not convert 'G_I307A'
Could not convert 'G_H308A'
Could not convert 'G_I309A'
Could not convert 'G_P313A'
Could not convert 'G

In [20]:
DF.dropna(subset=['pfam_id']).shape

(57, 10)

In [21]:
DF2_bak = DF.copy()

## Pfam clan (DF3)

In [22]:
DF = DF2_bak.copy()

In [23]:
pfam_a_clans = (
    pd.read_sql_table('pfam_a_clans', db_remote.engine, schema='pfam')
)

In [24]:
pfam_a_clans.head()

Unnamed: 0,pfam_id,clan_id,clan_name,pfam_name,pfam_description
0,PF00389,CL0325,Form_Glyc_dh,2-Hacid_dh,"D-isomer specific 2-hydroxyacid dehydrogenase,..."
1,PF00198,CL0149,CoA-acyltrans,2-oxoacid_dh,2-oxoacid dehydrogenases acyltransferase (cata...
2,PF04029,,,2-ph_phosp,2-phosphosulpholactate phosphatase
3,PF03171,CL0029,Cupin,2OG-FeII_Oxy,2OG-Fe(II) oxygenase superfamily
4,PF01073,CL0063,NADP_Rossmann,3Beta_HSD,3-beta hydroxysteroid dehydrogenase/isomerase ...


In [25]:
pfam_a_clans[pfam_a_clans['pfam_id'] == 'PF00516']

Unnamed: 0,pfam_id,clan_id,clan_name,pfam_name,pfam_description
2669,PF00516,,,GP120,Envelope glycoprotein GP120


In [26]:
DF['pfam_clan'] = DF['pfam_id'].map(pfam_a_clans.set_index('pfam_id')['clan_id'])

In [27]:
DF.head()

Unnamed: 0,mutation,score,pdb_id,pdb_chain,partner_pdb_chain,pdb_mutation,ddg_exp,uniprot_id,uniprot_mutation,pfam_id,pfam_clan
0,E87A,98.0,3ngb,G,,G_E87A,1.991226,Q0ED31,E86A,PF00516,
1,M95A,82.0,3ngb,G,,G_M95A,1.913814,Q0ED31,M94A,PF00516,
2,K97A,88.0,3ngb,G,,G_K97A,1.944483,Q0ED31,K96A,PF00516,
3,E102A,100.0,3ngb,G,,G_E102A,2.0,Q0ED31,E101A,PF00516,
4,W112A,105.0,3ngb,G,,G_W112A,2.021189,Q0ED31,W111A,PF00516,


In [28]:
DF['pfam_clan'].notnull().sum()

0

In [29]:
DF3_bak = DF.copy()

## Partner chain (DF4)

In [30]:
DF = DF3_bak.copy()

In [31]:
def get_partner_uniprot(partner_chian, sifts_df):
    sifts_df = sifts_df[sifts_df['pdb_chain'] == partner_chian]
    partner_uniprot_ids = sifts_df['uniprot_id'].dropna().drop_duplicates().tolist()
    if len(partner_uniprot_ids) == 0:
        return np.nan
    elif len(partner_uniprot_ids) == 1:
        return partner_uniprot_ids[0]
    else:
        raise Exception(partner_uniprot_ids)

In [32]:
sifts_dfs['3ngb'].head()

NameError: name 'sifts_dfs' is not defined

In [None]:
get_partner_uniprot('A', sifts_dfs['3ngb'])

In [None]:
DF['partner_uniprot_id'] = [
    get_partner_uniprot(partner_chain, sifts_dfs[pdb_id])
    for pdb_id, partner_chain
    in DF[['pdb_id', 'partner_pdb_chain']].values
]

In [None]:
DF4_bak = DF.copy()

## Summary

In [None]:
print2("Number of rows:", DF.shape[0])
print('-' * 80)

print2("Number of missing uniprots:", DF['uniprot_id'].isnull().sum())
print2("Number of missing mutations:", DF['uniprot_mutation'].isnull().sum())
print2("Number of missing uniprots mutations:", 
       DF[['uniprot_id', 'uniprot_mutation']].isnull().any(axis=1).sum())
print('-' * 80)

print2("Number of missing partner uniprots:", DF['partner_uniprot_id'].isnull().sum())
print2("Number of missing partner uniprot mutations:", 
       DF[['uniprot_id', 'partner_uniprot_id', 'uniprot_mutation']].isnull().any(axis=1).sum())
print('-' * 80)

print2("Number of missing pfams:", DF['pfam_id'].isnull().sum())

# Save to database

In [None]:
DF = DF4_bak.copy()

In [None]:
DF.head()

In [None]:
columns = [
    'uniprot_id', 'partner_uniprot_id', 'uniprot_mutation',
    'pdb_id', 'pdb_chain', 'partner_pdb_chain', 'pdb_mutation',
    
]

In [None]:
DF[columns].head()

In [None]:
DF[DF['partner_pdb_chain'].isnull()].shape

In [None]:
groupby_columns = [
    'pdb_id', 'pdb_chain', 'pdb_mutation',
]
extra_columns = [
    'partner_pdb_chain',
    'uniprot_id', 'partner_uniprot_id', 'uniprot_mutation', 'pfam_id', 'pfam_clan',
]
data_columns = [
    'ddg_exp'
]

In [None]:
# Average over duplicate mutations
df = (
    DF
    # .dropna(subset=['pdb_id', 'pdb_chain', 'partner_pdb_chain', 'pdb_mutation'])
    .groupby(groupby_columns)
    .agg({**{c: lambda x: x.iloc[0] for c in extra_columns}, **{c: 'mean' for c in data_columns}})
    .reset_index()
)

In [None]:
print2('Unique mutations affecting only 1 chain:', df.shape[0])

In [None]:
t = db.import_df(
    df[groupby_columns + extra_columns + data_columns], 
    NOTEBOOK_NAME)

In [None]:
t.create_indexes([
    (['pdb_id', 'pdb_chain', 'partner_pdb_chain', 'pdb_mutation'], True),
    (['uniprot_id', 'partner_uniprot_id', 'uniprot_mutation'], False),
])

In [None]:
t.add_idx_column()

In [None]:
!ls -lh /home/kimlab1/database_data/biodb/recipes/protein_interaction_energy/notebooks/mysqld/protein_interaction_energy/ab_bind.*

In [None]:
t.compress()

In [None]:
!ls -lh /home/kimlab1/database_data/biodb/recipes/protein_interaction_energy/notebooks/mysqld/protein_interaction_energy/ab_bind.*

In [None]:
print(datetime.datetime.now())