# Evaluate the predicted targets of Rck2 and Cdc28 - June 22, 2018

In [1]:
#%pylab inline

import os
#import seaborn as sns
import pandas as pd
import numpy as np

# Data root, two directory levels above the notebook's working directory
data_dir = os.path.join(os.pardir, os.pardir, 'data')
# Evaluation inputs are kept in a subdirectory of the data root
eval_dir = os.path.join(data_dir, 'evaluation')

# Romanov Table S2 workbook (read below as the Rck2 target data)
rck2_file = os.path.join(eval_dir, 'TableS2_Romanov.xlsx')
# Tab-separated table used to map gene symbols to ORF ids
id_file = os.path.join(data_dir, 'SGD_id_map.txt')

## Load and format the Rck2 target data

Only load the following columns:
- **0**: Protein name
- **1**: PhosphoSite
- **16**: SETUP SR wt +/- 5min 0.5M NaCl Average L/H-ratio
- **71**: SETUP rck2 wt vs. rck2Δ + 5min 0.5M NaCl Average L/H-ratio

The first three rows of the spreadsheet are skipped because the Δ character in the header does not parse correctly; the column names are supplied manually instead.

Romanov Table S6 defines the strains:
- **WR209**: W303, Mat a; lys1::kanMX arg4::kanMX CAN1 leu2-3,112 trp1-1 ura3-1 ade2-1 his3-11,15
- **MJ243**: W303, Mat a; rck2::HIS3 lys1::kanMX arg4::kanMX CAN1 leu2-3,112 trp1-1 ura3-1 ade2-1 his3-11,23

Romanov Table S8 defines the ratios:
- **SETUP SR wt +/- 5min 0.5M NaCl Average L/H-ratio**: 0.5M NaCl for 5mins / no stress
- **SETUP rck2 wt vs. rck2Δ + 5min 0.5M NaCl Average L/H-ratio**: MJ243 +5mins 0.5M NaCl / WR209 +5mins 0.5M NaCl

In [2]:
# Names given to the four spreadsheet columns that are kept (0, 1, 16 and 71)
headers = ['Protein', 'Phosphosite', 'WT 5min L/H ratio', 'rck2 del 5min L/H ratio']
# NOTE(review): newer pandas releases replace `parse_cols` with `usecols` —
# confirm the installed pandas version before re-running this cell.
rck2_df = pd.read_excel(
    rck2_file,
    header=None,
    skiprows=[0, 1, 2],
    parse_cols=[0, 1, 16, 71],
    names=headers
)

# Guard against a changed input file: total row count and the number of
# missing measurements in each condition must match the known values
assert len(rck2_df) == 16413
assert rck2_df['WT 5min L/H ratio'].isnull().sum() == 8356
assert rck2_df['rck2 del 5min L/H ratio'].isnull().sum() == 11579

# Keep only the rows that were measured in both conditions
rck2_df = rck2_df.dropna()
assert len(rck2_df) == 2794

# Spot check individual sites: (protein, site, wild type ratio, rck2 del ratio)
for protein, site, wt_ratio, del_ratio in [('ABP140', '326(S)', 1.21, 1.00),
                                           ('ZUO1', '50(S)', 0.87, 1.01)]:
    match = rck2_df[(rck2_df['Protein'] == protein) & (rck2_df['Phosphosite'] == site)]
    assert match['WT 5min L/H ratio'].values[0] == wt_ratio
    assert match['rck2 del 5min L/H ratio'].values[0] == del_ratio

## Filter the Rck2 target data
- Must have >= 2 fold change in wild type when stimulated with NaCl
- Must have a difference of at least 1.5 between the two ratios: (wild type response) - (rck2 del / wild type) >= 1.5

We also manually noted that PIK1 396(S) has a phospho defect in rck2 deletion.  It significantly increases in phosphorylation in the Kanshin study, is not observed in the wild type condition in the Romanov study, and has (rck2 del)/(wild type)=0.29 in the Romanov study.

In [3]:
# Keep sites whose wild type L/H ratio is at least 2 after NaCl treatment.
# The explicit copy makes the filtered frame independent of its parent, so the
# column assignment below does not raise pandas' SettingWithCopyWarning.
rck2_df = rck2_df[rck2_df['WT 5min L/H ratio'] >= 2].copy()
assert len(rck2_df) == 408
# Gap between the wild type response and the (rck2 del / wild type) ratio
rck2_df['Difference'] = rck2_df['WT 5min L/H ratio'] - rck2_df['rck2 del 5min L/H ratio']

# Keep sites where that gap is at least 1.5
rck2_df = rck2_df[rck2_df['Difference'] >= 1.5]
assert len(rck2_df) == 326

In [4]:
# Count the retained sites whose (rck2 del / wild type) ratio is at least 1.0,
# i.e. sites that passed the filter without being lower in the deletion strain
no_rck2_decrease = sum(rck2_df['rck2 del 5min L/H ratio'] >= 1.0)
print('{} "rck2 defective phosphosites" do not decrease in phosphorylation when rck2 is deleted'.format(no_rck2_decrease))

72 "rck2 defective phosphosites" do not decrease in phosphorylation when rck2 is delected


## Format Rck2 target phosphosites
- Split di- and tri-phosphorylated peptides to match individual sites
- Map gene symbols to ORF identifiers

In [5]:
# Format the Romanov phosphosite to match the Kanshin phosphosite, including splitting sites
def format_site(site):
    """Convert a single site such as '312(S)' into the form 'S312'.

    Args:
        site: Position followed by the residue in parentheses, e.g. '312(S)'.
            Leading and trailing whitespace is ignored.

    Returns:
        The residue followed by the position, e.g. 'S312'.

    Raises:
        IndexError: If `site` contains no '(' separator.
    """
    # Strip whitespace first so a trailing space cannot shield the ')'
    site = site.strip().rstrip(')')
    tokens = site.split('(')
    return tokens[1] + tokens[0]


def format_sites(sites):
    """Split a comma-separated site string and format each site.

    Args:
        sites: One or more sites separated by commas, e.g. '233(T),236(S)'.

    Returns:
        A list with one formatted site per input site, e.g. ['T233', 'S236'].
        A list (rather than the result of `map`) is returned so the value
        prints and compares the same way on Python 2 and Python 3.
    """
    return [format_site(site) for site in sites.split(',')]

In [6]:
# Try the formatter on a single-site and a two-site example
for example in ('312(S)', '233(T),236(S)'):
    print(format_sites(example))

['S312']
['T233', 'S236']


In [7]:
# Load the two-column (ORF, gene symbol) table; the file has no header row
id_df = pd.read_csv(id_file, sep='\t', header=None, names=['ORF', 'Symbol'])
# The table must have the expected size with no repeated ORF or symbol
assert len(id_df) == 8061
for column in ('ORF', 'Symbol'):
    assert len(id_df[column].unique()) == 8061

# Symbol -> ORF lookup, extended so that an ORF id also resolves to itself;
# the identity entries are applied last and therefore take precedence
orf_orf_map = {orf: orf for orf in id_df['ORF']}
id_map = {symbol: orf for symbol, orf in zip(id_df['Symbol'], id_df['ORF'])}
id_map.update(orf_orf_map)

print('{} rck2 defective phosphosites before splitting di- and tri-phosphorylated'.format(len(rck2_df)))

# Collect one 'ORF::site' identifier per individual phosphosite; a set is
# used so a site reported by several peptides is only counted once
rck2_def_sites = set()
for protein, phosphosite in zip(rck2_df['Protein'], rck2_df['Phosphosite']):
    orf = id_map[str(protein)]
    rck2_def_sites.update(orf + '::' + site for site in format_sites(phosphosite))

print('{} rck2 defective phosphosites after splitting di- and tri-phosphorylated'.format(len(rck2_def_sites)))

326 rck2 defective phosphosites before splitting di- and tri-phosphorylated
341 rck2 defective phosphosites after splitting di- and tri-phosphorylated


In [8]:
# Write the filtered table to 'tmp.csv' in the current working directory
# NOTE(review): looks like a scratch dump for manual inspection — confirm it is still needed
rck2_df.to_csv('tmp.csv')

In [9]:
# Display the full set of 'ORF::site' identifiers
rck2_def_sites

{u'YAL031C::S462',
 u'YAL035W::S257',
 u'YAR002W::S222',
 u'YAR002W::T451',
 u'YBL007C::S799',
 u'YBL016W::Y182',
 u'YBL047C::S1191',
 u'YBL047C::S1194',
 u'YBL047C::S249',
 u'YBL047C::S931',
 u'YBL047C::T1307',
 u'YBL051C::T512',
 u'YBL054W::S341',
 u'YBL061C::S32',
 u'YBL085W::S104',
 u'YBL085W::S590',
 u'YBL085W::S648',
 u'YBL101C::S16',
 u'YBL103C::S269',
 u'YBL103C::T150',
 u'YBR007C::S205',
 u'YBR059C::S10',
 u'YBR059C::S12',
 u'YBR059C::S511',
 u'YBR068C::S3',
 u'YBR073W::S85',
 u'YBR086C::S729',
 u'YBR108W::S476',
 u'YBR108W::S915',
 u'YBR108W::T471',
 u'YBR172C::S84',
 u'YBR172C::S96',
 u'YBR172C::T129',
 u'YBR172C::T70',
 u'YBR200W::S461',
 u'YBR200W::T26',
 u'YBR215W::S128',
 u'YBR225W::S531',
 u'YBR273C::S388',
 u'YCL005W::S241',
 u'YCL014W::S1110',
 u'YCL024W::S841',
 u'YCL025C::S87',
 u'YCL029C::S163',
 u'YCL032W::T244',
 u'YDL122W::S670',
 u'YDL146W::S463',
 u'YDL146W::S466',
 u'YDL153C::S533',
 u'YDL173W::S36',
 u'YDL173W::S39',
 u'YDL193W::S60',
 u'YDL195W::S836',
 u'Y

## Log the Python environment

In [10]:
# Record the conda packages installed in the environment that ran this notebook
! conda list

# packages in environment at C:\Program Files\Anaconda:
#
_license                  1.1                      py27_0    <unknown>
anaconda                  2.1.0                np19py27_0    <unknown>
argcomplete               0.8.1                    py27_0    <unknown>
asn1crypto                0.22.0                   py27_0  
astropy                   0.4.2                np19py27_0    <unknown>
atom                      0.3.9                    py27_0    <unknown>
beautiful-soup            4.3.2                    py27_0    <unknown>
binstar                   0.7.1                    py27_0    <unknown>
bitarray                  0.8.1                    py27_1    <unknown>
blaze                     0.6.3                np19py27_0    <unknown>
blz                       0.6.2                np19py27_0    <unknown>
bokeh                     0.6.1                np19py27_0    <unknown>
boto                      2.32.1                   py27_0    <unknown>
brewer2mpl                1.4.