# Parse a simplified version of the Kanshin2015 phosphorylation data

In [1]:
import pandas as pd
import numpy as np

## Load the file, check the structure of the simplified version

In [2]:
# Read the tab-delimited phosphorylation table into a DataFrame.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the
# file carries a saved index column; passing index_col=0 would absorb it —
# left unchanged here, confirm whether later cells rely on that column.
phospho_df = pd.read_csv('simplified_phospho.tsv', delimiter='\t')

# Preview the first rows to check the column layout.
phospho_df.head()

Unnamed: 0,id,Modified Peptide Sequence,PhosphoGroups,Significance,pSite,T00,T05,T10,T15,T20
0,18075,NVVDENLINDMDS(ph)EDAHK,1,,S618,-0.041432,0.014212,0.064607,0.132248,0.131194
1,18058,ADDEEDLS(ph)DENIQPELR,1,,S720,-0.004523,-0.020311,0.022332,0.002595,0.031819
2,18078,SNS(ph)IDYAK,1,,S467,-0.095188,-0.039221,-0.043705,-0.042115,-0.07733
3,18077,QHLSDIT(ph)LEER,1,,T633,-0.054797,0.098554,0.041944,-0.105411,0.020769
4,18068,GLDDESGPTHGNDS(ph)GNHR,1,,S215,-0.113489,-0.100685,-0.020019,-0.034352,-0.081125


In [3]:
# Summary statistics for the numeric columns; a per-column `count` lower than
# the others flags a column that contains missing values.
phospho_df.describe()

Unnamed: 0,id,PhosphoGroups,T00,T05,T10,T15,T20
count,14.0,14,14.0,14.0,14.0,14.0,13.0
mean,14025.0,1,-0.016738,0.036592,0.080097,0.059421,0.039683
std,4199.044836,0,0.090905,0.138889,0.174544,0.162434,0.142461
min,9948.0,1,-0.134129,-0.100685,-0.045327,-0.105411,-0.081125
25%,9983.5,1,-0.083508,-0.034494,-0.018282,-0.01058,-0.024663
50%,14025.5,1,-0.029994,0.014498,0.021693,0.020388,0.016924
75%,18072.25,1,0.02399,0.067527,0.063744,0.069619,0.031819
max,18078.0,1,0.221692,0.469573,0.556307,0.58246,0.470823


## Check that the column has dtype float64 to confirm the file was parsed correctly
Note that the column contains a NaN; rows with NaN values can be selected if needed.

In [4]:
# Display the T20 column as a Series so its dtype is shown at the bottom.
phospho_df.loc[:, "T20"]

0     0.131194
1     0.031819
2    -0.077330
3     0.020769
4    -0.081125
5    -0.062625
6          NaN
7     0.470823
8    -0.024663
9     0.001154
10    0.023894
11    0.075053
12   -0.010004
13    0.016924
Name: T20, dtype: float64

In [5]:
# Boolean mask that is True where T20 is missing, then keep only those rows.
t20_is_missing = phospho_df["T20"].isnull()
phospho_df.loc[t20_is_missing]

Unnamed: 0,id,Modified Peptide Sequence,PhosphoGroups,Significance,pSite,T00,T05,T10,T15,T20
6,18073,LLS(ph)SHLK,1,,S655,0.221692,0.07738,0.388686,0.098824,


In [6]:
# Confirm programmatically that the T20 value in the row labelled 6 is NaN.
# A single .loc[row, column] lookup avoids chained indexing, and print() as a
# function call runs on both Python 2 and Python 3 with identical output.
if np.isnan(phospho_df.loc[6, "T20"]):
    print("Found NaN")

Found NaN


## Grab a block of columns or the column names

In [7]:
# Label-based column slice: .loc includes BOTH endpoints, so this returns every
# column from 'T00' through 'T20' in their order within the frame.
phospho_df.loc[:,'T00':'T20']

Unnamed: 0,T00,T05,T10,T15,T20
0,-0.041432,0.014212,0.064607,0.132248,0.131194
1,-0.004523,-0.020311,0.022332,0.002595,0.031819
2,-0.095188,-0.039221,-0.043705,-0.042115,-0.07733
3,-0.054797,0.098554,0.041944,-0.105411,0.020769
4,-0.113489,-0.100685,-0.020019,-0.034352,-0.081125
5,-0.093079,-0.057932,-0.013072,0.058247,-0.062625
6,0.221692,0.07738,0.388686,0.098824,
7,-0.134129,0.469573,0.556307,0.58246,0.470823
8,-0.028131,-0.014631,-0.027719,-0.011952,-0.024663
9,-0.031858,0.073957,-0.045327,0.043624,0.001154


In [8]:
# List the names of the columns from 'T00' through 'T20', one per line.
# Iterating the column Index directly yields the same labels as `.values`;
# print() as a function call works on both Python 2 and Python 3.
for colname in phospho_df.loc[:, 'T00':'T20'].columns:
    print(colname)

T00
T05
T10
T15
T20


## Create new columns that take the absolute value of the log-scale fold change
For each time-point column, take the element-wise absolute value of the log fold change and store it in a new column named `<column>_ABS`.

In [9]:
# Add one '<column>_ABS' column per column from 'T00' through 'T20'.
# Series.abs() is vectorised and keeps NaN as NaN; it gives the same values as
# a row-wise apply(lambda ...) without a Python-level call for every row.
# Re-running this cell is safe: the label slice stops at 'T20', so the
# '_ABS' columns appended after it are never picked up as inputs.
for colname in phospho_df.loc[:, 'T00':'T20'].columns:
    new_col = colname + '_ABS'
    phospho_df[new_col] = phospho_df[colname].abs()

In [10]:
# Preview the frame again to verify that the new '_ABS' columns were appended.
phospho_df.head()

Unnamed: 0,id,Modified Peptide Sequence,PhosphoGroups,Significance,pSite,T00,T05,T10,T15,T20,T00_ABS,T05_ABS,T10_ABS,T15_ABS,T20_ABS
0,18075,NVVDENLINDMDS(ph)EDAHK,1,,S618,-0.041432,0.014212,0.064607,0.132248,0.131194,0.041432,0.014212,0.064607,0.132248,0.131194
1,18058,ADDEEDLS(ph)DENIQPELR,1,,S720,-0.004523,-0.020311,0.022332,0.002595,0.031819,0.004523,0.020311,0.022332,0.002595,0.031819
2,18078,SNS(ph)IDYAK,1,,S467,-0.095188,-0.039221,-0.043705,-0.042115,-0.07733,0.095188,0.039221,0.043705,0.042115,0.07733
3,18077,QHLSDIT(ph)LEER,1,,T633,-0.054797,0.098554,0.041944,-0.105411,0.020769,0.054797,0.098554,0.041944,0.105411,0.020769
4,18068,GLDDESGPTHGNDS(ph)GNHR,1,,S215,-0.113489,-0.100685,-0.020019,-0.034352,-0.081125,0.113489,0.100685,0.020019,0.034352,0.081125


## Now create another new column that takes the min over the 'ABS' columns
Example in FilterNetworkExpression122915.html