Uses the thresholds provided from the original data to determine if proteins are significantly phosphorylated at each time point.

Input: - mmc2.xlsx - original dataset with time series
       - FC_thresholds.xlsx - original dataset with thresholds for each time point
       - result_optimalForest.sif - no longer necessary; the code needs refactoring
Output: 
    - firstScores.tsv - first scores file, comparing each time point to the first time point
    - prevScores.tsv - scores comparing each time point to the previous time point
    - network.tsv - believed to be an unnecessary file; the code needs refactoring (NOW REMOVED)
    - timeSeries.tsv - file with the proteins and time series phosphorylation data
    - finalOptimalForest.sif - unnecessary file; the code needs refactoring (NOW REMOVED)
    - peptideMap.tsv - network input file with only sequences and identification

# Load in libraries and files

In [2]:
import os.path
import matplotlib.pyplot as plt
import pandas as pd

# Directory that holds the input spreadsheets and receives the output files.
# Set the APPLY_THRESHOLDS_DIR environment variable to run the notebook on
# another machine; the default is the original author's location.
# os.path.join(..., '') guarantees a trailing separator, because later cells
# build their output paths as FILEPATH + filename.
FILEPATH = os.path.join(
    os.environ.get(
        'APPLY_THRESHOLDS_DIR',
        r'/home/dylan/Documents/HDD/Wisconsin/osmotic-stress/Notebooks/ApplyThresholds/'),
    '')

temporalPhospho = os.path.join(FILEPATH, 'mmc2.xlsx')
thresholdPath = os.path.join(FILEPATH, 'FC_thresholds.xlsx')

# header=1: the column names are taken from the second row of each sheet.
df = pd.read_excel(temporalPhospho, header=1)
dfThresh = pd.read_excel(thresholdPath, header=1)

df = df.rename(columns={'Modified Peptide Sequence': 'ModifiedPeptideSeq'})
# NOTE: 'Threshold fo rphosphorylation' is spelled exactly as written on
# purpose. The displayed table shows the renamed column, so the key matches
# the spreadsheet header as it is; "fixing" the spelling would silently skip
# the rename.
dfThresh = dfThresh.rename(columns={
    'Threshold fo rphosphorylation': 'PhosphorylationThresh',
    'Threshold for dephosphorylation': 'dephosphorylationThresh',
})
dfThresh

Unnamed: 0,TimePoint,PhosphorylationThresh,dephosphorylationThresh
0,T00,0.396707,-0.377501
1,T05,0.370611,-0.400644
2,T10,0.333767,-0.374
3,T15,0.340391,-0.356125
4,T20,0.387142,-0.36857
5,T25,0.35682,-0.406866
6,T30,0.438825,-0.489524
7,T35,0.496411,-0.52048
8,T40,0.40294,-0.419063
9,T45,0.339707,-0.354889


# Concatenate Modified Peptide Sequence and the id together

In [3]:
# Build the peptide label as "<modified sequence>-<id>".
# NOTE: this is not idempotent - re-running the cell appends the id again.
df['ModifiedPeptideSeq'] = df.ModifiedPeptideSeq + "-" + df.id.map(str)

# Show T00 before filling so the missing (NaN) rows are visible.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(df['T00'])

# Missing T00 values become 0; every other missing cell becomes an empty
# string. T00 must be filled first, otherwise it would receive '' as well.
df['T00'] = df['T00'].fillna(0)
df = df.fillna('')

df['T00']

0      -0.041432
1      -0.004523
2      -0.095188
3      -0.054797
4      -0.113489
5      -0.093079
6       0.221692
7      -0.134129
8      -0.028131
9      -0.031858
10      0.048655
11      0.060462
12      0.032242
13     -0.000765
14      0.002018
15     -0.109079
16      0.168514
17     -0.027013
18     -0.083065
19     -0.459794
20      0.031113
21      0.814755
22     -0.000823
23      0.146655
24     -0.019945
25      0.039419
26      0.015926
27      0.068464
28      0.019915
29      0.010923
          ...   
4307    0.013641
4308    0.114234
4309   -0.120153
4310    0.001010
4311    0.008487
4312   -0.006391
4313    0.198368
4314    0.157820
4315    0.035342
4316    0.087191
4317    0.013927
4318   -0.008682
4319    0.409581
4320    0.049491
4321         NaN
4322    0.272620
4323   -0.074775
4324   -0.030885
4325    0.177216
4326   -0.060743
4327    0.015069
4328         NaN
4329         NaN
4330         NaN
4331    0.044324
4332    0.206143
4333   -0.149249
4334   -0.0626

0      -0.041432
1      -0.004523
2      -0.095188
3      -0.054797
4      -0.113489
5      -0.093079
6       0.221692
7      -0.134129
8      -0.028131
9      -0.031858
10      0.048655
11      0.060462
12      0.032242
13     -0.000765
14      0.002018
15     -0.109079
16      0.168514
17     -0.027013
18     -0.083065
19     -0.459794
20      0.031113
21      0.814755
22     -0.000823
23      0.146655
24     -0.019945
25      0.039419
26      0.015926
27      0.068464
28      0.019915
29      0.010923
          ...   
4307    0.013641
4308    0.114234
4309   -0.120153
4310    0.001010
4311    0.008487
4312   -0.006391
4313    0.198368
4314    0.157820
4315    0.035342
4316    0.087191
4317    0.013927
4318   -0.008682
4319    0.409581
4320    0.049491
4321    0.000000
4322    0.272620
4323   -0.074775
4324   -0.030885
4325    0.177216
4326   -0.060743
4327    0.015069
4328    0.000000
4329    0.000000
4330    0.000000
4331    0.044324
4332    0.206143
4333   -0.149249
4334   -0.0626

# Obtain just the relevant columns for the data

In [4]:
## Can drop T00
# Positional slice of the fold-change columns (T00-T60). .iloc replaces the
# removed .ix indexer; with integer bounds .ix was positional here as well.
FoldData = df.iloc[:, 15:28]

# Keep only the peptide label and ORF, then re-attach the fold-change columns.
df = pd.concat([df[['ModifiedPeptideSeq', 'ORF']], FoldData], axis=1)

# Needed for the input file with the sequence and time series data.
# drop() returns a new frame, so df itself keeps its ORF column.
dfTimeSeries = df.drop('ORF', axis=1)
dfTimeSeries

Unnamed: 0,ModifiedPeptideSeq,T00,T05,T10,T15,T20,T25,T30,T35,T40,T45,T50,T55,T60
0,NVVDENLINDMDS(ph)EDAHK-18075,-0.041432,0.0142124,0.064607,0.132248,0.131194,0.0641931,-0.134604,0.0251706,0.0827026,0.161694,-0.113489,0.093425,-0.0712393
1,ADDEEDLS(ph)DENIQPELR-18058,-0.004523,-0.0203112,0.0223323,0.00259452,0.0318186,0.00690837,0.0517201,0.0237521,0.0336521,0.00518438,0.0600474,0.033793,0.0237521
2,SNS(ph)IDYAK-18078,-0.095188,-0.0392214,-0.0437054,-0.0421151,-0.0773302,-0.0671829,-0.135412,-0.14618,-0.135,-0.192249,-0.129402,-0.224857,-0.226341
3,QHLSDIT(ph)LEER-18077,-0.054797,0.0985543,0.0419438,-0.105411,0.0207689,-0.0346331,-0.0044793,-0.134588,0.0848807,-0.0449551,-0.0752312,-0.0762651,-0.082194
4,GLDDESGPTHGNDS(ph)GNHR-18068,-0.113489,-0.100685,-0.0200186,-0.0343524,-0.0811253,-0.165159,-0.0460569,-0.00390054,0.0803848,-0.117521,0.0619835,-0.0232551,-0.00526099
5,LDFVTDDLEYHLANTHPDDT(ph)NDKVESR-18070,-0.093079,-0.0579322,-0.0130722,0.0582472,-0.0626255,0.0184917,0.068602,-0.00589824,-0.206628,0.0694272,0.086512,-0.235875,
6,LLS(ph)SHLK-18073,0.221692,0.0773797,0.388686,0.0988238,,0.0255958,-0.042858,,-0.0361264,0.381283,-0.111337,-0.09068,-0.179825
7,S(ph)KFEGLAASEKEEEEMENK-9982,-0.134129,0.469573,0.556307,0.58246,0.470823,0.642286,0.709467,1.01314,0.770533,0.695727,0.815411,0.857663,0.559247
8,TPS(ph)PAPAAK-9993,-0.028131,-0.0146307,-0.0277191,-0.0119516,-0.0246633,-0.0117043,-0.0232258,-0.00439247,-0.0174901,-0.00366911,-0.0221999,-0.023827,-0.0124463
9,NEKS(ph)PAQLWAER-9974,-0.031858,0.0739573,-0.0453272,0.0436245,0.00115369,-0.130618,0.108892,0.0433445,0.0957219,0.0655723,-0.00523203,0.126973,0.047678


In [5]:
# Display T00 again after the column selection to confirm it is unchanged.
# The two statements below are disabled: they would fill missing values with
# 'NA' and then drop the rows whose T00 is 'NA'.
#df = df.fillna('NA')

#df = df[df.T00 != 'NA']

df['T00']

0      -0.041432
1      -0.004523
2      -0.095188
3      -0.054797
4      -0.113489
5      -0.093079
6       0.221692
7      -0.134129
8      -0.028131
9      -0.031858
10      0.048655
11      0.060462
12      0.032242
13     -0.000765
14      0.002018
15     -0.109079
16      0.168514
17     -0.027013
18     -0.083065
19     -0.459794
20      0.031113
21      0.814755
22     -0.000823
23      0.146655
24     -0.019945
25      0.039419
26      0.015926
27      0.068464
28      0.019915
29      0.010923
          ...   
4307    0.013641
4308    0.114234
4309   -0.120153
4310    0.001010
4311    0.008487
4312   -0.006391
4313    0.198368
4314    0.157820
4315    0.035342
4316    0.087191
4317    0.013927
4318   -0.008682
4319    0.409581
4320    0.049491
4321    0.000000
4322    0.272620
4323   -0.074775
4324   -0.030885
4325    0.177216
4326   -0.060743
4327    0.015069
4328    0.000000
4329    0.000000
4330    0.000000
4331    0.044324
4332    0.206143
4333   -0.149249
4334   -0.0626

# Helper function to determine 0's and 1's based on threshold

In [6]:
# Helper function
def threshold(fc_value, fc_min, fc_max, na_score=0):
    """Score one fold-change value against a time point's thresholds.

    Parameters
    ----------
    fc_value : fold change for one peptide at one time point. May be missing
        (None, NaN, '' or any other value that cannot be read as a number).
    fc_min : lower bound (the dephosphorylation threshold).
    fc_max : upper bound (the phosphorylation threshold).
    na_score : score returned for a missing fc_value. The default of 0
        reproduces the scores this notebook has been producing for blank
        cells; pass 1 to treat missing measurements as "within thresholds".

    Returns
    -------
    0 if fc_value lies strictly outside [fc_min, fc_max], 1 if it lies within
    the bounds (inclusive), and na_score if fc_value is missing.
    """
    # Blank cells reach this function as '' (the frame is filled with empty
    # strings). Comparing a string with a number is arbitrary on Python 2 and
    # a TypeError on Python 3, so missing values are handled explicitly.
    try:
        value = float(fc_value)
    except (TypeError, ValueError):
        return na_score
    if value != value:  # NaN is the only float that differs from itself
        return na_score

    if value > fc_max or value < fc_min:
        return 0
    return 1

# TODO: decide how missing (NA) values should be scored; see na_score above.


# Loop through the values for different time points and compare to threshold

In [7]:
# Loop over time points and replace each fold-change column with its 0/1 score.

# Each time point must appear once; a repeated entry would re-score a column
# that already holds 0/1 values.
if dfThresh['TimePoint'].duplicated().any():
    raise ValueError("dfThresh contains duplicate TimePoint entries")

for time, fc_min, fc_max in zip(dfThresh['TimePoint'],
                                dfThresh['dephosphorylationThresh'],
                                dfThresh['PhosphorylationThresh']):
    # Pass plain floats rather than one-element arrays, and score only the
    # column being replaced instead of applying row-wise over the whole frame.
    df[time] = df[time].apply(threshold, args=(float(fc_min), float(fc_max)))



# Drop ORF column

In [8]:
# Network input file needs only the sequence and identification columns.
# Label slice from 'ModifiedPeptideSeq' through 'ORF' (inclusive); .loc
# replaces the removed .ix indexer.
dfPeptideMap = df.loc[:, 'ModifiedPeptideSeq':'ORF']

# ORF is not part of the score files.
df = df.drop('ORF', axis=1)
df

Unnamed: 0,ModifiedPeptideSeq,T00,T05,T10,T15,T20,T25,T30,T35,T40,T45,T50,T55,T60
0,NVVDENLINDMDS(ph)EDAHK-18075,1,1,1,1,1,1,1,1,1,1,1,1,1
1,ADDEEDLS(ph)DENIQPELR-18058,1,1,1,1,1,1,1,1,1,1,1,1,1
2,SNS(ph)IDYAK-18078,1,1,1,1,1,1,1,1,1,1,1,1,1
3,QHLSDIT(ph)LEER-18077,1,1,1,1,1,1,1,1,1,1,1,1,1
4,GLDDESGPTHGNDS(ph)GNHR-18068,1,1,1,1,1,1,1,1,1,1,1,1,1
5,LDFVTDDLEYHLANTHPDDT(ph)NDKVESR-18070,1,1,1,1,1,1,1,1,1,1,1,1,0
6,LLS(ph)SHLK-18073,1,1,0,1,0,1,1,0,1,0,1,1,1
7,S(ph)KFEGLAASEKEEEEMENK-9982,1,0,0,0,0,0,0,0,0,0,0,0,0
8,TPS(ph)PAPAAK-9993,1,1,1,1,1,1,1,1,1,1,1,1,1
9,NEKS(ph)PAQLWAER-9974,1,1,1,1,1,1,1,1,1,1,1,1,1


# Check some values

In [9]:
# Disabled spot checks of individual scores.
# NOTE(review): the failure messages index df with 'ORF', but that column was
# dropped from df in the previous cell; use 'ModifiedPeptideSeq' in the message
# if these checks are re-enabled.
#assert df.iloc[7]['T05'] == 0, "Incorrect value for " + df.iloc[7]['ORF']
#assert df.iloc[4333]['T40'] == 0, "Incorrect value for " + df.iloc[4333]['ORF']

# Create df for temporary prevscores data

In [10]:
# One-column frame with the peptide labels (needed to add the sequence data).
peptideSeq = df.loc[:, 'ModifiedPeptideSeq':'ModifiedPeptideSeq']

# Placeholder previous-time-point scores: every entry is 1 for T05 onwards.
# Building the frame directly avoids writing into a slice of a copy.
prevScoresdf = pd.DataFrame(1, index=df.index, columns=df.loc[:, 'T05':].columns)

# Insert the label as a Series (a single column) rather than a DataFrame.
# The header 'ModifiedPepSeq' is what prevScores.tsv is written with.
prevScoresdf.insert(0, 'ModifiedPepSeq', peptideSeq['ModifiedPeptideSeq'])

prevScoresdf

Unnamed: 0,ModifiedPepSeq,T05,T10,T15,T20,T25,T30,T35,T40,T45,T50,T55,T60
0,NVVDENLINDMDS(ph)EDAHK-18075,1,1,1,1,1,1,1,1,1,1,1,1
1,ADDEEDLS(ph)DENIQPELR-18058,1,1,1,1,1,1,1,1,1,1,1,1
2,SNS(ph)IDYAK-18078,1,1,1,1,1,1,1,1,1,1,1,1
3,QHLSDIT(ph)LEER-18077,1,1,1,1,1,1,1,1,1,1,1,1
4,GLDDESGPTHGNDS(ph)GNHR-18068,1,1,1,1,1,1,1,1,1,1,1,1
5,LDFVTDDLEYHLANTHPDDT(ph)NDKVESR-18070,1,1,1,1,1,1,1,1,1,1,1,1
6,LLS(ph)SHLK-18073,1,1,1,1,1,1,1,1,1,1,1,1
7,S(ph)KFEGLAASEKEEEEMENK-9982,1,1,1,1,1,1,1,1,1,1,1,1
8,TPS(ph)PAPAAK-9993,1,1,1,1,1,1,1,1,1,1,1,1
9,NEKS(ph)PAQLWAER-9974,1,1,1,1,1,1,1,1,1,1,1,1


# Output to file 

In [11]:
# firstScores.tsv starts at T05, so T00 is removed from df first.
# The guard keeps the cell re-runnable: an unconditional drop raises KeyError
# the second time the cell is executed.
if 'T00' in df.columns:
    df = df.drop('T00', axis=1)

# (file name, frame, write header row?) - peptideMap.tsv is written headerless.
outputs = [
    ('firstScores.tsv', df, True),
    ('prevScores.tsv', prevScoresdf, True),
    ('timeSeries.tsv', dfTimeSeries, True),
    ('peptideMap.tsv', dfPeptideMap, False),
]

for filename, frame, write_header in outputs:
    path = os.path.join(FILEPATH, filename)
    frame.to_csv(path, index=False, header=write_header, sep='\t')

In [12]:
# Spot-check the scores of one peptide (row position 3235).
# print(...) with a single argument works on both Python 2 and Python 3.
print(df.iloc[3235])

ModifiedPeptideSeq    LSDANST(ph)FK-17759
T05                                     0
T10                                     0
T15                                     0
T20                                     0
T25                                     1
T30                                     1
T35                                     1
T40                                     0
T45                                     0
T50                                     0
T55                                     0
T60                                     1
Name: 3235, dtype: object


In [13]:
# Spot-check row position 4328, one of the rows whose T00 was missing (NaN)
# before it was filled with 0.
# print(...) with a single argument works on both Python 2 and Python 3.
print(df.iloc[4328])

ModifiedPeptideSeq    VESHVIILNDPAS(ph)PASNTSEATSSK-12913
T05                                                     1
T10                                                     1
T15                                                     1
T20                                                     1
T25                                                     1
T30                                                     1
T35                                                     1
T40                                                     1
T45                                                     1
T50                                                     1
T55                                                     1
T60                                                     1
Name: 4328, dtype: object
