Uses the thresholds provided from the original data to determine if proteins are significantly phosphorylated at each time point.

Input: - mmc2.xlsx - original dataset with time series
       - FC_thresholds.xlsx - originial dataset with thresholds for each time point
       - result_optimalForest.sif - no longer necessary need to refactor
Output: 
    - firstScores.tsv - first scores file comparing time point to first time point
    - prevScores.tsv - comparing this time point to previous time point
    - network.tsv - believed to be unnecessary file need to refactor code
    - timeSeries.tsv - file with the proteins and time series phosphorylation data
    - finalOptimalForest.sif - unnecessary file need to refactor
    - peptideMap.tsv - network input file with only sequences and identification

# Load in libraries and files

In [302]:
import os.path
import matplotlib.pyplot as plt
import pandas as pd

temporalPhospho = r'Z:/Yeast/mmc2.xlsx'
thresholdPath = r'Z:/Yeast/FC_thresholds.xlsx'

df = pd.read_excel(temporalPhospho, header = 1)
dfThresh = pd.read_excel(thresholdPath, header = 1)

df.rename(columns={'Modified Peptide Sequence':'ModifiedPeptideSeq'}, inplace=True)
dfThresh.rename(columns = {'Threshold fo rphosphorylation' : 'PhosphorylationThresh'}, inplace = True)
dfThresh.rename(columns = {'Threshold for dephosphorylation' : 'dephosphorylationThresh'}, inplace = True)
dfThresh

Unnamed: 0,TimePoint,PhosphorylationThresh,dephosphorylationThresh
0,T00,0.396707,-0.377501
1,T05,0.370611,-0.400644
2,T10,0.333767,-0.374
3,T15,0.340391,-0.356125
4,T20,0.387142,-0.36857
5,T25,0.35682,-0.406866
6,T30,0.438825,-0.489524
7,T35,0.496411,-0.52048
8,T40,0.40294,-0.419063
9,T45,0.339707,-0.354889


In [303]:
networkPath = r'Z:/Yeast/result_optimalForest.sif'
dfNetwork = pd.read_csv(networkPath, sep = '\t', header = None)

dfNetwork[1] = 'N'

dfDirectedNetwork = dfNetwork.copy()

dfNetwork.drop(1, axis=1, inplace=True)
dfNetwork


Unnamed: 0,0,2
0,YBL016W,YDR103W
1,YBL016W,YHR084W
2,YBL016W,YOL127W
3,YBL016W,YPL049C
4,YBL105C,YJL095W
5,YBR059C,YGL153W
6,YBR160W,YAL024C
7,YBR160W,YBR102C
8,YBR160W,YDL017W
9,YBR160W,YDL028C


# Concatenate Modified Peptide Sequence and the id together

In [304]:
df['ModifiedPeptideSeq'] = df.ModifiedPeptideSeq + "-" + df.id.map(str) 

df = df.fillna('NA')

df = df[df.T00 != 'NA']

df

Unnamed: 0,id,ModifiedPeptideSeq,PhosphoGroups,Significance,pSite,MaxLocConf,AvrLocConf,ORF,SGD_ID,Uniprot,...,T15.3,T20.3,T25.3,T30.3,T35.3,T40.3,T45.3,T50.3,T55.3,T60.3
0,18075,NVVDENLINDMDS(ph)EDAHK-18075,1,,S618,1,1,YKL112W,S000001595,P14164,...,6.4209,6.79686,6.7347,6.46942,6.40666,6.56476,6.44585,6.43567,6.59324,6.40285
1,18058,ADDEEDLS(ph)DENIQPELR-18058,1,,S720,1,1,YKL112W,S000001595,P14164,...,7.98176,8.24586,8.15715,8.01515,8.05434,8.03306,8.07671,8.0293,8.01778,7.91105
2,18078,SNS(ph)IDYAK-18078,1,,S467,1,0.999923,YKL112W,S000001595,P14164,...,6.27916,6.50946,6.51408,6.16104,6.2116,6.43297,6.50958,6.26014,6.17638,5.8094
3,18077,QHLSDIT(ph)LEER-18077,1,,T633,1,0.969,YKL112W,S000001595,P14164,...,6.18988,6.06867,6.34056,6.08415,5.7365,5.82223,5.80107,5.7704,6.01435,5.66877
4,18068,GLDDESGPTHGNDS(ph)GNHR-18068,1,,S215,1,0.92525,YKL112W,S000001595,P14164,...,5.03226,4.56592,4.90407,5.40273,5.15039,5.13469,5.31702,4.91852,5.41963,5.05077
5,18070,LDFVTDDLEYHLANTHPDDT(ph)NDKVESR-18070,1,,T153,0.826,0.7855,YKL112W,S000001595,P14164,...,5.51829,4.73759,4.88582,5.35614,5.32915,5.60244,5.63509,5.18665,5.59145,
6,18073,LLS(ph)SHLK-18073,1,,S655,0.971,0.777,YKL112W,S000001595,P14164,...,5.2694,,5.67585,5.08164,,5.44116,5.30797,4.79496,5.30494,4.82557
7,9982,S(ph)KFEGLAASEKEEEEMENK-9982,1,FC,S357,0.974,0.90325,YCR088W,S000000684,P15891,...,5.71925,5.75869,5.96305,5.73103,5.55017,5.42236,5.62015,5.49693,5.59128,4.53372
8,9993,TPS(ph)PAPAAK-9993,1,,S183,1,1,YCR088W,S000000684,P15891,...,7.08821,7.02837,7.17287,6.55745,6.94109,6.89891,6.91322,6.73356,6.79575,6.31821
9,9974,NEKS(ph)PAQLWAER-9974,1,,S313,1,1,YCR088W,S000000684,P15891,...,6.17415,6.27667,6.41736,5.89887,5.52089,6.0003,5.80446,5.9878,6.21362,5.91107


# Obtain just the relevant columns for the data

In [305]:
## Can drop T00
FoldData = df.ix[:,15:28] #T00-T60

df = df[['ModifiedPeptideSeq', 'ORF']]

df = pd.concat([df, FoldData], axis = 1)

#Needed for input file with the sequence and time series data
dfTimeSeries = df.copy()
dfTimeSeries = dfTimeSeries.drop('ORF', 1)
dfTimeSeries

Unnamed: 0,ModifiedPeptideSeq,T00,T05,T10,T15,T20,T25,T30,T35,T40,T45,T50,T55,T60
0,NVVDENLINDMDS(ph)EDAHK-18075,-0.041432,0.0142124,0.064607,0.132248,0.131194,0.0641931,-0.134604,0.0251706,0.0827026,0.161694,-0.113489,0.093425,-0.0712393
1,ADDEEDLS(ph)DENIQPELR-18058,-0.00452272,-0.0203112,0.0223323,0.00259452,0.0318186,0.00690837,0.0517201,0.0237521,0.0336521,0.00518438,0.0600474,0.033793,0.0237521
2,SNS(ph)IDYAK-18078,-0.0951884,-0.0392214,-0.0437054,-0.0421151,-0.0773302,-0.0671829,-0.135412,-0.14618,-0.135,-0.192249,-0.129402,-0.224857,-0.226341
3,QHLSDIT(ph)LEER-18077,-0.0547968,0.0985543,0.0419438,-0.105411,0.0207689,-0.0346331,-0.0044793,-0.134588,0.0848807,-0.0449551,-0.0752312,-0.0762651,-0.082194
4,GLDDESGPTHGNDS(ph)GNHR-18068,-0.113489,-0.100685,-0.0200186,-0.0343524,-0.0811253,-0.165159,-0.0460569,-0.00390054,0.0803848,-0.117521,0.0619835,-0.0232551,-0.00526099
5,LDFVTDDLEYHLANTHPDDT(ph)NDKVESR-18070,-0.0930786,-0.0579322,-0.0130722,0.0582472,-0.0626255,0.0184917,0.068602,-0.00589824,-0.206628,0.0694272,0.086512,-0.235875,
6,LLS(ph)SHLK-18073,0.221692,0.0773797,0.388686,0.0988238,,0.0255958,-0.042858,,-0.0361264,0.381283,-0.111337,-0.09068,-0.179825
7,S(ph)KFEGLAASEKEEEEMENK-9982,-0.134129,0.469573,0.556307,0.58246,0.470823,0.642286,0.709467,1.01314,0.770533,0.695727,0.815411,0.857663,0.559247
8,TPS(ph)PAPAAK-9993,-0.0281309,-0.0146307,-0.0277191,-0.0119516,-0.0246633,-0.0117043,-0.0232258,-0.00439247,-0.0174901,-0.00366911,-0.0221999,-0.023827,-0.0124463
9,NEKS(ph)PAQLWAER-9974,-0.0318576,0.0739573,-0.0453272,0.0436245,0.00115369,-0.130618,0.108892,0.0433445,0.0957219,0.0655723,-0.00523203,0.126973,0.047678


In [306]:
#df = df.fillna('NA')

#df = df[df.T00 != 'NA']

df

Unnamed: 0,ModifiedPeptideSeq,ORF,T00,T05,T10,T15,T20,T25,T30,T35,T40,T45,T50,T55,T60
0,NVVDENLINDMDS(ph)EDAHK-18075,YKL112W,-0.041432,0.0142124,0.064607,0.132248,0.131194,0.0641931,-0.134604,0.0251706,0.0827026,0.161694,-0.113489,0.093425,-0.0712393
1,ADDEEDLS(ph)DENIQPELR-18058,YKL112W,-0.00452272,-0.0203112,0.0223323,0.00259452,0.0318186,0.00690837,0.0517201,0.0237521,0.0336521,0.00518438,0.0600474,0.033793,0.0237521
2,SNS(ph)IDYAK-18078,YKL112W,-0.0951884,-0.0392214,-0.0437054,-0.0421151,-0.0773302,-0.0671829,-0.135412,-0.14618,-0.135,-0.192249,-0.129402,-0.224857,-0.226341
3,QHLSDIT(ph)LEER-18077,YKL112W,-0.0547968,0.0985543,0.0419438,-0.105411,0.0207689,-0.0346331,-0.0044793,-0.134588,0.0848807,-0.0449551,-0.0752312,-0.0762651,-0.082194
4,GLDDESGPTHGNDS(ph)GNHR-18068,YKL112W,-0.113489,-0.100685,-0.0200186,-0.0343524,-0.0811253,-0.165159,-0.0460569,-0.00390054,0.0803848,-0.117521,0.0619835,-0.0232551,-0.00526099
5,LDFVTDDLEYHLANTHPDDT(ph)NDKVESR-18070,YKL112W,-0.0930786,-0.0579322,-0.0130722,0.0582472,-0.0626255,0.0184917,0.068602,-0.00589824,-0.206628,0.0694272,0.086512,-0.235875,
6,LLS(ph)SHLK-18073,YKL112W,0.221692,0.0773797,0.388686,0.0988238,,0.0255958,-0.042858,,-0.0361264,0.381283,-0.111337,-0.09068,-0.179825
7,S(ph)KFEGLAASEKEEEEMENK-9982,YCR088W,-0.134129,0.469573,0.556307,0.58246,0.470823,0.642286,0.709467,1.01314,0.770533,0.695727,0.815411,0.857663,0.559247
8,TPS(ph)PAPAAK-9993,YCR088W,-0.0281309,-0.0146307,-0.0277191,-0.0119516,-0.0246633,-0.0117043,-0.0232258,-0.00439247,-0.0174901,-0.00366911,-0.0221999,-0.023827,-0.0124463
9,NEKS(ph)PAQLWAER-9974,YCR088W,-0.0318576,0.0739573,-0.0453272,0.0436245,0.00115369,-0.130618,0.108892,0.0433445,0.0957219,0.0655723,-0.00523203,0.126973,0.047678


# Helper function to determine 0's and 1's based on threshold

In [307]:
# Helper function
def threshold(fc_value, fc_min, fc_max):
    # Return 1 or 0
    #print fc_value
    if(fc_value > fc_max):
        return 0
    elif(fc_value < fc_min):
        return 0
    else:
        return 1
    
#WHAT TO DO ABOUT NA CASES???


# Loop through the values for different time points and compare to threshold

In [308]:
# Loop over time points


for time in dfThresh['TimePoint']: 
    fc_max = dfThresh.loc[dfThresh['TimePoint'] == time, 'PhosphorylationThresh'].values
    fc_min = dfThresh.loc[dfThresh['TimePoint'] == time, 'dephosphorylationThresh'].values

    df[time] = df.apply(lambda row: threshold(row[time], fc_min, fc_max), axis=1)



# Drop ORF column

In [309]:
dfPeptideMap = df.ix[:,'ModifiedPeptideSeq':'ORF'] #Need network input file with only sequences and identification

df = df.drop('ORF', 1)
df

Unnamed: 0,ModifiedPeptideSeq,T00,T05,T10,T15,T20,T25,T30,T35,T40,T45,T50,T55,T60
0,NVVDENLINDMDS(ph)EDAHK-18075,1,1,1,1,1,1,1,1,1,1,1,1,1
1,ADDEEDLS(ph)DENIQPELR-18058,1,1,1,1,1,1,1,1,1,1,1,1,1
2,SNS(ph)IDYAK-18078,1,1,1,1,1,1,1,1,1,1,1,1,1
3,QHLSDIT(ph)LEER-18077,1,1,1,1,1,1,1,1,1,1,1,1,1
4,GLDDESGPTHGNDS(ph)GNHR-18068,1,1,1,1,1,1,1,1,1,1,1,1,1
5,LDFVTDDLEYHLANTHPDDT(ph)NDKVESR-18070,1,1,1,1,1,1,1,1,1,1,1,1,0
6,LLS(ph)SHLK-18073,1,1,0,1,0,1,1,0,1,0,1,1,1
7,S(ph)KFEGLAASEKEEEEMENK-9982,1,0,0,0,0,0,0,0,0,0,0,0,0
8,TPS(ph)PAPAAK-9993,1,1,1,1,1,1,1,1,1,1,1,1,1
9,NEKS(ph)PAQLWAER-9974,1,1,1,1,1,1,1,1,1,1,1,1,1


# Check some values

In [310]:
#assert df.iloc[7]['T05'] == 0, "Incorrect value for " + df.iloc[7]['ORF']
#assert df.iloc[4333]['T40'] == 0, "Incorrect value for " + df.iloc[4333]['ORF']

# Create df for temporary prevscores data

In [311]:
prevScoresdf = df.copy()
peptideSeq = df.ix[:,'ModifiedPeptideSeq':'ModifiedPeptideSeq'] #Needed to add peptide sequence data

prevScoresdf.ix[:,'T00':] = 1 #select T05-T60

prevScoresdf = prevScoresdf.ix[:,'T05':]
prevScoresdf.insert(0, 'ModifiedPepSeq', peptideSeq) #insert peptide sequence column

prevScoresdf

Unnamed: 0,ModifiedPepSeq,T05,T10,T15,T20,T25,T30,T35,T40,T45,T50,T55,T60
0,NVVDENLINDMDS(ph)EDAHK-18075,1,1,1,1,1,1,1,1,1,1,1,1
1,ADDEEDLS(ph)DENIQPELR-18058,1,1,1,1,1,1,1,1,1,1,1,1
2,SNS(ph)IDYAK-18078,1,1,1,1,1,1,1,1,1,1,1,1
3,QHLSDIT(ph)LEER-18077,1,1,1,1,1,1,1,1,1,1,1,1
4,GLDDESGPTHGNDS(ph)GNHR-18068,1,1,1,1,1,1,1,1,1,1,1,1
5,LDFVTDDLEYHLANTHPDDT(ph)NDKVESR-18070,1,1,1,1,1,1,1,1,1,1,1,1
6,LLS(ph)SHLK-18073,1,1,1,1,1,1,1,1,1,1,1,1
7,S(ph)KFEGLAASEKEEEEMENK-9982,1,1,1,1,1,1,1,1,1,1,1,1
8,TPS(ph)PAPAAK-9993,1,1,1,1,1,1,1,1,1,1,1,1
9,NEKS(ph)PAQLWAER-9974,1,1,1,1,1,1,1,1,1,1,1,1


# Output to file 

In [312]:
#firstScoresdf = df.ix[:,'T05':]


df.drop('T00', axis=1, inplace=True)

path = r'Z:\Yeast\firstScores.tsv'
df.to_csv(path, index = False, header = True, sep = '\t')

path = r'Z:\Yeast\prevScores.tsv'
prevScoresdf.to_csv(path, index = False, header = True, sep = '\t')

path = r'Z:\Yeast\network.tsv'
dfNetwork.to_csv(path, index = False, header = False, sep = '\t')

path = r'Z:\Yeast\timeSeries.tsv'
dfTimeSeries.to_csv(path, index = False, header = True, sep = '\t')

path = r'Z:\Yeast\finalOptimalForest.sif'
dfDirectedNetwork.to_csv(path, index = False, header = False, sep = '\t')

path = r'Z:\Yeast\peptideMap.tsv'
dfPeptideMap.to_csv(path, index = False, header = False, sep = '\t')

In [313]:
print df.iloc[3235]

ModifiedPeptideSeq    IS(ph)TPYNAK-15291
T05                                    1
T10                                    1
T15                                    1
T20                                    1
T25                                    1
T30                                    1
T35                                    1
T40                                    1
T45                                    1
T50                                    1
T55                                    1
T60                                    0
Name: 3373, dtype: object


In [314]:
print df.iloc[4328]

IndexError: single positional indexer is out-of-bounds