Input: - mmc2.xlsx - original phosphorylation data
Output: - prizes.txt - prizes file for running PCSF

Finds the maximum phosphorylation change for each protein in the original dataset.

# Include libraries and read in file, using first line of text as headers

In [1]:
import os.path
import matplotlib.pyplot as plt
import pandas as pd


#r-escapes the whole string
FILEPATH = r'/home/dylan/Documents/HDD/Wisconsin/osmotic-stress/Notebooks/PrizesNotebook/'


Location = FILEPATH + 'mmc2.xlsx'

#check file exists
assert os.path.isfile(Location) == True, "File does not exist."

#Makes the header the first line of text
df = pd.read_excel(Location, header = 1)
#print df

# Declare another dataframe for just fold changes and apply absolute values

In [2]:
FoldData = df.ix[:,15:28] #T00-T60
#print FoldData
#Used to map the absolute value to each column in the dataframe
def absVal(x):
    valType = str(type(x))
    if valType == '<type \'float\'>':
        return abs(x)
    else:
        print type(x)
        return

FoldData = FoldData.applymap(absVal)
print FoldData

           T00       T05       T10       T15       T20       T25       T30  \
0     0.041432  0.014212  0.064607  0.132248  0.131194  0.064193  0.134604   
1     0.004523  0.020311  0.022332  0.002595  0.031819  0.006908  0.051720   
2     0.095188  0.039221  0.043705  0.042115  0.077330  0.067183  0.135412   
3     0.054797  0.098554  0.041944  0.105411  0.020769  0.034633  0.004479   
4     0.113489  0.100685  0.020019  0.034352  0.081125  0.165159  0.046057   
5     0.093079  0.057932  0.013072  0.058247  0.062625  0.018492  0.068602   
6     0.221692  0.077380  0.388686  0.098824       NaN  0.025596  0.042858   
7     0.134129  0.469573  0.556307  0.582460  0.470823  0.642286  0.709467   
8     0.028131  0.014631  0.027719  0.011952  0.024663  0.011704  0.023226   
9     0.031858  0.073957  0.045327  0.043624  0.001154  0.130618  0.108892   
10    0.048655  0.048236  0.020342  0.028993  0.023894  0.064883  0.003351   
11    0.060462  0.037874  0.094777  0.073409  0.075053  0.132116

# Find highest of each row in FoldData dataframe

In [3]:
FoldData = FoldData.max(axis = 1)
FoldData

0       0.161694
1       0.060047
2       0.226341
3       0.134588
4       0.165159
5       0.235875
6       0.388686
7       1.013141
8       0.028131
9       0.130618
10      0.309091
11      0.192573
12      0.061154
13      0.098907
14      0.343351
15      0.279161
16      0.492622
17      0.240620
18      0.376732
19      0.459794
20      0.472800
21      0.828144
22      0.118664
23      0.209545
24      0.291627
25      0.318884
26      0.110497
27      0.118094
28      0.205768
29      0.212258
          ...   
4307    0.076969
4308    0.404496
4309    0.283106
4310    0.631288
4311    0.134192
4312    0.101954
4313    0.198368
4314    0.257976
4315    0.076012
4316    0.141694
4317    0.117429
4318    0.278698
4319    2.503159
4320    1.599899
4321    1.706276
4322    0.614757
4323    0.201634
4324    0.390983
4325    0.275245
4326    0.187658
4327    0.299948
4328    0.310573
4329    0.156267
4330    0.248292
4331    0.117562
4332    0.206143
4333    0.453860
4334    0.3678

# Drop unneccesary columns from original frame

### Concatenate FoldData dataframe with the original dataframe

In [4]:
df = df['ORF']

df = pd.concat([df, FoldData], axis = 1)

df

Unnamed: 0,ORF,0
0,YKL112W,0.161694
1,YKL112W,0.060047
2,YKL112W,0.226341
3,YKL112W,0.134588
4,YKL112W,0.165159
5,YKL112W,0.235875
6,YKL112W,0.388686
7,YCR088W,1.013141
8,YCR088W,0.028131
9,YCR088W,0.130618


# Group by ORF identification

In [5]:
group = df.groupby('ORF').max()
df = pd.DataFrame(group).reset_index()

# Delete punctuation in the file

In [6]:
df = df.replace({'-': ''}, regex=True)

# Check again for punctuation in file and correct values in dataframe

In [7]:
for index,row in df.iterrows():
    if(row['ORF'].find('-') != -1):
        print "There is punctuation in file for " + row['ORF']
        print ""
    if(row['ORF'] == 'YGR246C'):
        #assert row[0] == .426962447759, "Incorrect value for " + row['ORF']
        print row[0]
    if(row['ORF'] == 'YNL039W'):
        print row[0]
        print (row[0] == 2.16443159189)
        #assert row[0] == 2.16443159189003, "Incorrect value for " + row['ORF']



0.426962447759
2.16443159189
False


# Write prizes to files

In [8]:
path = FILEPATH + 'prizes.txt'

df.to_csv(path,  index = False, header = False, sep = '\t')

assert os.path.isfile(path) == True, "File was not written correctly to drive"

df

Unnamed: 0,ORF,0
0,YAL001C,0.492110
1,YAL003W,0.260869
2,YAL005C,0.166587
3,YAL011W,0.316115
4,YAL013W,0.341044
5,YAL016W,0.423740
6,YAL017W,0.147046
7,YAL019W,0.280362
8,YAL021C,0.147394
9,YAL022C,0.494780
