# Combine multiple protein and peptide annotations to create a single annotation for Cytoscape - April 4, 2017

Use the TPS input and output.

**Need to add default values for missing data**

**Need to validate the output and add assert statements**

## Set up file locations

In [1]:
% pylab inline

import os
import util
import pandas as pd

# Base directory, two levels up from the working directory
baseDir = os.path.join('..', '..')

# Directory with the TPS input files and directory with the TPS results
tps_in_dir = os.path.join(baseDir, 'Notebooks', 'ApplyThresholds')
tps_out_dir = os.path.join(baseDir, 'Results', 'TPS output', 'TPS_2-19-2017')

# TPS input files; the peptide map is the version with the header line
pepMapFile, pepFirstFile, pepPrevFile, timeSeriesFile = (
    os.path.join(tps_in_dir, tpsInput)
    for tpsInput in ('peptideMapHeader.tsv', 'firstScores.tsv',
                     'prevScores.tsv', 'timeSeries.tsv')
)

# TPS output files
windowsFile, networkFile = (
    os.path.join(tps_out_dir, tpsOutput)
    for tpsOutput in ('activity-windows.tsv', 'output.sif')
)

# The Kitano 2016 pathway serves as the gold standard
goldStandardFile = os.path.join(baseDir, 'data', 'evaluation', 'Kitano2016_ORFs.txt')

# Style template file, looked up in the working directory
styleTemplateFile = 'tps_style_template.xml'

# Generated files go into the working directory
out_dir = '.'
outFile = os.path.join(out_dir, 'kanshin2015-cytoscape-annotations.txt')
outStyleFile = os.path.join(out_dir, 'tps_style.xml')

Populating the interactive namespace from numpy and matplotlib


## Call the function to load the temporal data and gold standard annotations and merge them into a Cytoscape-readable format

In [2]:
# Same threshold used in TPS, but it doesn't matter with binary "p-values"
pvalThresh = 0.01
logTransform = False

# All arguments are positional; logDefault and addZero are not provided
pepsPerProt = util.PrepTemporalCytoscapeTPS(
    pepMapFile, timeSeriesFile, pepFirstFile, pepPrevFile,
    windowsFile, networkFile,
    goldStandardFile,
    pvalThresh, logTransform,
    styleTemplateFile,
    outFile, outStyleFile,
)

Loaded protein id map for 4337 peptides
Loaded 4337 peptides and 12 scores in the first and previous score files
Loaded prizes for 4337 peptides
2328 peptides with significant prizes (>= 2)
['-0.0414319732335', '0.014212444812104711', '0.06460697529739193', '0.13224779829843977', '0.13119435170845012', '0.06419306192749237', '-0.13460373753535176', '0.025170575734906053', '0.08270258933024935', '0.16169423543613154', '-0.11348887132475236', '0.09342495941910292', '-0.07123932561939318']
['-0.00452271722862', '-0.020311184989869394', '0.02233230009805639', '0.002594516708452323', '0.031818625826357146', '0.006908369342177494', '0.05172011644732763', '0.02375213589899498', '0.03365205248999707', '0.005184375859723258', '0.0600473836699389', '0.033792988849303096', '0.02375213589899498']
['-0.0951883824741', '-0.03922142339575775', '-0.043705396895833915', '-0.0421151096788827', '-0.07733021248373827', '-0.0671828972581494', '-0.13541169050387866', '-0.14617996124757043', '-0.134999736374

ValueError: could not convert string to float: 

## Plot the number of peptides per protein

In [None]:
# Tally every peptide count in one pass instead of rescanning pepsPerProt
# once per possible count.
from collections import Counter

# Histogram of the number of peptides per protein
hist(pepsPerProt)

proteinsByCount = Counter(pepsPerProt)
for count in range(1, max(pepsPerProt) + 1):
    # Counter yields 0 for counts that never occur, so gaps are still reported.
    # print(...) with one pre-formatted string gives the same output on
    # Python 2 and Python 3.
    print("%d proteins with %d peptide(s)" % (proteinsByCount[count], count))

## Test parts of the Cytoscape table

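**TODO: the checks below still use protein names and expected counts from the human data and need to be updated for the yeast data**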

In [None]:
# Read the tab-delimited annotation table at outFile
cytoDf = pd.read_csv(outFile, delimiter='\t')
# Last expression of the cell, so the notebook displays the first rows
cytoDf.head()

In [None]:
# NOTE(review): the hard-coded protein names and expected counts in this cell
# (*_HUMAN ids, 701, 34, 189, ...) appear to come from an earlier dataset --
# confirm them before relying on these checks.

# Verify that all TPS nodes are in the annotation file
tpsNodes = set()
with open(networkFile) as net_f:
    for line in net_f:
        # Skip blank lines (e.g. a trailing newline) instead of failing on them
        if not line.strip():
            continue
        parts = line.strip().split('\t')
        # Node names are read from the first and third tab-separated columns
        tpsNodes.add(parts[0])
        tpsNodes.add(parts[2])

subsetDf = cytoDf[cytoDf['NodeType'] != 'Excluded']
subsetNodes = set(subsetDf['Protein'].values)
assert tpsNodes == subsetNodes
print('All TPS nodes are in the annotation file')

# Verify all proteins that have peptides listed in the annotation file
# are in the peptide-protein mapping.  The peptide-protein mapping
# contains proteins from any of the three replicates though, whereas the
# annotation file only contains those in all three.
prizeNodes = set()
with open(pepMapFile) as prize_f:
    # Skip the header; next() works on both Python 2 and Python 3 file objects
    next(prize_f)
    for line in prize_f:
        if not line.strip():
            continue
        parts = line.strip().split('\t')
        # The protein name is read from the second column
        prizeNodes.add(parts[1])
# All proteins in the annotation file with a significant or insignificant peptide
hasPeptide = cytoDf['SigPeptide1'].notnull() | cytoDf['InsigPeptide1'].notnull()
subsetNodes = set(cytoDf.loc[hasPeptide, 'Protein'].values)
assert len(subsetNodes) == 701
assert subsetNodes.issubset(prizeNodes)
print('All annotation file proteins are in the peptide mapping file')

# Verify all nodes in the annotation file are from the TPS network or the prize nodes
allNodes = tpsNodes | prizeNodes
subsetNodes = set(cytoDf['Protein'].values)
assert subsetNodes.issubset(allNodes)
print('All nodes in the annotation file are from TPS or the prize nodes')

# Verify the reference pathway overlap matches what was observed previously
assert len(cytoDf[cytoDf['ReferencePathway'] & (cytoDf['NodeType'] != 'Excluded')]) == 34
print('All nodes in the TPS pathway and EGFR reference pathways are in the annotation file')

# Verify first activity times.  Comparing plain lists gives a single
# True/False, and fails cleanly when the protein is missing or duplicated.
assert cytoDf.loc[cytoDf['Protein'] == 'DYR1B_HUMAN', 'FirstActive'].tolist() == ['128min']
assert cytoDf.loc[cytoDf['Protein'] == 'ARHG7_HUMAN', 'FirstActive'].tolist() == ['32min']
assert len(cytoDf[cytoDf['FirstActive'] == '2min']) == 189
assert len(cytoDf[cytoDf['FirstActive'] == '64min']) == 8
assert len(cytoDf[cytoDf['FirstActive'] == 'Not active']) == 428
print('Nodes in annotation file have expected first activity time')

# Verify all excluded nodes have no first activity time
# (Series.all() avoids depending on the names injected by %pylab)
assert ((cytoDf['NodeType'] == 'Excluded') == (cytoDf['FirstActive'] == 'Not active')).all()
print('All excluded nodes have no activity time and vice versa')


def _numPeptides(protein, firstCol, lastCol):
    """Count the non-null cells in *protein*'s rows from firstCol through lastCol.

    The count uses the array's own sum() so the result is a single number
    whether or not %pylab has replaced the builtin sum with numpy's.
    """
    return cytoDf.loc[cytoDf['Protein'] == protein, firstCol:lastCol].notnull().values.sum()


# Verify that these proteins have the expected number of significant and insignificant peptides
for protein, numSig, numInsig in [('EGFR_HUMAN', 6, 2),
                                  ('NUCKS_HUMAN', 2, 10),
                                  ('CBL_HUMAN', 3, 2)]:
    assert _numPeptides(protein, 'SigPeptide1', 'SigPeptide6') == numSig
    assert _numPeptides(protein, 'InsigPeptide1', 'InsigPeptide10') == numInsig
print('Proteins have expected number of peptides')

def MinLogFC(row):
    """Return the smallest fold change found in any time series of *row*.

    Each non-missing cell of *row* holds a comma-separated list of numbers,
    e.g. ``'0.5, -1.2, 0.3'``.  Missing cells (NaN) are skipped.  A cell that
    holds a bare number instead of a string is treated as a one-element
    time series.

    Raises ValueError if a value cannot be parsed as a float or if *row*
    contains no values at all.
    """
    fcs = []
    for timeSeries in row:
        # str() sends numeric cells down the same parsing path as strings;
        # a missing cell becomes 'nan' and is skipped.
        text = str(timeSeries)
        if text == 'nan':
            continue
        # float() ignores surrounding whitespace, so both ',' and ', '
        # separators are accepted.
        fcs.extend(float(fc) for fc in text.split(','))
    if not fcs:
        raise ValueError('row contains no fold changes')
    return min(fcs)

# Verify min fold change of another protein
gsk3aRow = cytoDf.loc[cytoDf['Protein'] == 'GSK3A_HUMAN', 'SigPeptide1':'InsigPeptide10'].values[0]
assert np.isclose(MinLogFC(gsk3aRow), -1.051833)
# print(...) with a single string gives the same output on Python 2 and 3
print('Protein has expected min log2 fold change')

# Verify that no '$$$' placeholder text is left in the style file.
# The file is closed before the check runs.
with open(outStyleFile) as style_f:
    styleContent = style_f.read()
assert '$$$' not in styleContent
print('Removed placeholders from style file')