In [58]:
import pandas
import math
import re

########################################################
# DATA FILE NAMES
evidence_file = 'data/evidence.txt'
filtered_evidence_file = 'data/evidence_filtered.txt'
protein_groups_file = 'data/proteinGroups.txt'

In [59]:
def filter_tsv(input_file, output_file, col, pattern):
    if isinstance(pattern, str):
        pattern = re.compile(pattern)
        
    with open(output_file, 'w') as to_write:
        with open(input_file, 'r') as fp:
            index=-1
            n = 0
            for line in fp:
                parts = line.split('\t')
                if n == 0:
                    index = parts.index(col)
                    if index < 0:
                        raise ValueError(col + ' not found in ' + str(file))
                    to_write.write(line)
                else:
                    if pattern.match(parts[index]):
                        to_write.write(line)
                n += 1

# Test
# TODO delete output file
filter_tsv('test.tsv', 'output.tsv', 'evidence', re.compile('^keep'))
with open('output.tsv', 'r') as fp:
    assert ['a\tb\tevidence\tc\n', 'no\tno\tkeep\tno\n', 'no\tno\tkeeper\tno\n'] == fp.readlines()
import os
os.remove('output.tsv')
print('Tests passed')

Tests passed


In [60]:
# Now apply the filter to evidence.txt to get evidence_filtered.tsv
filter_tsv(evidence_file, filtered_evidence_file, 'Experiment', '^[Pp][Yy][Nn][Dd]')
print('Done.')

Done.


In [61]:

class SiteString:
    """
    A peptide sequence with one or more amino acids mapped to a floating-point score
    """
    
    sequence = None
    sites = None # Make a dict later
    
    def __init__(self, string):
        """
        string: A string from the data file like 'AFVNHM(8.97)M(-8.97)SSHSNHPGKR'
        """
        self.sites = {}
        pattern = '\\((-?[\\d.]+)\\)'
        offset = 0
        for match in re.finditer(pattern, string):
            self.sites[match.start() + offset] = float(match.group(1))
            offset -= len(match.group(0))
        self.sequence = re.sub(pattern, '', string)

        
def convert(cols, fn):
    for col in cols:
        try:
            df[col] = [fn(s) for s in df[col]]
        except ValueError as e:
            raise ValueError("Failed to convert column %s" % col) from e
    
def listify(df, cols, delim=';'):
    """Transform column type to list"""
    convert(cols, lambda s: [st.strip() for st in str(s).split(delim)])
    
def floatify(df, cols):
    """Transform column type to float"""
    convert(cols, lambda s: float(s))
    
def siteify(df, cols):
    """Transform column type to SiteString"""
    convert(cols, lambda s: SiteString(str(s)))
    
    
# Test scored site parser
import pytest
test_sites = SiteString('AFVNHM(8.97)MM(-8.97)SSHSNH(1.0)PGKR')
assert 'AFVNHMMMSSHSNHPGKR' == test_sites.sequence
assert {6: 8.97, 8: -8.97, 14: 1.0} == test_sites.sites
test_sites_2 = SiteString('')
assert '' == test_sites_2.sequence
assert {} == test_sites_2.sites

print("Tests passed")

Tests passed


In [48]:

def parse_evidence(file):
    
    df = pandas.read_table(file, header=0, index_col=0)

    # These are lists
    listify(df, [
            #'Protein group IDs',
            'MS/MS IDs',
            'Oxidation (M) site IDs',
            'Phospho (STY) site IDs',
            'Modifications',
            'Acetyl (Protein N-term)',
            'Proteins',
            'Leading proteins'
            #'Gene names',
            #'Protein Names'
            ])
    
    #listify(df, ['Protein Descriptions'], delim='|')
    
    # These have syntax like 'AFVNHM(8.97)M(-8.97)SSHSNHPGKR'
    siteify(df, [
            'Oxidation (M) Score Diffs',
            'Phospho (STY) Score Diffs',
            'Phospho (STY) Probabilities',
            'Oxidation (M) Probabilities'
    ])
    
    # Some contain "Infinity", which float() understands but Pandas doesn't
    floatify(df, ['PEP'])
    
    # Similarly, sometimes we get 'nan'
    # MS/MS IDs, Oxidation (M) site IDs, Phospho (STY) site IDs
    # TODO
    
    # Just use an empty list, not a list containing "Unmodified"
    df['Modifications'] = [[] if lst == ['Unmodified'] else lst for lst in df['Modifications']]

    return df
    

In [75]:
evidence = parse_evidence(filtered_evidence_file)
print(evidence.dtypes)
evidence

Length                                   int64
Modifications                           object
Modified sequence                       object
Oxidation (M) Probabilities             object
Phospho (STY) Probabilities             object
Oxidation (M) Score Diffs               object
Phospho (STY) Score Diffs               object
Acetyl (Protein N-term)                 object
Oxidation (M)                            int64
Phospho (STY)                            int64
Missed cleavages                         int64
Proteins                                object
Leading proteins                        object
Leading razor protein                   object
Type                                    object
Raw file                                object
Experiment                              object
MS/MS m/z                              float64
Charge                                   int64
m/z                                    float64
Mass                                   float64
Resolution   

Unnamed: 0_level_0,Length,Modifications,Modified sequence,Oxidation (M) Probabilities,Phospho (STY) Probabilities,Oxidation (M) Score Diffs,Phospho (STY) Score Diffs,Acetyl (Protein N-term),Oxidation (M),Phospho (STY),...,Potential contaminant,id,Protein group IDs,Peptide ID,Mod. peptide ID,MS/MS IDs,Best MS/MS,AIF MS/MS IDs,Oxidation (M) site IDs,Phospho (STY) site IDs
Sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAAAEKNVPLYK,12,[],_AAAAEKNVPLYK_,<__main__.SiteString object at 0x13a75c400>,<__main__.SiteString object at 0x1391ca128>,<__main__.SiteString object at 0x23d9e4710>,<__main__.SiteString object at 0x1e5ca1278>,[0],0,0,...,,1,1258,0,0,[nan],,,[nan],[nan]
AAAAEKNVPLYQHLADLSK,19,[],_AAAAEKNVPLYQHLADLSK_,<__main__.SiteString object at 0x13a75c4a8>,<__main__.SiteString object at 0x1391ca1d0>,<__main__.SiteString object at 0x23d9e47b8>,<__main__.SiteString object at 0x1e5ca11d0>,[0],0,0,...,,7,1393,1,1,[3],3,,[nan],[nan]
AAAAEKNVPLYQHLADLSK,19,[],_AAAAEKNVPLYQHLADLSK_,<__main__.SiteString object at 0x13a75c550>,<__main__.SiteString object at 0x1391ca278>,<__main__.SiteString object at 0x23d9e4860>,<__main__.SiteString object at 0x1e5ca10f0>,[0],0,0,...,,19,1393,1,1,[nan],,,[nan],[nan]
AAAAGAGGAGDSGDAVTK,18,[],_AAAAGAGGAGDSGDAVTK_,<__main__.SiteString object at 0x13a75c588>,<__main__.SiteString object at 0x1391ca2b0>,<__main__.SiteString object at 0x23d9e4898>,<__main__.SiteString object at 0x1e5ca10b8>,[0],0,0,...,,38,1889,2,2,[nan],,,[nan],[nan]
AAAALAGGK,9,[],_AAAALAGGK_,<__main__.SiteString object at 0x13a75c5f8>,<__main__.SiteString object at 0x1391ca320>,<__main__.SiteString object at 0x23d9e4908>,<__main__.SiteString object at 0x135242048>,[0],0,0,...,,43,1143,3,3,[23],23,,[nan],[nan]
AAAALAGGK,9,[],_AAAALAGGK_,<__main__.SiteString object at 0x13a75c668>,<__main__.SiteString object at 0x1391ca390>,<__main__.SiteString object at 0x23d9e4978>,<__main__.SiteString object at 0x1352420b8>,[0],0,0,...,,60,1143,3,3,[nan],,,[nan],[nan]
AAAALAGGKK,10,[],_AAAALAGGKK_,<__main__.SiteString object at 0x13a75c6d8>,<__main__.SiteString object at 0x1391ca400>,<__main__.SiteString object at 0x23d9e49e8>,<__main__.SiteString object at 0x135242128>,[0],0,0,...,,71,1143,4,4,[39],39,,[nan],[nan]
AAAALAGGKK,10,[],_AAAALAGGKK_,<__main__.SiteString object at 0x13a75c748>,<__main__.SiteString object at 0x1391ca470>,<__main__.SiteString object at 0x23d9e4a58>,<__main__.SiteString object at 0x135242198>,[0],0,0,...,,72,1143,4,4,"[40, 41]",41,,[nan],[nan]
AAAALAGGKK,10,[],_AAAALAGGKK_,<__main__.SiteString object at 0x13a75c7b8>,<__main__.SiteString object at 0x1391ca4e0>,<__main__.SiteString object at 0x23d9e4ac8>,<__main__.SiteString object at 0x135242208>,[0],0,0,...,,98,1143,4,4,[nan],,,[nan],[nan]
AAAALAGGKK,10,[],_AAAALAGGKK_,<__main__.SiteString object at 0x13a75c828>,<__main__.SiteString object at 0x1391ca550>,<__main__.SiteString object at 0x23d9e4b38>,<__main__.SiteString object at 0x135242278>,[0],0,0,...,,99,1143,4,4,[nan],,,[nan],[nan]


In [68]:
def parse_protein_groups(f):
    col_indicies = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 20, 21, 38, 39, 40, 41, 42, 43, 44, 45, 68, 69, 70, 71, 88, 89, 90, 91, 92, 93, 94, 95, 118, 119, 120, 121, 138, 139, 140, 141, 142, 143, 144, 145, 160, 161, 162, 163, 164, 165, 166, 167, 176, 177, 178, 179, 196, 197, 198, 199, 200, 201, 202, 203, 226, 227, 228, 229, 246, 247, 248, 249, 250, 251, 252, 253, 268, 277, 278, 279, 280, 297, 298, 299, 300, 301, 302, 303, 304, 327, 328, 329, 330, 347, 348, 349, 350, 351, 352, 353, 354, 377, 378, 379, 380, 397, 398, 399, 400, 401, 402, 403, 404, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433]
    df = pandas.read_table(f, usecols=col_indicies)
    listify(df, [
            'Protein IDs',
            'Majority protein IDs',
            'Peptide counts (all)',
            'Peptide counts (razor+unique)',
            'Fasta headers',
            'Peptide is razor',
            'Mod. peptide IDs',
            'Evidence IDs',
            'MS/MS IDs',
            'Oxidation (M) site IDs',
            'Phospho (STY) site IDs',
            'Oxidation (M) site positions',
            'Phospho (STY) site positions'
        ], delim=';')
    # TODO: fix extra list columns
    return df

In [74]:
protein_groups = parse_protein_groups(protein_groups_file)
print(protein_groups.dtypes)
protein_groups

Protein IDs                              object
Majority protein IDs                     object
Peptide counts (all)                     object
Peptide counts (razor+unique)            object
Peptide counts (unique)                  object
Fasta headers                            object
Number of proteins                        int64
Peptides                                  int64
Razor + unique peptides                   int64
Unique peptides                           int64
Peptides Control_Ub                       int64
Peptides Control_UbP                      int64
Peptides Control_WCL                      int64
Peptides Control_WCLP                     int64
Peptides Pynd_5FC_Ub                      int64
Peptides Pynd_5FC_UbP                     int64
Peptides Pynd_5FC_WCL                     int64
Peptides Pynd_5FC_WCLP                    int64
Peptides Pynd_AlkKO_Ub                    int64
Peptides Pynd_AlkKO_UbP                   int64
Peptides Pynd_AlkKO_WCL                 

  data = self._reader.read(nrows)


Unnamed: 0,Protein IDs,Majority protein IDs,Peptide counts (all),Peptide counts (razor+unique),Peptide counts (unique),Fasta headers,Number of proteins,Peptides,Razor + unique peptides,Unique peptides,...,Peptide IDs,Peptide is razor,Mod. peptide IDs,Evidence IDs,MS/MS IDs,Best MS/MS,Oxidation (M) site IDs,Phospho (STY) site IDs,Oxidation (M) site positions,Phospho (STY) site positions
0,CON__A2AB72,CON__A2AB72,1,1,1,>A2AB72 TREMBL:A2AB72 Tax_Id=10090 Gene_Symbol...,1,1,1,1,...,16687,True,18612,258339,135445,135445,,0;1;7687;9993,,305;306;307;313
1,CON__ENSEMBL:ENSBTAP00000023402,CON__ENSEMBL:ENSBTAP00000023402,1,1,1,>ENSEMBL:ENSBTAP00000023402 (Bos taurus) 46 kD...,1,1,1,1,...,24636,True,27875,385721;385722;385723,204113,204113,,2;3;4;9994,,295;297;301;306
2,CON__P00761,CON__P00761,8,8,7,>P00761 SWISS-PROT:P00761|TRYP_PIG Trypsin - S...,1,8,8,7,...,8497;8498;9030;11741;11742;13051;19953;22927,True;True;True;True;True;True;True;True,9339;9340;9341;9342;9918;13001;13002;14440;225...,132159;132160;132161;132162;132163;132164;1321...,69949;69950;69951;69952;69953;69954;69955;6995...,69957;69979;74399;97924;97927;108901;162055;18...,0,,94,
3,CON__P02533;CON__A2A4G1;CON__P08779;CON__P0872...,CON__P02533;CON__A2A4G1;CON__P08779,3;2;2;1;1;1;1;1;1;1,1;0;0;1;1;0;1;0;0;0,1;0;0;1;1;0;1;0;0;0,>P02533 SWISS-PROT:P02533 Tax_Id=9606 Gene_Sym...,10,3,1,1,...,14059;24240;24241,True;False;False,15657;27438;27439;27440,221849;221850;221851;221852;221853;221854;2218...,117197;200285;200286;200287;200288;200289;200290,117197;200288;200290,1,,119,
4,CON__P02662,CON__P02662,1,1,1,>P02662 SWISS-PROT:P02662 Alpha-S1-casein - Bo...,1,1,1,1,...,25121,True,28412,394578;394579;394580;394581;394582;394583;3945...,208746;208747;208748;208749;208750;208751;208752,208746,,5,,115
5,CON__P02663,CON__P02663,1,1,1,>P02663 SWISS-PROT:P02663 Alpha-S2-casein [Con...,1,1,1,1,...,22607,True,25637,349807;349808;349809,184665,184665,2,6;7688,141,143;144
6,CON__P02666,CON__P02666,2,2,2,>P02666 SWISS-PROT:P02666 Beta-casein - Bos ta...,1,2,2,2,...,3996;5453,True;True,4412;5977,62185;62186;62187;62188;62189;62190;62191;6219...,32906;43265;43266;43267;43268;43269;43270;43271,32906;43270,,7,,35
7,CON__P02769,CON__P02769,25,25,21,>P02769 SWISS-PROT:P02769 (Bos taurus) Bovine ...,1,25,25,21,...,326;1994;2150;2168;3266;3401;4669;5189;7480;75...,True;True;True;True;True;True;True;True;True;T...,369;2251;2414;2432;3613;3764;5149;5701;8205;82...,6087;36083;37979;37980;38315;52390;52391;54187...,3472;19615;20730;20731;20901;28146;28147;28148...,3472;19615;20730;20901;28146;29030;37248;41421...,,,,
8,CON__P04264;CON__Q9R0H5;CON__Q6NXH9;CON__Q8BGZ...,CON__P04264,27;1;1;1;1,27;1;1;1;1,19;0;0;0;0,>P04264 SWISS-PROT:P04264 Tax_Id=9606 Gene_Sym...,5,27,27,19,...,289;290;5264;5265;6052;6098;6703;6704;7229;815...,True;True;True;True;True;True;True;True;True;T...,331;332;5778;5779;6616;6667;7340;7341;7934;895...,5489;5490;5491;5492;5493;5494;5495;5496;80421;...,3184;3185;3186;41967;41968;41969;41970;41971;4...,3185;3186;41972;41978;47752;48000;52590;52596;...,,8,,399
9,CON__Q3KNV1;CON__P08729,CON__Q3KNV1;CON__P08729,1;1,1;1,1;1,>Q3KNV1 TREMBL:Q3KNV1;Q96GE1 Tax_Id=9606 Gene_...,2,1,1,1,...,13036,True,14422,205598;205599;205600;205601,108786,108786,,9,,38
