In [3]:
import pandas
import math

In [161]:
import re
class SiteString:
    """
    A peptide sequence with one or more amino acids mapped to a floating-point score

    Attributes:
        sequence: The input string with every parenthesised score removed.
        sites: Dict mapping a position in `sequence` to its score. A position
               is the number of sequence characters that precede the score,
               i.e. the 1-based index of the residue the score follows.
    """

    # Class-level defaults; __init__ always replaces both on the instance.
    sequence = None
    sites = None

    # A parenthesised, optionally negative run of digits and dots, e.g. "(8.97)"
    # or "(-8.97)". Compiled once so that every instance reuses it.
    _SCORE_PATTERN = re.compile(r'\((-?[\d.]+)\)')

    def __init__(self, string):
        """
        string: A string from the data file like 'AFVNHM(8.97)M(-8.97)SSHSNHPGKR'

        Raises ValueError if a parenthesised token matches the pattern but is
        not a valid float (for example '(1.2.3)').
        """
        self.sites = {}
        # Characters of annotation text seen so far; subtracting them converts
        # an index in the annotated string into an index in the bare sequence.
        removed = 0
        for match in self._SCORE_PATTERN.finditer(string):
            self.sites[match.start() - removed] = float(match.group(1))
            removed += len(match.group(0))
        self.sequence = self._SCORE_PATTERN.sub('', string)

    def __repr__(self):
        """Readable form so printed rows and containers show the parsed content."""
        return "%s(sequence=%r, sites=%r)" % (
            type(self).__name__, self.sequence, self.sites)

        
# Test scored site parser
import pytest
# Three annotated residues: each key is the number of sequence characters
# that come before the parenthesised score.
test_sites = SiteString('AFVNHM(8.97)MM(-8.97)SSHSNH(1.0)PGKR')
assert test_sites.sequence == 'AFVNHMMMSSHSNHPGKR'
assert test_sites.sites == {6: 8.97, 8: -8.97, 14: 1.0}
# Degenerate input: an empty string gives an empty sequence and no sites.
test_sites_2 = SiteString('')
assert test_sites_2.sequence == ''
assert test_sites_2.sites == {}


def parse_evidence(file):
    """
    Read a tab-delimited evidence table and unpack its text-encoded columns.

    file: Path or file-like object handed straight to pandas.read_table. The
          first row is used as the header and the first column as the index.

    Returns the DataFrame with these columns converted in place:
      * the multi-valued columns become lists of stripped strings
        (';'-separated, except 'Protein Descriptions' which is '|'-separated)
      * the score-diff and probability columns become SiteString objects
      * 'PEP' becomes Python floats
      * 'Modifications' becomes [] where its only entry was 'Unmodified'

    Missing cells (NaN/None) become an empty list or an empty SiteString
    instead of being stringified into the text 'nan'.

    Raises ValueError naming the column when a conversion fails.
    """

    def split(cols, delim=';'):
        # Turn each cell into a list of trimmed tokens; a missing cell has no
        # tokens at all, so it maps to [] rather than ['nan'].
        for col in cols:
            try:
                df[col] = [
                    [] if pandas.isnull(s)
                    else [st.strip() for st in str(s).split(delim)]
                    for s in df[col]
                ]
            except ValueError as e:
                raise ValueError("Failed to convert column %s to list" % col) from e

    def floatify(cols):
        # Convert with the builtin float() one cell at a time.
        for col in cols:
            try:
                df[col] = [float(s) for s in df[col]]
            except ValueError as e:
                raise ValueError("Failed to convert column %s to a float" % col) from e

    def siteify(cols):
        # Parse each cell as an annotated sequence; a missing cell is parsed
        # as the empty string so its sequence is '' instead of 'nan'.
        for col in cols:
            try:
                df[col] = [
                    SiteString('' if pandas.isnull(s) else str(s))
                    for s in df[col]
                ]
            except ValueError as e:
                raise ValueError("Failed to convert column %s to a site string" % col) from e

    def simplify(lst):
        # A lone 'Unmodified' marker means "no modifications".
        if lst == ['Unmodified']:
            return []
        return lst

    df = pandas.read_table(file, header=0, index_col=0)
    split([
            'Protein Group IDs',
            'MS/MS IDs',
            'Oxidation (M) Site IDs',
            'Phospho (STY) Site IDs',
            'Modifications',
            'Acetyl (Protein N-term)',
            'Proteins',
            'Leading Proteins',
            'Gene Names',
            'Protein Names'
            ])
    split(['Protein Descriptions'], delim='|')
    siteify([
            'Oxidation (M) Score Diffs',
            'Phospho (STY) Score Diffs',
            'Phospho (STY) Probabilities',
            'Oxidation (M) Probabilities'
    ])
    floatify(['PEP']) # Some contain "Infinity", which float() understands but Pandas doesn't

    df['Modifications'] = [simplify(s) for s in df['Modifications']]
    return df
    
# Load and convert the evidence table stored in 'evidence.txt'.
df = parse_evidence('evidence.txt')
# Show the dtype of every column after conversion.
print(df.dtypes)
print(df.loc[4]) # for example: the row whose index label is 4

Protein Group IDs                       object
Peptide ID                               int64
Mod. Peptide ID                          int64
MS/MS IDs                               object
AIF MS/MS IDs                          float64
Oxidation (M) Site IDs                  object
Phospho (STY) Site IDs                  object
Sequence                                object
Length                                   int64
Modifications                           object
Modified Sequence                       object
Oxidation (M) Probabilities             object
Phospho (STY) Probabilities             object
Oxidation (M) Score Diffs               object
Phospho (STY) Score Diffs               object
Acetyl (Protein N-term)                 object
Oxidation (M)                            int64
Phospho (STY)                            int64
Proteins                                object
Leading Proteins                        object
Leading Razor Protein                   object
Gene Names   