In [3]:
import pandas
import math

In [165]:
import re
class SiteString:
    """
    A peptide sequence with one or more amino acids mapped to a floating-point score
    """
    
    sequence = None
    sites = None # Make a dict later
    
    def __init__(self, string):
        """
        string: A string from the data file like 'AFVNHM(8.97)M(-8.97)SSHSNHPGKR'
        """
        self.sites = {}
        pattern = '\\((-?[\\d.]+)\\)'
        offset = 0
        for match in re.finditer(pattern, string):
            self.sites[match.start() + offset] = float(match.group(1))
            offset -= len(match.group(0))
        self.sequence = re.sub(pattern, '', string)

        
# Test scored site parser
import pytest
test_sites = SiteString('AFVNHM(8.97)MM(-8.97)SSHSNH(1.0)PGKR')
assert 'AFVNHMMMSSHSNHPGKR' == test_sites.sequence
assert {6: 8.97, 8: -8.97, 14: 1.0} == test_sites.sites
test_sites_2 = SiteString('')
assert '' == test_sites_2.sequence
assert {} == test_sites_2.sites


def parse_evidence(file):
    
    def convert(cols, fn):
        for col in cols:
            try:
                df[col] = [fn(s) for s in df[col]]
            except ValueError as e:
                raise ValueError("Failed to convert column %s" % col) from e
    
    def split(cols, delim=';'):
        """Transform column type to list"""
        convert(cols, lambda s: [st.strip() for st in str(s).split(delim)])
    
    def floatify(cols):
        """Transform column type to float"""
        convert(cols, lambda: float(s))
    
    def siteify(cols):
        """Transform column type to SiteString"""
        convert(cols, lambda: SiteString(str(s)))
    
    
    df = pandas.read_table(file, header=0, index_col=0)

    split([
            'Protein Group IDs',
            'MS/MS IDs',
            'Oxidation (M) Site IDs',
            'Phospho (STY) Site IDs',
            'Modifications',
            'Acetyl (Protein N-term)',
            'Proteins',
            'Leading Proteins',
            'Gene Names',
            'Protein Names'
            ])
    split(['Protein Descriptions'], delim='|')
    siteify([
            'Oxidation (M) Score Diffs',
            'Phospho (STY) Score Diffs',
            'Phospho (STY) Probabilities',
            'Oxidation (M) Probabilities'
    ])
    
    # Some contain "Infinity", which float() understands but Pandas doesn't
    floatify(['PEP'])
    
    # Just use an empty list, not a list containing "Unmodified"
    df['Modifications'] = [[] if lst == ['Unmodified'] else lst for lst in df['Modifications']]

    return df
    
df = parse_evidence('evidence.txt')
print(df.dtypes)
print(df.loc[4]) # for example

TypeError: <lambda>() takes 0 positional arguments but 1 was given