In [61]:
import pandas as pd
from pyteomics.fasta import FASTA
from pyteomics import parser
import re

In [62]:
# Build a lookup of modification positions keyed by protein identifier.
# Only the first two spreadsheet columns are read; they are accessed below
# by the names "Protein" and "Position".
# NOTE(review): this is an absolute, machine-specific path — the cell will not
# run elsewhere unless the workbook is placed at exactly this location.
site_file = r"C:\Users\Graham Delafield\code\python\citrullination\Tissue-specific Cit IDs-redundant.xlsx"
df = pd.read_excel(site_file, usecols=range(2))

# protein identifier -> list of positions, in spreadsheet row order
# (repeated rows are kept as repeated entries).
sites = {}
for protein_id, position in zip(df["Protein"], df["Position"]):
    sites.setdefault(protein_id, []).append(position)

In [69]:
proteins = r'.\CitrullinatedProteins.fasta'

# Peptides shorter than this many residues are discarded.
MIN_PEPTIDE_LENGTH = 5
# Number of missed cleavages allowed when digesting.
MISSED_CLEAVAGES = 1

# Cleavage rules handed to pyteomics as regular expressions.
# NOTE(review): the trypsin pattern is taken from `parser.expasy_rules` rather
# than passing the bare name 'trypsin', so the digest does not depend on
# `cleave` resolving rule names itself — confirm against the installed pyteomics.
TRYPSIN_RULE = parser.expasy_rules['trypsin']
GLUC_RULE = r'[DE]'


def _accession_from_header(header):
    """Return the second '|'-delimited field of a FASTA header.

    Raises:
        ValueError: if the header does not match the expected pattern.
    """
    match = re.search(r'(\w*\|)(\w*)(\|[\w\s]*)', header)
    if match is None:
        raise ValueError(
            'Cannot parse an accession from FASTA header: {!r}'.format(header))
    return match.group(2)


def _mark_sites(sequence, positions):
    """Return `sequence` with the residues at the 1-based `positions` lower-cased.

    Repeated positions are harmless (lower-casing is idempotent).

    Raises:
        ValueError: if a position is not a whole number within 1..len(sequence).
            Without this check a position <= 0 would index from the end of the
            string and silently mark the wrong residue.
    """
    residues = list(sequence)
    for position in positions:
        index = int(position) - 1
        if index + 1 != position or not 0 <= index < len(residues):
            raise ValueError(
                'Invalid modification position {!r} for a sequence of length {}'
                .format(position, len(residues)))
        residues[index] = residues[index].lower()
    return ''.join(residues)


def _modified_peptides(sequence, rule):
    """Digest `sequence` with `rule` and keep peptides carrying a marked residue.

    A peptide is kept when it contains at least one lower-case letter and is at
    least MIN_PEPTIDE_LENGTH residues long. The result is sorted because the
    collection returned by `parser.cleave` may be unordered, which would make
    the exported peptide order differ between runs.
    """
    peptides = parser.cleave(sequence, rule, missed_cleavages=MISSED_CLEAVAGES)
    return sorted(
        pep for pep in peptides
        if len(pep) >= MIN_PEPTIDE_LENGTH and re.search(r'[a-z]', pep)
    )


# Parallel lists: entry k of each list describes the k-th protein in the FASTA.
idents = []
tryptic = []
gluc = []

# Use the reader as a context manager so the FASTA file handle is closed.
with FASTA(proteins) as fasta_reader:
    for header, sequence in fasta_reader:
        accession = _accession_from_header(header)
        if accession not in sites:
            raise KeyError(
                'No modification sites listed for accession {!r} (header: {!r})'
                .format(accession, header))

        # Marked residues are lower-case; the regex rules are case-sensitive,
        # so e.g. `[DE]` will not cut at a marked residue.
        # NOTE(review): presumably this is also why trypsin does not cleave at
        # a marked arginine — confirm that this is intended.
        marked_sequence = _mark_sites(sequence, sites[accession])

        idents.append(accession)
        tryptic.append(_modified_peptides(marked_sequence, TRYPSIN_RULE))
        gluc.append(_modified_peptides(marked_sequence, GLUC_RULE))


In [71]:
# One output column per parallel list built by the digestion cell:
# the protein identifier, its tryptic peptides, and its Glu-C peptides.
digestion_columns = {
    'protein': idents,
    'tryptic': tryptic,
    'glu-c': gluc,
}

# Write the table to a workbook next to the notebook, using pandas' default
# sheet name and index column.
with pd.ExcelWriter(r'.\AlternativeDigestion.xlsx') as writer:
    digestion_table = pd.DataFrame(digestion_columns)
    digestion_table.to_excel(writer)