# In silico digestion of bovine and camel casein protein sequences using chymosin and plasmin enzymes
The program will recognise any protein AA sequences as fasta files.

__files needed for bovine and camel caseins:__
- Camel casein AA sequences: file “Camel_Casein_Sequences_2024-10-31.fasta”
- Cow casein AA sequences: file “Cow_Casein_Sequences_2024-10-31.fasta”
- Camel alpha and beta AA sequences: file "Camel_Alpha_Beta_Casein_Sequences_2024-10-31.fasta"
- Cow alpha and beta AA sequences: file "Cow_Alpha_Beta_Casein_Sequences_2024-10-31.fasta"
- Camel kappa AA sequences: file "Camel_Kappa_Casein_Sequences_2024-10-31.fasta"
- Cow kappa AA sequences: file "Cow_Kappa_Casein_Sequences_2024-10-31.fasta"

## Install and load libraries 

In [6]:
# install library
# !pip install pyteomics

Defaulting to user installation because normal site-packages is not writeable
Collecting pyteomics
  Obtaining dependency information for pyteomics from https://files.pythonhosted.org/packages/fb/62/b5d706255739553398d3a308d92e2476d5a363ad3ca0598c51bc75cc5054/pyteomics-4.7.5-py3-none-any.whl.metadata
  Downloading pyteomics-4.7.5-py3-none-any.whl.metadata (6.5 kB)
Downloading pyteomics-4.7.5-py3-none-any.whl (238 kB)
   ---------------------------------------- 0.0/239.0 kB ? eta -:--:--
   - -------------------------------------- 10.2/239.0 kB ? eta -:--:--
   ------ -------------------------------- 41.0/239.0 kB 487.6 kB/s eta 0:00:01
   ----------------------------------- ---- 215.0/239.0 kB 1.9 MB/s eta 0:00:01
   ---------------------------------------- 239.0/239.0 kB 1.8 MB/s eta 0:00:00
Installing collected packages: pyteomics
Successfully installed pyteomics-4.7.5


In [13]:
# load libraries
from pyteomics import fasta, parser
import pandas as pd
from IPython.display import display
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)  # Disable column wrapping when using the "display" command
from IPython.display import clear_output

## Perform digestion using trypsin to test code 

In [45]:
### Use this for pre-defined enzymes
# Define the path to your FASTA files
camel_caseins = 'Camel_Casein_Sequences_2024-10-31.fasta'
cow_caseins = 'Cow_Casein_Sequences_2024-10-31.fasta'

# Define the enzyme or cleavage rule (only applicable if enzyme is already listed in package such as trypsin)
enzyme = 'trypsin'  # Other options include 'chymotrypsin', 'lys-c', etc.

# Specify other parameters if needed
missed_cleavages = 2  # Allows for a specified number of missed cleavages

# Amino acid masses (approximate, in Daltons)
aa_masses = {
    'A': 71.08, 'R': 156.19, 'N': 114.11, 'D': 115.09, 'C': 103.15,
    'E': 129.12, 'Q': 128.13, 'G': 57.05, 'H': 137.14, 'I': 113.16,
    'L': 113.16, 'K': 128.17, 'M': 131.19, 'F': 147.18, 'P': 97.12,
    'S': 87.08, 'T': 101.11, 'W': 186.21, 'Y': 163.18, 'V': 99.13
}

# Function to calculate molecular weight, skipping unknown amino acids
def calculate_mw(sequence):
    return sum(aa_masses.get(aa, 0) for aa in sequence)

# Initialize lists to store peptide information for camel and cow caseins
peptide_data_camel = []
peptide_data_cow = []

# Process camel caseins
with fasta.read(camel_caseins) as fasta_sequences:
    for description, sequence in fasta_sequences:
        # Calculate the molecular weight of the full protein
        protein_mw = calculate_mw(sequence)
        
        peptides = parser.cleave(sequence, parser.expasy_rules[enzyme], missed_cleavages=missed_cleavages)
        current_position = 0
        for peptide in peptides:
            peptide_len = len(peptide)
            if peptide_len >= 2:
                start_position = sequence.find(peptide, current_position)
                end_position = start_position + peptide_len - 1
                current_position = start_position + peptide_len
                
                # Calculate peptide MW
                peptide_mw = calculate_mw(peptide)
                
                # Append the data to our list
                peptide_data_camel.append({
                    'protein_name': description,
                    'protein_MW': protein_mw,
                    'enzyme': enzyme,
                    'peptide_seq': peptide,
                    'peptide_MW': peptide_mw,
                    'peptide_len': peptide_len,
                    'peptide_start': start_position,
                    'peptide_end': end_position 
                })

# Process cow caseins
with fasta.read(cow_caseins) as fasta_sequences:
    for description, sequence in fasta_sequences:
        protein_mw = calculate_mw(sequence)
        
        peptides = parser.cleave(sequence, parser.expasy_rules[enzyme], missed_cleavages=missed_cleavages)
        current_position = 0
        for peptide in peptides:
            peptide_len = len(peptide)
            if peptide_len >= 2:
                start_position = sequence.find(peptide, current_position)
                end_position = start_position + peptide_len - 1
                current_position = start_position + peptide_len
                
                peptide_mw = calculate_mw(peptide)
                
                peptide_data_cow.append({
                    'protein_name': description,
                    'protein_MW': protein_mw,
                    'enzyme': enzyme,
                    'peptide_seq': peptide,
                    'peptide_MW': peptide_mw,
                    'peptide_len': peptide_len,
                    'peptide_start': start_position,
                    'peptide_end': end_position                    
                })

# Create DataFrames for camel and cow casein peptides
peptides_camel_trypsin = pd.DataFrame(peptide_data_camel)
peptides_cow_trypsin = pd.DataFrame(peptide_data_cow)

# Display the results
print("peptides_camel_trypsin:")
display(peptides_camel_trypsin)

print("peptides_cow_trypsin:")
display(peptides_cow_trypsin)



peptides_camel_trypsin:


Unnamed: 0,protein_name,protein_MW,enzyme,peptide_seq,peptide_MW,peptide_len,peptide_start,peptide_end
0,sp|O97943|CASA1_CAMDR Alpha-S1-casein OS=Camel...,26843.69,trypsin,INEDNHPQLGEPVKVVTQEQAYFHLEPFPQFFQLGASPYVAWYYPP...,8037.96,71,151,221
1,sp|O97943|CASA1_CAMDR Alpha-S1-casein OS=Camel...,26843.69,trypsin,RKILELAVVSPIQFR,1751.16,15,-1,13
2,sp|O97943|CASA1_CAMDR Alpha-S1-casein OS=Camel...,26843.69,trypsin,YPLR,529.65,4,18,21
3,sp|O97943|CASA1_CAMDR Alpha-S1-casein OS=Camel...,26843.69,trypsin,KILELAVVSPIQFRQENIDELK,2565.03,22,41,62
4,sp|O97943|CASA1_CAMDR Alpha-S1-casein OS=Camel...,26843.69,trypsin,QENIDELKDTRNEPTEDHIMEDTER,3040.24,25,-1,23
...,...,...,...,...,...,...,...,...
465,tr|W0K8B9|W0K8B9_CAMDR Kappa-casein (Fragment)...,17138.01,trypsin,TVKYFPIQFVQSRYPSYGINYYQHR,3137.59,25,-1,23
466,tr|W0K8B9|W0K8B9_CAMDR Kappa-casein (Fragment)...,17138.01,trypsin,CCEKVERLLNEK,1445.75,12,-1,10
467,tr|W0K8B9|W0K8B9_CAMDR Kappa-casein (Fragment)...,17138.01,trypsin,YPSYGINYYQHRLAVPINNQFIPYPNYAKPVAIR,4023.68,34,25,58
468,tr|W0K8B9|W0K8B9_CAMDR Kappa-casein (Fragment)...,17138.01,trypsin,YPSYGINYYQHR,1542.70,12,-1,10


peptides_cow_trypsin:


Unnamed: 0,protein_name,protein_MW,enzyme,peptide_seq,peptide_MW,peptide_len,peptide_start,peptide_end
0,bovine aS1CN A (P02662),21411.28,trypsin,YLGYLEQLLRLKK,1618.99,13,77,89
1,bovine aS1CN A (P02662),21411.28,trypsin,LHSMK,596.74,5,106,110
2,bovine aS1CN A (P02662),21411.28,trypsin,YLGYLEQLLR,1249.49,10,-1,8
3,bovine aS1CN A (P02662),21411.28,trypsin,VPQLEIVPNSAEERLHSMKEGIHAQQK,3051.49,27,92,118
4,bovine aS1CN A (P02662),21411.28,trypsin,HIQKEDVPSERYLGYLEQLLR,2568.94,21,-1,19
...,...,...,...,...,...,...,...,...
1665,bovine proteose peptone component 8-slow (29-105),8618.99,trypsin,IHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMG...,6156.36,57,-1,55
1666,bovine proteose peptone component 8-slow (29-105),8618.99,trypsin,FQSEEQQQTEDELQDK,1964.01,16,-1,14
1667,bovine proteose peptone component 8-slow (29-105),8618.99,trypsin,FQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLT...,7492.61,67,-1,65
1668,bovine proteose peptone component 8-slow (29-105),8618.99,trypsin,KIEK,498.62,4,-1,2


## Perform digestion using plasmin
Plasmin is a proteolytic enzyme that cleaves specifically at the C-terminal side of lysine (K) and arginine (R) residues, similar to trypsin, but it has additional flexibility in its cleavage patterns.
The regular expression for plasmin’s cleavage rule can be defined as follows:
Cleaves after lysine (K) or arginine (R),
- Does not cleave if the following amino acid is proline (P).

Thus, the regular expression for plasmin would be:

_python code:_ 
plasmin_rule = r'[KR](?!P)'

_Explanation:_
[KR] matches either K or R.
(?!P) is a negative lookahead that ensures the next amino acid is not proline (P).

In [15]:
### use this for novel enzymes  Plasmin
# Define the path to your FASTA files
# only alpha and beta caseins
camel_alpha_beta_caseins = 'Camel_Alpha_Beta_Casein_Sequences_2024-10-31.fasta'
cow_alpha_beta_caseins = 'Cow_Alpha_Beta_Casein_Sequences_2024-10-31.fasta'

# Define custom cleavage rule as a regular expression for plasmin
plasmin = r'[KR](?!P)'  # Cleaves after K or R unless followed by P

# Specify other parameters if needed
missed_cleavages = 2  # Allows for a specified number of missed cleavages

# Amino acid masses (approximate, in Daltons)
aa_masses = {
    'A': 71.08, 'R': 156.19, 'N': 114.11, 'D': 115.09, 'C': 103.15,
    'E': 129.12, 'Q': 128.13, 'G': 57.05, 'H': 137.14, 'I': 113.16,
    'L': 113.16, 'K': 128.17, 'M': 131.19, 'F': 147.18, 'P': 97.12,
    'S': 87.08, 'T': 101.11, 'W': 186.21, 'Y': 163.18, 'V': 99.13
}

# Function to perform accurate cleavage with start and end tracking
def accurate_cleavage(sequence, cleavage_rule, enzyme_name, protein_mw, data_storage, protein_name):
    import re
    # Initialize start position
    current_position = 0
    # Find all cleavage positions according to the rule
    for match in re.finditer(cleavage_rule, sequence):
        # Define end of the peptide based on the match
        end_position = match.end()
        peptide = sequence[current_position:end_position]
        peptide_len = len(peptide)
        
        # Only consider peptides with at least 4 amino acids
        if peptide_len >= 2:
            peptide_mw = calculate_mw(peptide)
            # Append peptide data
            data_storage.append({
                'protein_name': protein_name,
                'protein_seq': sequence,
                'protein_MW': protein_mw,
                'enzyme': enzyme_name,
                'peptide_seq': peptide,
                'peptide_MW': peptide_mw,
                'peptide_len': peptide_len,
                'peptide_start': current_position,
                'peptide_end': end_position - 1  # End is inclusive
            })
        
        # Move to the next position after the cleavage site
        current_position = end_position

    # Capture the last peptide if it extends to the end of the sequence
    if current_position < len(sequence):
        peptide = sequence[current_position:]
        peptide_len = len(peptide)
        if peptide_len >= 2:
            peptide_mw = calculate_mw(peptide)
            data_storage.append({
                'protein_name': protein_name,
                'protein_seq': sequence,
                'protein_MW': protein_mw,
                'enzyme': enzyme_name,
                'peptide_seq': peptide,
                'peptide_MW': peptide_mw,
                'peptide_len': peptide_len,
                'peptide_start': current_position,
                'peptide_end': len(sequence) - 1
            })

# Plasmin enzyme processing
# Initialize lists to store peptide information for camel and cow caseins
peptide_data_camel = []
peptide_data_cow = []

# Process camel caseins with plasmin
with fasta.read(camel_alpha_beta_caseins) as fasta_sequences:
    for description, sequence in fasta_sequences:
        protein_mw = calculate_mw(sequence)
        accurate_cleavage(sequence, plasmin, 'plasmin', protein_mw, peptide_data_camel, description)

# Process cow caseins with plasmin
with fasta.read(cow_alpha_beta_caseins) as fasta_sequences:
    for description, sequence in fasta_sequences:
        protein_mw = calculate_mw(sequence)
        accurate_cleavage(sequence, plasmin, 'plasmin', protein_mw, peptide_data_cow, description)

# Create DataFrames for camel and cow casein peptides with plasmin
peptides_camel_plasmin = pd.DataFrame(peptide_data_camel)
peptides_cow_plasmin = pd.DataFrame(peptide_data_cow)

# Display the results for plasmin
print("peptides_camel_plasmin:")
display(peptides_camel_plasmin)

print("peptides_cow_plasmin:")
display(peptides_cow_plasmin)


peptides_camel_plasmin:


Unnamed: 0,protein_name,protein_seq,protein_MW,enzyme,peptide_seq,peptide_MW,peptide_len,peptide_start,peptide_end
0,sp|O97943|CASA1_CAMDR Alpha-S1-casein OS=Camel...,MKLLILTCLVAVALARPKYPLRYPEVFQNEPDSIEEVLNKRKILEL...,26843.69,plasmin,MK,259.36,2,0,1
1,sp|O97943|CASA1_CAMDR Alpha-S1-casein OS=Camel...,MKLLILTCLVAVALARPKYPLRYPEVFQNEPDSIEEVLNKRKILEL...,26843.69,plasmin,LLILTCLVAVALARPK,1676.20,16,2,17
2,sp|O97943|CASA1_CAMDR Alpha-S1-casein OS=Camel...,MKLLILTCLVAVALARPKYPLRYPEVFQNEPDSIEEVLNKRKILEL...,26843.69,plasmin,YPLR,529.65,4,18,21
3,sp|O97943|CASA1_CAMDR Alpha-S1-casein OS=Camel...,MKLLILTCLVAVALARPKYPLRYPEVFQNEPDSIEEVLNKRKILEL...,26843.69,plasmin,YPEVFQNEPDSIEEVLNK,2132.35,18,22,39
4,sp|O97943|CASA1_CAMDR Alpha-S1-casein OS=Camel...,MKLLILTCLVAVALARPKYPLRYPEVFQNEPDSIEEVLNKRKILEL...,26843.69,plasmin,ILELAVVSPIQFR,1466.80,13,42,54
...,...,...,...,...,...,...,...,...,...
123,tr|A0A5N4BX35|A0A5N4BX35_CAMDR Beta-casein (Fr...,DLRVMKVLILACLVALALAREKEEFKTAGEALESISSSEESITHIN...,29087.54,plasmin,VLPVPQQMVPYPQR,1633.98,14,189,202
124,tr|A0A5N4BX35|A0A5N4BX35_CAMDR Beta-casein (Fr...,DLRVMKVLILACLVALALAREKEEFKTAGEALESISSSEESITHIN...,29087.54,plasmin,AMPVQAVLPFQEPVPDPVR,2072.47,19,203,221
125,tr|A0A5N4BX35|A0A5N4BX35_CAMDR Beta-casein (Fr...,DLRVMKVLILACLVALALAREKEEFKTAGEALESISSSEESITHIN...,29087.54,plasmin,GLHPVPQPLVPVIVSADLLTVLFHSR,2787.36,26,222,247
126,tr|A0A5N4BX35|A0A5N4BX35_CAMDR Beta-casein (Fr...,DLRVMKVLILACLVALALAREKEEFKTAGEALESISSSEESITHIN...,29087.54,plasmin,CVYDVR,735.87,6,248,253


peptides_cow_plasmin:


Unnamed: 0,protein_name,protein_seq,protein_MW,enzyme,peptide_seq,peptide_MW,peptide_len,peptide_start,peptide_end
0,bovine aS1CN A (P02662),RPKHPIKHQGLPQPFPEVFGKEKVNELSKDIGSESTEDQAMEDIKQ...,21411.28,plasmin,RPK,381.48,3,0,2
1,bovine aS1CN A (P02662),RPKHPIKHQGLPQPFPEVFGKEKVNELSKDIGSESTEDQAMEDIKQ...,21411.28,plasmin,HPIK,475.59,4,3,6
2,bovine aS1CN A (P02662),RPKHPIKHQGLPQPFPEVFGKEKVNELSKDIGSESTEDQAMEDIKQ...,21411.28,plasmin,HQGLPQPFPEVFGK,1562.80,14,7,20
3,bovine aS1CN A (P02662),RPKHPIKHQGLPQPFPEVFGKEKVNELSKDIGSESTEDQAMEDIKQ...,21411.28,plasmin,EK,257.29,2,21,22
4,bovine aS1CN A (P02662),RPKHPIKHQGLPQPFPEVFGKEKVNELSKDIGSESTEDQAMEDIKQ...,21411.28,plasmin,VNELSK,670.77,6,23,28
...,...,...,...,...,...,...,...,...,...
416,bovine bCN I (P02666),RELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQ...,23547.49,plasmin,YPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPP...,6344.38,56,113,168
417,bovine bCN I (P02666),RELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQ...,23547.49,plasmin,VLPVPQK,761.96,7,169,175
418,bovine bCN I (P02666),RELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQ...,23547.49,plasmin,AVPYPQR,811.95,7,176,182
419,bovine bCN I (P02666),RELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQ...,23547.49,plasmin,DMPIQAFLLYQEPVLGPVR,2168.60,19,183,201


## Perform digestion using chymosin:
Chymosin, also known as rennin, is a protease that cleaves specifically at the C-terminal side of phenylalanine (F), leucine (L), and tyrosine (Y) residues. However, it has a preference for cleaving after phenylalanine in the Phe105-Met106 bond in κ-casein, which is a unique property but not always generalizable to all proteins.
The regular expression for chymosin’s cleavage rule can be defined as follows:
- Cleaves after phenylalanine (F), leucine (L), or tyrosine (Y).

Thus, the regular expression for chymosin would be:

_python code:_
chymosin_rule = r'[FLY]'

_Explanation:_
[FLY] matches either F, L, or Y as the cleavage sites.

In [17]:
### use this for novel enzymes Chymosin
# Define the path to your FASTA files
# only kappa caseins
camel_kappa_caseins = 'Camel_Kappa_Casein_Sequences_2024-10-31.fasta'
cow_kappa_caseins = 'Cow_Kappa_Casein_Sequences_2024-10-31.fasta'

# Define custom cleavage rules as regular expressions for chymosin
chymosin = r'[FLY]'  # Cleaves after F, L, or Y

# Initialize lists to store peptide information for camel and cow caseins
peptide_data_camel = []
peptide_data_cow = []

# Function to perform accurate cleavage with start and end tracking
def accurate_cleavage(sequence, cleavage_rule, enzyme_name, protein_mw, data_storage, protein_name):
    import re
    # Initialize start position
    current_position = 0
    # Find all cleavage positions according to the rule
    for match in re.finditer(cleavage_rule, sequence):
        # Define end of the peptide based on the match
        end_position = match.end()
        peptide = sequence[current_position:end_position]
        peptide_len = len(peptide)
        
        # Only consider peptides with at least 4 amino acids
        if peptide_len >= 2:
            peptide_mw = calculate_mw(peptide)
            # Append peptide data
            data_storage.append({
                'protein_name': protein_name,
                'protein_seq': sequence,
                'protein_MW': protein_mw,
                'enzyme': enzyme_name,
                'peptide_seq': peptide,
                'peptide_MW': peptide_mw,
                'peptide_len': peptide_len,
                'peptide_start': current_position,
                'peptide_end': end_position - 1  # End is inclusive
            })
        
        # Move to the next position after the cleavage site
        current_position = end_position

    # Capture the last peptide if it extends to the end of the sequence
    if current_position < len(sequence):
        peptide = sequence[current_position:]
        peptide_len = len(peptide)
        if peptide_len >= 2:
            peptide_mw = calculate_mw(peptide)
            data_storage.append({
                'protein_name': protein_name,
                'protein_seq': sequence,
                'protein_MW': protein_mw,
                'enzyme': enzyme_name,
                'peptide_seq': peptide,
                'peptide_MW': peptide_mw,
                'peptide_len': peptide_len,
                'peptide_start': current_position,
                'peptide_end': len(sequence) - 1
            })

# chymosin_specific enzyme processing for camel caseins (Phe97-Ile98)
with fasta.read(camel_kappa_caseins) as fasta_sequences:
    for description, sequence in fasta_sequences:
        protein_mw = calculate_mw(sequence)
        accurate_cleavage(sequence, chymosin, 'chymosin', protein_mw, peptide_data_camel, description)

# chymosin_specific enzyme processing for cow caseins (Phe105-Met106)
with fasta.read(cow_kappa_caseins) as fasta_sequences:
    for description, sequence in fasta_sequences:
        protein_mw = calculate_mw(sequence)
        accurate_cleavage(sequence, chymosin, 'chymosin', protein_mw, peptide_data_cow, description)

# Create DataFrames for camel and cow casein peptides with chymosin_specific
peptides_camel_chymosin = pd.DataFrame(peptide_data_camel)
peptides_cow_chymosin = pd.DataFrame(peptide_data_cow)

# Display the results for chymosin_specific
print("peptides_camel_chymosin:")
display(peptides_camel_chymosin)

print("peptides_cow_chymosin:")
display(peptides_cow_chymosin)

peptides_camel_chymosin:


Unnamed: 0,protein_name,protein_seq,protein_MW,enzyme,peptide_seq,peptide_MW,peptide_len,peptide_start,peptide_end
0,sp|P79139|CASK_CAMDR Kappa-casein OS=Camelus d...,MKSFFLVVTILALTLPFLGAEVQNQEQPTCFEKVERLLNEKTVKYF...,20399.89,chymosin,MKSF,493.62,4,0,3
1,sp|P79139|CASK_CAMDR Kappa-casein OS=Camelus d...,MKSFFLVVTILALTLPFLGAEVQNQEQPTCFEKVERLLNEKTVKYF...,20399.89,chymosin,VVTIL,525.69,5,6,10
2,sp|P79139|CASK_CAMDR Kappa-casein OS=Camelus d...,MKSFFLVVTILALTLPFLGAEVQNQEQPTCFEKVERLLNEKTVKYF...,20399.89,chymosin,AL,184.24,2,11,12
3,sp|P79139|CASK_CAMDR Kappa-casein OS=Camelus d...,MKSFFLVVTILALTLPFLGAEVQNQEQPTCFEKVERLLNEKTVKYF...,20399.89,chymosin,TL,214.27,2,13,14
4,sp|P79139|CASK_CAMDR Kappa-casein OS=Camelus d...,MKSFFLVVTILALTLPFLGAEVQNQEQPTCFEKVERLLNEKTVKYF...,20399.89,chymosin,PF,244.30,2,15,16
...,...,...,...,...,...,...,...,...,...
59,tr|W0K8B9|W0K8B9_CAMDR Kappa-casein (Fragment)...,CCEKVERLLNEKTVKYFPIQFVQSRYPSYGINYYQHRLAVPINNQF...,17138.01,chymosin,AKPVAIRL,849.09,8,52,59
60,tr|W0K8B9|W0K8B9_CAMDR Kappa-casein (Fragment)...,CCEKVERLLNEKTVKYFPIQFVQSRYPSYGINYYQHRLAVPINNQF...,17138.01,chymosin,HAQIPQCQAL,1090.28,10,60,69
61,tr|W0K8B9|W0K8B9_CAMDR Kappa-casein (Fragment)...,CCEKVERLLNEKTVKYFPIQFVQSRYPSYGINYYQHRLAVPINNQF...,17138.01,chymosin,PNIDPPTVERRPRPRPSF,2113.46,18,70,87
62,tr|W0K8B9|W0K8B9_CAMDR Kappa-casein (Fragment)...,CCEKVERLLNEKTVKYFPIQFVQSRYPSYGINYYQHRLAVPINNQF...,17138.01,chymosin,IAIPPKKTQDKTVNPAINTVATVEPPVIPTAEPAVNTVVIAEASSEF,4851.65,47,88,134


peptides_cow_chymosin:


Unnamed: 0,protein_name,protein_seq,protein_MW,enzyme,peptide_seq,peptide_MW,peptide_len,peptide_start,peptide_end
0,bovine kCN A (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18956.70,chymosin,QEQNQEQPIRCEKDERF,2159.36,17,0,16
1,bovine kCN A (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18956.70,chymosin,SDKIAKY,805.93,7,18,24
2,bovine kCN A (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18956.70,chymosin,IPIQY,614.75,5,25,29
3,bovine kCN A (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18956.70,chymosin,VL,212.29,2,30,31
4,bovine kCN A (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18956.70,chymosin,SRY,406.45,3,32,34
...,...,...,...,...,...,...,...,...,...
211,bovine kCN J (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18993.85,chymosin,QWQVL,654.76,5,74,78
212,bovine kCN J (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18993.85,chymosin,SNTVPAKSCQAQPTTMARHPHPHL,2591.99,24,79,102
213,bovine kCN J (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18993.85,chymosin,SF,234.26,2,103,104
214,bovine kCN J (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18993.85,chymosin,MAIPPKKNQDKTEIPTINTIASGEPTSTPTIEAVESTVATL,4277.93,41,105,145


## Perform digestion using chymosin and very specific rules
- Phe105-Met106 bond in cow - kappa-casein
- Phe97-Ile98 bond in camel - kappa-casein


In [19]:
# Using very specific rule for chymosin:
# Define the path to your FASTA files
# only kappa caseins
camel_kappa_caseins = 'Camel_Kappa_Casein_Sequences_2024-10-31.fasta'
cow_kappa_caseins = 'Cow_Kappa_Casein_Sequences_2024-10-31.fasta'

# Define custom cleavage rules as regular expressions for chymosin_specific
# Phe105-Met106 for cow caseins
chymosin_specific_cow = r'F(?=M)'
# Phe97-Ile98 for camel caseins
chymosin_specific_camel = r'F(?=I)'

# Initialize lists to store peptide information for camel and cow caseins
peptide_data_camel = []
peptide_data_cow = []

# Function to perform accurate cleavage with start and end tracking
def accurate_cleavage(sequence, cleavage_rule, enzyme_name, protein_mw, data_storage, protein_name):
    import re
    # Initialize start position
    current_position = 0
    # Find all cleavage positions according to the rule
    for match in re.finditer(cleavage_rule, sequence):
        # Define end of the peptide based on the match
        end_position = match.end()
        peptide = sequence[current_position:end_position]
        peptide_len = len(peptide)
        
        # Only consider peptides with at least 4 amino acids
        if peptide_len >= 2:
            peptide_mw = calculate_mw(peptide)
            # Append peptide data
            data_storage.append({
                'protein_name': protein_name,
                'protein_seq': sequence,
                'protein_MW': protein_mw,
                'enzyme': enzyme_name,
                'peptide_seq': peptide,
                'peptide_MW': peptide_mw,
                'peptide_len': peptide_len,
                'peptide_start': current_position,
                'peptide_end': end_position - 1  # End is inclusive
            })
        
        # Move to the next position after the cleavage site
        current_position = end_position

    # Capture the last peptide if it extends to the end of the sequence
    if current_position < len(sequence):
        peptide = sequence[current_position:]
        peptide_len = len(peptide)
        if peptide_len >= 2:
            peptide_mw = calculate_mw(peptide)
            data_storage.append({
                'protein_name': protein_name,
                'protein_seq': sequence,
                'protein_MW': protein_mw,
                'enzyme': enzyme_name,
                'peptide_seq': peptide,
                'peptide_MW': peptide_mw,
                'peptide_len': peptide_len,
                'peptide_start': current_position,
                'peptide_end': len(sequence) - 1
            })

# chymosin_specific enzyme processing for camel caseins (Phe97-Ile98)
with fasta.read(camel_kappa_caseins) as fasta_sequences:
    for description, sequence in fasta_sequences:
        protein_mw = calculate_mw(sequence)
        accurate_cleavage(sequence, chymosin_specific_camel, 'chymosin_specific', protein_mw, peptide_data_camel, description)

# chymosin_specific enzyme processing for cow caseins (Phe105-Met106)
with fasta.read(cow_kappa_caseins) as fasta_sequences:
    for description, sequence in fasta_sequences:
        protein_mw = calculate_mw(sequence)
        accurate_cleavage(sequence, chymosin_specific_cow, 'chymosin_specific', protein_mw, peptide_data_cow, description)

# Create DataFrames for camel and cow casein peptides with chymosin_specific
peptides_camel_chymosin_specific = pd.DataFrame(peptide_data_camel)
peptides_cow_chymosin_specific = pd.DataFrame(peptide_data_cow)

# Display the results for chymosin_specific
print("peptides_camel_chymosin_specific:")
display(peptides_camel_chymosin_specific)

print("peptides_cow_chymosin_specific:")
display(peptides_cow_chymosin_specific)


peptides_camel_chymosin_specific:


Unnamed: 0,protein_name,protein_seq,protein_MW,enzyme,peptide_seq,peptide_MW,peptide_len,peptide_start,peptide_end
0,sp|P79139|CASK_CAMDR Kappa-casein OS=Camelus d...,MKSFFLVVTILALTLPFLGAEVQNQEQPTCFEKVERLLNEKTVKYF...,20399.89,chymosin_specific,MKSFFLVVTILALTLPFLGAEVQNQEQPTCFEKVERLLNEKTVKYF...,8842.4,75,0,74
1,sp|P79139|CASK_CAMDR Kappa-casein OS=Camelus d...,MKSFFLVVTILALTLPFLGAEVQNQEQPTCFEKVERLLNEKTVKYF...,20399.89,chymosin_specific,IPYPNYAKPVAIRLHAQIPQCQALPNIDPPTVERRPRPRPSF,4800.7,42,75,116
2,sp|P79139|CASK_CAMDR Kappa-casein OS=Camelus d...,MKSFFLVVTILALTLPFLGAEVQNQEQPTCFEKVERLLNEKTVKYF...,20399.89,chymosin_specific,IAIPPKKTQDKTVNPAINTVATVEPPVIPTAEPAVNTVVIAEASSEF,4851.65,47,117,163
3,sp|P79139|CASK_CAMDR Kappa-casein OS=Camelus d...,MKSFFLVVTILALTLPFLGAEVQNQEQPTCFEKVERLLNEKTVKYF...,20399.89,chymosin_specific,ITTSTPETTTVQITSTEI,1905.14,18,164,181
4,tr|A0A5N4EEL7|A0A5N4EEL7_CAMDR Kappa-casein OS...,MRMTTKRTKLSSIQFVLIDSCKYLNLAKDFIAKVKSQLTFFSPEKC...,24700.35,chymosin_specific,MRMTTKRTKLSSIQFVLIDSCKYLNLAKDF,3534.29,30,0,29
5,tr|A0A5N4EEL7|A0A5N4EEL7_CAMDR Kappa-casein OS...,MRMTTKRTKLSSIQFVLIDSCKYLNLAKDFIAKVKSQLTFFSPEKC...,24700.35,chymosin_specific,IAKVKSQLTFFSPEKCAIMKSFFLVVTILALTLPFLCCEKVERLLN...,9608.57,82,30,111
6,tr|A0A5N4EEL7|A0A5N4EEL7_CAMDR Kappa-casein OS...,MRMTTKRTKLSSIQFVLIDSCKYLNLAKDFIAKVKSQLTFFSPEKC...,24700.35,chymosin_specific,IPYPNYAKPVAIRLHAQIPQCQALPNIDPPTVERRPRPRPSF,4800.7,42,112,153
7,tr|A0A5N4EEL7|A0A5N4EEL7_CAMDR Kappa-casein OS...,MRMTTKRTKLSSIQFVLIDSCKYLNLAKDFIAKVKSQLTFFSPEKC...,24700.35,chymosin_specific,IAIPPKKTQDKTVNPAINTVATVEPPVIPTAEPAVNTVVIAEASSEF,4851.65,47,154,200
8,tr|A0A5N4EEL7|A0A5N4EEL7_CAMDR Kappa-casein OS...,MRMTTKRTKLSSIQFVLIDSCKYLNLAKDFIAKVKSQLTFFSPEKC...,24700.35,chymosin_specific,ITTSTPETTTVQITSTEI,1905.14,18,201,218
9,tr|W0K8B9|W0K8B9_CAMDR Kappa-casein (Fragment)...,CCEKVERLLNEKTVKYFPIQFVQSRYPSYGINYYQHRLAVPINNQF...,17138.01,chymosin_specific,CCEKVERLLNEKTVKYFPIQFVQSRYPSYGINYYQHRLAVPINNQF,5580.52,46,0,45


peptides_cow_chymosin_specific:


Unnamed: 0,protein_name,protein_seq,protein_MW,enzyme,peptide_seq,peptide_MW,peptide_len,peptide_start,peptide_end
0,bovine kCN A (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18956.7,chymosin_specific,QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,12267.16,105,0,104
1,bovine kCN A (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18956.7,chymosin_specific,MAIPPKKNQDKTEIPTINTIASGEPTSTPTTEAVESTVATLEDSPE...,6689.54,64,105,168
2,bovine kCN B (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18924.74,chymosin_specific,QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,12267.16,105,0,104
3,bovine kCN B (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18924.74,chymosin_specific,MAIPPKKNQDKTEIPTINTIASGEPTSTPTIEAVESTVATLEASPE...,6657.58,64,105,168
4,bovine kCN B2 (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18912.69,chymosin_specific,QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,12267.16,105,0,104
5,bovine kCN B2 (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18912.69,chymosin_specific,MAIPPKKNQDKTEIPTINTIASGEPTSTPTIEAVESTVATLEASPE...,6645.53,64,105,168
6,bovine kCN C (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18937.65,chymosin_specific,QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,12248.11,105,0,104
7,bovine kCN C (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18937.65,chymosin_specific,MAIPPKKNQDKTEIPTINTIASGEPTSTPTTEAVESTVATLEDSPE...,6689.54,64,105,168
8,bovine kCN E (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18926.67,chymosin_specific,QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,12267.16,105,0,104
9,bovine kCN E (P02668),QEQNQEQPIRCEKDERFFSDKIAKYIPIQYVLSRYPSYGLNYYQQK...,18926.67,chymosin_specific,MAIPPKKNQDKTEIPTINTIASGEPTSTPTTEAVESTVATLEDSPE...,6659.51,64,105,168


## Export the outputs as csv files

In [21]:
# exporting the data
peptides_camel_plasmin.to_csv("alpha_beta_casein_peptides_plasmin_camel.csv", sep=',')
peptides_cow_plasmin.to_csv("alpha_beta_casein_peptides_plasmin_cow.csv", sep=',')
peptides_camel_chymosin.to_csv("kappa_casein_peptides_chymosin_camel.csv", sep=',')
peptides_cow_chymosin.to_csv("kappa_casein_peptides_chymosin_cow.csv", sep=',')
peptides_camel_chymosin_specific.to_csv("kappa_casein_peptides_chymosin_specific_camel.csv", sep=',')
peptides_cow_chymosin_specific.to_csv("kappa_casein_peptides_chymosin_specific_cow.csv", sep=',')