In [6]:
import os
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

## Training

In [3]:
file_in  = os.path.join('..', 'data', 'MLCPP2_Training.fasta')
file_out = os.path.join('..', 'data', 'MLCPP2_Training_equal_length.fasta')

# Rewrite every training record as its first residue followed by its last
# five residues and save the result as a new FASTA file.
# Both handles are owned by the `with` statement so the input file is closed
# too; the input is opened first so that a missing input file does not leave
# a truncated output file behind.
with open(file_in, mode='r') as f_in, open(file_out, 'w') as f_out:
    for seq_record in SeqIO.parse(f_in, 'fasta'):
        # NOTE: a sequence with fewer than 5 residues yields a shorter result,
        # and an empty sequence fails on `seq[0]`.
        seq_record.seq = seq_record.seq[0] + seq_record.seq[-5:]
        # write new fasta file; SeqIO.write returns the number of records written
        r = SeqIO.write(seq_record, f_out, 'fasta')
        if r != 1:
            print('Error while writing sequence:  ' + seq_record.id)

## Independent

In [4]:
file_in  = os.path.join('..', 'data', 'MLCPP2_Independent.fasta')
file_out = os.path.join('..', 'data', 'MLCPP2_Independent_equal_length.fasta')

# Rewrite every record of the independent set as its first residue followed
# by its last five residues and save the result as a new FASTA file.
# Both handles are owned by the `with` statement so the input file is closed
# too; the input is opened first so that a missing input file does not leave
# a truncated output file behind.
with open(file_in, mode='r') as f_in, open(file_out, 'w') as f_out:
    for seq_record in SeqIO.parse(f_in, 'fasta'):
        # NOTE: a sequence with fewer than 5 residues yields a shorter result,
        # and an empty sequence fails on `seq[0]`.
        seq_record.seq = seq_record.seq[0] + seq_record.seq[-5:]
        # write new fasta file; SeqIO.write returns the number of records written
        r = SeqIO.write(seq_record, f_out, 'fasta')
        if r != 1:
            print('Error while writing sequence:  ' + seq_record.id)

## Optimized peptides

In [10]:
# Read the two peptide tables and stack them into a single frame with a
# fresh 0..n-1 index. The displayed result has the columns
# 'Original' and 'Optimized'.
shap_csv_paths = ['../data/PeptidesForShap.csv', '../data/PeptidesForShap2.csv']
df1, df2 = (pd.read_csv(csv_path) for csv_path in shap_csv_paths)
df = pd.concat([df1, df2], ignore_index=True)
df

Unnamed: 0,Original,Optimized
0,GEGAHVKIVANLPYYITTPILMQLLRAKISW,WEGAHAKIAARLPYYLTTPLLMQLLRAKMKW
1,GEPWKVCVN,GKPWKICVR
2,VISANIRTTISGMDDSGGTGSRQKFRKLRKIERE,CISANIRTTISGMDDSGGTGKRKKFRKLRKIKRK
3,LDPIVAKRVRHILTENARTVEA,LDPIVKKRRRHILTENTRTVEA
4,DAWRMHMQEFVAQLETR,KAWRMHMQKFVARLKTR
5,IFSNTALVNCMRQTLQDTGHNP,IFKRTALINCRRRTLQDTGHNP
6,ETGGEFGKGVIRAELDVKPDLWFFGCHF,WTGGKFGKGKIRGKLDIKPDLWFFGCHF
7,TIVARSIAVGLQEAVREMKRLVQN,TIVARSIAVGLQEAIRRRKRLVQN
8,PNILTGAIVGGP,PNILTRAILGGP
9,TAVEQKYRFFSYGDAM,CAAKKKYRFFSYGDAM


In [11]:
# Build one labelled table from `df`: the original peptides are labelled
# CPP = 0 and the optimized peptides CPP = 1; both halves share the same
# column layout and are stacked with a fresh index.
n_peptides = len(df)
column_names = ['ID', 'Sequence', 'CPP', 'Dataset']

# Original peptides, IDs Negative_1 .. Negative_n
Original_nonCPP = pd.DataFrame(index=range(n_peptides), columns=column_names).assign(
    ID=[f'Negative_{k}' for k in range(1, n_peptides + 1)],
    Sequence=df['Original'].values,
    CPP=0,
    Dataset='Original_nonCPP',
)

# Optimized peptides, IDs Positive_1 .. Positive_n
Optimized = pd.DataFrame(index=range(n_peptides), columns=column_names).assign(
    ID=[f'Positive_{k}' for k in range(1, n_peptides + 1)],
    Sequence=df['Optimized'].values,
    CPP=1,
    Dataset='Optimized_CPP',
)

test_optimized = pd.concat([Original_nonCPP, Optimized], ignore_index=True)
test_optimized

Unnamed: 0,ID,Sequence,CPP,Dataset
0,Negative_1,GEGAHVKIVANLPYYITTPILMQLLRAKISW,0,Original_nonCPP
1,Negative_2,GEPWKVCVN,0,Original_nonCPP
2,Negative_3,VISANIRTTISGMDDSGGTGSRQKFRKLRKIERE,0,Original_nonCPP
3,Negative_4,LDPIVAKRVRHILTENARTVEA,0,Original_nonCPP
4,Negative_5,DAWRMHMQEFVAQLETR,0,Original_nonCPP
5,Negative_6,IFSNTALVNCMRQTLQDTGHNP,0,Original_nonCPP
6,Negative_7,ETGGEFGKGVIRAELDVKPDLWFFGCHF,0,Original_nonCPP
7,Negative_8,TIVARSIAVGLQEAVREMKRLVQN,0,Original_nonCPP
8,Negative_9,PNILTGAIVGGP,0,Original_nonCPP
9,Negative_10,TAVEQKYRFFSYGDAM,0,Original_nonCPP


In [12]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

file_out = os.path.join('..', 'data', 'MLCPP2_Test_optimized.fasta')

# Write every row of `test_optimized` as one FASTA record, using the row's
# ID as the record id, name and description.
with open(file_out, 'w') as f_out:
    for i, row in test_optimized.iterrows():
        seq_record = SeqRecord(seq=Seq(row.Sequence), id=row.ID, name=row.ID, description=row.ID)
        # SeqIO.write returns the number of records written; report anything
        # other than 1, as the Training/Independent cells do.
        r = SeqIO.write(seq_record, f_out, 'fasta')
        if r != 1:
            print('Error while writing sequence:  ' + seq_record.id)

In [13]:
file_out = os.path.join('..', 'data', 'MLCPP2_Test_optimized_equal_length.fasta')

# Write every row of `test_optimized` as one FASTA record whose sequence is
# the first residue followed by the last five residues of the peptide.
with open(file_out, 'w') as f_out:
    for i, row in test_optimized.iterrows():
        # NOTE: a sequence with fewer than 5 residues yields a shorter result,
        # and an empty sequence fails on `Sequence[0]`.
        seq_record = SeqRecord(seq=Seq(row.Sequence[0] + row.Sequence[-5:]), id=row.ID, name=row.ID, description=row.ID)
        # SeqIO.write returns the number of records written; report anything
        # other than 1, as the Training/Independent cells do.
        r = SeqIO.write(seq_record, f_out, 'fasta')
        if r != 1:
            print('Error while writing sequence:  ' + seq_record.id)