**Data set:**

Ov_data.csv: this is a sample data set related to a specific organism. The important column in this data set is names "Info_window_seq". It contains 15-character strings representing fixed-length protein fragments (called "peptides") generated by running a sliding window over the organism protein strings.

**Initial features to calculate:**

Proteins are composed of 20 aminoacids, represented by different letters. Valid letters are: A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y.

1) We need to calculate the percent of each aminoacid in the peptides. 
2) This will result in 20 features (columns), which should be called

feat_perc_A, feat_perc_C, ..., feat_perc_Y.

**Notes:**

1) The frequency is calculated as (number_of_occurrences_of_letter / length_of_peptide)

2) if a letter does not occur in a given peptide it should have a frequency of zero.

3) if an invalid letter appears in the sequence, it should be removed before the frequencies are calculated.


In [52]:
import numpy as np
import pandas as pd

In [137]:
df = pd.read_csv('Ov_data.csv')
df.head()

Unnamed: 0,Info_epitope_id,Info_sourceOrg_id,Info_protein_id,Info_host_id,Info_n_Positive,Info_n_Negative,Info_TSeq_accver,Info_TSeq_taxid,Info_TSeq_orgname,Info_center_pos,Info_window_seq,Class
0,854007,6282,A0A044QLL7,96069606,1,1,,6282,Onchocerca volvulus,1,LLLLLLLLDVHIESG,1
1,854007,6282,A0A044QLL7,96069606,1,1,,6282,Onchocerca volvulus,2,LLLLLLLDVHIESGE,1
2,854007,6282,A0A044QLL7,96069606,1,1,,6282,Onchocerca volvulus,3,LLLLLLDVHIESGEV,1
3,854007,6282,A0A044QLL7,96069606,1,1,,6282,Onchocerca volvulus,4,LLLLLDVHIESGEVC,1
4,854007,6282,A0A044QLL7,96069606,1,1,,6282,Onchocerca volvulus,5,LLLLDVHIESGEVCA,1


**Task 1**
#First check: if invalid letter appears in the sequence, it should be removed before the frequencies are calculated.
- my guess: Exlcude the invalid letter from the length_of_peptide

In [175]:
valid_letters = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

for letter in valid_letters:
    df['feat_perc_{}'.format(letter)] = 0


def peptides_feature_generator(peptide_row):
    #print(peptide_row.name)
    #print(peptide_row['Info_window_seq'][1])
    length_of_peptide = 15
    number_of_occurrences_of_letter_dict = {'A':0,'C':0, 'D':0, 'E':0, 'F':0, 'G':0, 'H':0, 'I':0, 'K':0, 'L':0, 'M':0, 'N':0, 'P':0, 'Q':0, 'R':0, 'S':0, 'T':0, 'V':0, 'W':0, 'Y':0}
    
    # 1) Find out number of occurences of each letter in the peptide & confirm peptide length
    for i in range(15):
        peptide_letter = peptide_row['Info_window_seq'][i]
        print(peptide_letter)
        
        if peptide_letter in valid_letters:
            number_of_occurrences_of_letter_dict[peptide_letter] += 1
        else:
            length_of_peptide -= 1
                
#     # 2) Find out the percentage of each letter in the peptides & Add each letter's 'percentage in the peptides' as a seperate column to the table
#     for key in number_of_occurrences_of_letter_dict:
        
#         df['feat_perc_{}'.format(letter)]
        
#         print((number_of_occurrences_of_letter_dict[key] / length_of_peptide) * 100)
        

    print(number_of_occurrences_of_letter_dict)
    print(length_of_peptide)
    
    
    
    

In [167]:
df.apply(peptides_feature_generator, axis=1)

L
L
L
L
L
L
L
D
V
H
I
E
S
G
E
L
D
V
H
I
E
S
G
E
V
C
A
M
V
F
C
D
R
K
E
N
A
C
I
I
E
R
K
N
R
M
N
K
L
L
I
A
F
G
L
I
L
F
V
A
L
P
C
A
S
Q
E
F
D
Y
E
R
E
P
S
S
W
C
D
L
N
E
N
Q
T
V
D
K
G
C
Y
C
D
R
K
E
N
A
C
I
Q
I
I
T
D
K
D
D
D
F
Q
E
S
D
E
D
I
S
D
D
D
S
Y
E
I
I
N
D
E
D
D
F
Q
Q
I
D
V
N
E
Y
N
D
D
F
Q
Q
N
F
Q
Q
N
I
I
D
E
D
D
F
F
Q
D
I
I
T
D
E
D
D
N
G
D
R
A
V
N
E
Y
N
D
D
F
Q
Q
D
R
A
S
N
E
D
I
C
D
D
V
S
Y
I
I
T
D
K
D
D
N
G
D
K
A
S
D
D
E
D
D
N
G
D
R
A
S
N
E
D
I
C
P
E
H
K
V
D
D
V
N
E
Y
N
C
A
S
Q
E
F
D
Y
E
F
D
D
K
D
N
D
S
Q
E
I
I
N
D
T
K
R
K
F
V
E
S
D
G
I
K
R
T
K
N
R
G
E
L
E
Y
T
Y
C
A
P
D
F
Q
Q
I
I
F
D
D
D
D
F
Q
K
I
V
T
D
K
D
D
F
Q
D
I
I
T
D
E
D
N
E
Y
N
D
D
F
Q
Q
N
I
I
D
Q
T
W
V
D
N
G
C
Y
C
D
R
K
E
N
I
E
R
K
N
R
G
E
L
E
Y
T
Y
C
A
K
F
V
E
S
D
G
I
K
K
T
K
E
N
A
C
I
I
E
R
K
N
R
G
E
L
D
D
F
Q
Q
N
I
I
D
E
D
D
F
D
D
D
F
Q
E
I
I
T
D
K
D
D
N
Q
K
I
I
N
Y
K
D
N
D
S
Q
E
I
G
C
Y
C
D
R
K
E
N
A
C
V
I
E
R
S
Y
T
T
K
G
K
F
V
E
A
C
I
I
E
R
K
N
R
G
E
L
E
Y
T
A
F
G
L
I
I
L
F
V
A
L
P
C
A
S
M
M
M
M
M
M
M
N
K
L
L
I
A
F
P
S
W


A
Q
E
E
K
I
K
Q
L
H
S
R
I
V
K
L
E
G
E
A
G
G
K
N
F
V
I
Q
K
S
E
K
S
L
Y
E
H
P
V
V
K
K
Q
P
K
I
A
H
P
P
K
V
N
V
V
S
K
F
D
R
Q
I
D
Q
I
D
R
R
S
Y
G
D
R
R
D
V
Y
S
S
G
L
A
A
S
M
A
I
I
Q
D
L
D
Q
A
L
K
K
A
Q
A
H
M
A
N
A
L
Q
I
A
T
F
L
E
P
A
A
M
T
H
A
S
V
P
T
E
D
R
I
S
D
H
I
V
C
C
D
D
V
Y
G
G
R
D
V
L
E
R
C
L
A
S
L
E
D
A
E
C
L
A
S
L
E
D
A
E
Y
C
R
V
Y
S
L
E
I
S
F
V
D
M
T
D
L
K
A
S
V
P
T
E
D
R
I
K
L
G
I
G
D
Y
I
N
G
H
S
D
V
V
M
G
A
V
M
T
R
I
S
V
G
I
E
D
A
E
D
L
I
Q
D
G
L
E
E
S
E
E
F
L
S
S
L
K
E
S
L
G
G
F
E
S
L
A
E
L
P
K
I
F
T
L
A
E
S
L
G
G
F
E
S
F
Q
H
P
L
S
F
G
A
D
A
V
L
H
S
A
L
Q
I
A
T
F
L
E
S
N
P
R
I
E
T
F
M
S
P
Y
F
Q
H
P
L
S
F
G
A
C
D
D
V
Y
G
G
K
D
S
S
F
Y
L
K
G
G
L
E
E
S
E
H
K
K
Q
T
K
G
M
S
G
M
M
T
N
A
I
H
A
G
Q
E
A
E
Q
W
H
P
Q
H
E
I
H
K
K
Q
T
K
Y
P
E
L
Q
S
H
P
Q
H
E
I
H
K
G
K
D
S
A
I
I
F
S
S
T
Q
R
D
R
I
K
L
G
I
G
D
N
L
I
R
I
S
W
D
T
N
Q
I
I
P
P
I
S
L
S
T
E
D
A
E
D
L
I
Q
D
L
D
Q
A
L
K
V
M
D
I
A
A
I
V
E
I
T
K
T
F
K
S
N
T
K
M
I
W
L
E
T
P
S
N
P
S
L
D
E
N
L
K
F
Q
Q
L
A
V
G
A
E
F
L
S
S
L
K
I
F
T
L
A
E


E
T
T
T
A
S
M
L
P
I
Y
L
I
I
L
P
G
T
C
E
A
M
E
C
P
P
S
H
K
M
M
M
M
M
M
M
I
Q
L
F
S
T
V
K
C
K
S
P
P
C
Q
P
I
L
A
C
M
H
K
C
M
M
N
P
G
P
T
C
V
K
T
I
A
K
C
V
L
P
G
T
C
E
A
M
E
C
G
L
K
G
F
K
T
C
V
D
Q
F
C
P
A
C
M
P
D
N
T
K
G
C
K
S
H
P
P
M
I
Q
L
F
S
T
T
T
A
S
M
L
P
I
C
E
E
R
D
I
V
K
C
K
S
P
P
C
Y
H
K
W
T
C
D
S
E
T
V
D
T
F
C
D
G
C
A
L
D
K
Y
L
L
N
N
L
H
V
Y
V
K
G
L
Y
D
Q
A
G
I
S
L
P
F
D
S
C
N
V
A
R
T
R
S
L
S
Y
S
I
P
V
D
N
G
V
E
G
E
P
T
I
G
Q
P
V
Y
H
K
W
T
C
D
S
E
E
G
E
E
K
E
E
E
E
P
Y
P
L
L
S
A
I
N
H
E
L
E
G
E
A
I
N
H
E
L
E
G
E
A
T
A
T
T
R
N
V
R
S
L
G
E
N
F
D
L
L
S
D
I
K
C
P
E
S
F
G
L
F
R
P
I
G
M
A
Q
E
Y
A
K
L
V
N
N
E
M
K
K
A
F
T
E
E
A
K
E
T
I
D
E
Q
G
V
G
A
C
M
V
K
V
S
P
R
I
Y
H
I
D
G
S
Y
N
V
R
F
T
E
E
A
K
K
S
G
S
E
K
S
G
S
E
Q
L
L
L
T
A
A
V
E
Y
A
K
L
V
N
E
M
K
K
A
H
I
D
G
S
Y
N
V
R
S
L
G
E
N
F
T
S
C
G
K
G
P
Y
P
L
L
S
A
I
N
E
S
F
G
L
F
R
H
P
N
D
C
R
L
F
R
H
P
N
D
C
R
L
F
I
H
C
A
H
D
G
V
G
A
C
M
V
K
G
N
Q
W
Y
G
Y
L
L
L
T
A
A
V
S
P
R
I
Y
H
I
D
A
K
R
V
T
I
M
P
K
D
I
Q
L
A
R
D
F
E
E
E
G
E
G
E
E
M
P


D
A
F
T
D
D
E
L
A
S
D
S
F
P
M
L
N
H
R
L
Q
E
M
N
C
Y
E
D
M
A
R
F
K
Q
L
Q
F
F
I
G
E
R
M
A
E
E
L
A
S
D
S
F
P
M
K
L
V
D
G
L
D
G
L
I
W
E
F
R
G
R
Q
V
V
N
E
F
K
R
K
I
Q
A
W
V
I
S
L
L
V
R
K
E
G
E
I
V
L
A
G
A
N
P
L
M
Q
K
N
G
K
S
E
V
E
I
N
E
F
L
L
L
L
L
L
L
K
E
L
A
T
N
M
L
M
K
K
V
V
E
L
M
Q
K
N
G
K
R
G
I
D
F
V
L
N
H
R
L
Q
E
M
N
F
F
I
G
E
R
M
A
E
G
Q
G
E
G
S
Y
C
K
S
F
M
K
K
V
V
E
K
S
E
V
E
I
N
E
F
K
R
K
I
Q
A
L
K
E
L
A
T
N
M
L
I
F
K
D
A
F
I
V
L
A
G
A
N
P
S
A
E
G
S
L
L
T
K
D
R
F
K
Q
L
Q
F
F
I
S
E
E
C
V
E
R
G
I
D
F
V
L
N
H
E
D
G
D
E
G
S
E
E
C
V
E
R
G
I
I
Q
A
W
V
I
S
L
L
T
K
D
R
F
K
D
M
A
T
F
K
S
Y
C
K
S
F
M
E
G
Q
V
A
V
V
E
Y
R
D
E
F
R
G
R
Q
V
V
R
K
E
G
E
I
V
E
E
G
E
V
P
Y
L
M
L
V
K
E
A
G
S
V
A
E
Y
R
A
S
E
G
K
T
I
R
A
S
E
G
K
T
I
T
V
P
G
D
I
S
P
K
C
L
P
G
R
I
T
L
G
V
S
S
L
Q
V
K
R
K
A
A
E
D
K
K
K
G
F
D
G
Q
K
A
I
K
M
Q
S
K
I
G
G
S
W
N
L
V
A
K
C
K
E
E
L
R
P
V
T
K
W
A
P
R
A
S
S
S
D
V
V
S
N
M
T
A
T
N
S
N
V
T
S
C
C
D
L
C
D
E
A
W
H
T
W
C
V
D
Y
K
F
G
A
Y
D
E
L
I
Q
E
A
P
E
V
L
L
L
C
D
L
C
D
E
A
W
H
E
T
L
C


D
L
G
D
K
P
N
A
L
G
E
D
Q
L
K
W
H
F
T
T
F
D
L
G
G
H
E
Q
A
R
R
N
L
T
T
G
K
G
Q
I
S
R
M
D
I
S
V
L
R
R
Q
G
Y
G
E
G
F
R
W
R
I
A
E
S
R
H
E
L
E
S
L
L
G
D
D
D
R
M
A
Q
H
V
P
T
L
H
P
T
S
A
S
C
P
L
L
I
L
G
N
K
I
D
K
P
N
Y
L
G
L
T
K
K
N
G
K
L
V
F
L
H
E
L
E
S
L
L
G
D
E
Q
V
A
S
C
N
A
G
K
T
T
L
L
H
M
L
K
D
D
I
S
S
R
P
M
E
V
F
M
C
S
V
L
M
M
M
M
M
M
M
S
W
L
W
D
W
V
S
V
F
L
G
L
D
N
A
G
K
T
T
L
W
H
L
G
V
T
N
L
T
T
G
K
G
W
V
S
G
M
L
N
Y
L
G
L
T
K
K
N
H
V
P
T
L
H
P
T
S
E
E
L
S
D
C
A
D
V
E
R
I
A
E
S
R
H
E
L
G
G
H
E
Q
A
R
R
V
W
K
D
Y
F
P
G
Y
G
E
G
F
R
W
P
T
S
E
E
L
S
L
G
G
I
R
F
T
M
E
V
F
M
C
S
V
L
R
R
Q
G
K
K
N
G
K
L
V
F
L
G
L
D
N
A
G
G
E
D
Q
L
K
W
H
L
G
V
T
N
L
T
M
S
W
L
W
D
W
V
S
G
M
L
N
Y
L
R
R
V
W
K
D
Y
F
P
A
V
D
A
I
V
Q
V
A
Q
A
G
A
C
D
A
C
C
V
A
K
N
R
T
A
R
Q
A
G
Y
G
V
E
K
P
P
S
Q
P
C
E
P
V
T
P
P
P
G
P
P
G
D
A
G
P
S
A
P
S
E
P
G
R
P
G
S
D
G
A
P
G
P
I
G
P
Y
D
R
I
D
P
D
M
D
L
E
C
R
I
K
S
Y
T
A
S
R
E
K
K
R
E
G
Y
I
W
A
E
V
H
E
L
K
N
I
P
V
P
P
G
D
V
G
E
P
G
A
P
G
R
P
G
R
I
K
A
Y
R
F
V
A
Y
S
A
V
T
F
C
K
P
C


R
L
K
T
V
L
L
G
C
R
F
H
E
F
I
W
A
G
F
V
P
F
G
E
I
V
S
I
S
I
P
A
L
M
A
E
L
G
E
K
P
T
S
R
R
T
L
Y
V
G
G
F
G
E
E
V
D
E
E
T
G
K
H
R
G
F
G
F
V
E
Y
E
L
C
T
G
E
K
G
F
G
Y
E
G
S
P
A
G
R
G
W
G
G
V
P
P
P
P
R
P
Y
I
A
A
G
K
P
G
L
G
S
P
I
Y
D
A
S
G
K
R
L
N
T
R
E
K
G
D
G
T
G
G
K
S
I
Y
G
P
P
P
P
M
S
L
G
L
G
F
N
L
A
R
Q
E
G
M
E
G
Y
S
S
F
G
H
V
V
E
G
M
N
I
V
K
A
E
E
G
S
N
G
N
V
R
E
S
Y
K
L
G
R
R
E
G
P
M
P
G
E
N
E
P
S
R
S
R
R
S
G
S
R
S
R
R
S
R
S
I
I
I
R
G
K
G
S
V
K
E
G
K
L
K
T
E
D
D
G
T
G
V
I
S
E
T
S
G
S
G
G
S
G
Y
K
P
K
N
Y
S
L
S
R
S
R
S
K
H
R
R
R
R
G
N
F
V
G
L
L
I
G
P
R
G
N
T
L
K
A
E
T
G
A
K
I
I
I
R
G
K
S
H
F
H
R
I
I
P
R
F
M
V
Q
G
G
V
G
S
A
K
L
K
A
A
Q
G
V
P
M
P
F
V
R
K
F
G
A
A
A
V
Q
G
G
D
F
T
K
G
D
G
T
G
G
K
R
T
G
S
P
P
K
G
P
E
E
Q
N
E
N
E
T
G
Q
P
E
K
K
P
R
K
S
S
M
A
A
K
K
K
K
R
N
N
G
S
V
K
E
G
K
L
G
R
R
E
G
P
M
V
E
C
G
E
L
K
Q
E
R
K
S
S
R
S
R
S
R
H
K
R
K
R
S
R
S
R
D
R
R
H
S
R
K
R
S
H
S
R
R
S
T
I
R
L
H
D
K
V
W
I
P
Q
E
N
H
A
E
N
F
R
Q
L
C
T
G
E
K
G
F
G
F
I
W
E
L
G
L
E
S
G
R
I
V
I
E
N
M
N
D
S
E
L
F
G
R


G
E
K
L
V
L
E
S
K
V
D
S
S
P
F
G
T
T
K
T
E
S
R
M
I
V
T
E
S
L
M
G
G
D
L
E
S
R
N
L
S
K
Q
R
I
I
I
L
E
S
S
F
Q
K
I
T
T
P
T
L
E
S
S
L
E
R
K
E
T
V
H
R
N
E
S
T
I
H
S
N
D
V
H
P
S
L
E
E
S
Y
E
I
S
F
P
P
R
F
E
T
N
I
E
C
N
V
V
I
N
P
L
P
G
E
T
Q
L
S
S
A
K
V
V
A
Q
E
T
S
E
A
W
I
E
A
K
P
L
I
H
Y
E
T
S
Q
R
N
F
T
D
H
E
T
A
H
Y
E
T
S
T
S
E
A
K
I
G
N
C
T
N
V
E
T
V
H
R
N
E
T
Q
D
E
N
V
E
V
E
I
F
V
K
P
L
L
E
C
M
L
E
E
V
E
K
E
E
E
N
L
T
A
A
S
I
E
V
F
P
V
T
S
D
D
K
R
H
E
V
H
S
I
L
T
E
Q
K
Y
E
E
V
H
T
R
S
D
R
I
R
Y
T
R
A
E
E
V
I
Q
T
N
D
I
E
V
V
V
S
C
E
V
I
S
E
T
P
V
R
T
P
E
I
R
K
E
V
L
V
E
E
N
E
L
E
N
S
D
E
V
Q
E
L
V
W
R
K
T
L
T
L
V
V
E
V
Q
S
D
P
P
A
I
Y
I
K
T
H
F
E
V
Q
S
E
A
S
Y
S
L
A
T
L
S
P
E
V
R
A
Q
V
E
A
C
S
K
Q
Y
E
N
E
V
V
V
S
C
E
T
V
S
G
S
Q
E
V
Y
S
H
T
A
E
I
T
E
V
T
E
E
W
G
T
E
S
A
I
T
I
E
S
G
L
D
F
A
D
K
V
Y
L
D
C
T
K
R
R
Q
N
F
A
K
L
K
D
E
R
E
Q
L
C
H
E
T
F
A
N
R
S
I
Y
S
K
T
L
V
A
Q
Q
F
A
Q
I
H
L
S
S
Q
P
A
I
F
E
W
F
C
N
D
K
P
V
S
V
S
M
S
Y
F
C
S
T
A
D
T
D
A
G
Q
A
F
D
A
S
E
E
F
E
V
E
L
Y
M
E
S
F
D
F
H
A


I
T
A
I
L
S
V
F
Q
L
P
L
N
K
E
V
H
V
K
K
G
E
A
S
E
N
R
N
L
S
V
S
E
M
L
N
P
I
S
K
N
M
V
S
S
K
K
S
N
D
I
I
I
E
N
N
E
N
Y
S
P
I
L
K
I
A
N
N
E
P
L
V
L
R
T
I
T
C
E
F
N
N
F
L
A
P
G
S
L
G
C
P
I
L
K
N
N
R
F
F
H
V
V
D
L
M
V
I
C
N
P
H
T
D
V
I
I
K
R
K
N
S
N
Q
L
Y
L
L
R
A
S
F
I
N
Q
S
N
V
E
D
Q
H
I
L
Q
N
R
Q
S
V
I
E
K
L
N
E
I
N
S
D
R
I
I
S
K
E
L
S
T
I
M
E
N
S
K
S
S
K
V
A
L
I
E
W
D
D
N
S
R
D
Y
L
V
V
M
I
E
N
P
R
N
S
S
S
E
D
E
N
I
Y
I
L
K
I
N
S
V
D
R
T
W
S
G
Y
S
N
G
G
S
N
T
N
N
S
S
D
N
L
K
D
V
N
K
L
N
T
T
S
N
Q
I
E
S
V
K
L
D
L
N
T
T
T
F
E
L
L
D
L
K
L
S
N
T
V
E
V
N
L
S
Y
I
N
L
N
V
N
V
E
R
T
P
S
L
F
I
K
I
S
K
E
N
V
S
S
Q
N
Q
P
V
E
P
L
R
G
Q
N
V
S
V
V
S
E
A
S
E
T
N
I
E
C
N
V
V
D
V
G
D
T
S
E
T
E
T
V
P
A
E
R
P
P
R
I
V
E
V
Q
S
D
P
P
A
I
F
E
W
F
C
V
I
S
V
G
I
P
D
I
D
I
P
T
T
C
N
S
F
T
T
P
D
I
H
P
H
P
P
D
I
H
P
H
P
D
L
S
T
R
T
T
K
L
I
T
F
P
E
M
I
R
R
Q
E
D
S
D
Y
L
P
E
S
L
M
H
S
F
L
Y
F
S
S
I
Q
P
E
T
L
V
L
G
Y
S
L
P
H
F
I
Q
P
F
A
L
F
E
K
D
K
Q
N
L
D
N
P
G
N
K
K
L
H
S
K
V
Q
L
P
Q
Q
P
I
F
I
D
R
L
P
T
I
V
E
G
F
V


E
E
V
E
S
R
M
I
V
T
E
S
I
V
E
D
R
T
K
T
P
Q
Q
E
T
E
T
S
I
T
I
E
Q
T
L
S
E
T
E
T
E
V
S
S
N
V
H
I
Y
K
L
R
L
T
T
F
F
E
M
E
V
L
W
E
E
S
N
L
S
T
F
K
Q
K
S
E
K
L
A
T
S
K
D
L
T
F
M
E
V
T
I
V
E
Q
L
K
C
I
V
T
G
E
P
M
P
T
I
R
M
E
L
M
A
S
T
G
E
T
L
T
K
Y
S
L
S
P
T
S
K
T
G
F
F
F
K
A
S
M
M
L
Q
E
T
G
I
N
N
A
P
W
T
S
V
P
P
E
I
T
G
S
S
T
P
K
T
W
L
E
Q
T
H
E
V
N
E
E
M
E
N
S
Y
I
K
T
H
F
E
V
Q
S
E
A
L
L
S
P
M
Q
T
I
A
T
D
I
V
N
K
R
E
F
C
A
D
T
I
F
L
T
S
Q
M
N
N
I
S
K
V
E
T
I
I
G
N
A
T
K
R
K
D
S
K
K
V
T
I
I
N
E
E
D
H
S
Y
W
S
R
E
T
I
N
E
R
V
E
F
E
E
L
C
Q
K
P
T
I
Q
S
S
Q
I
D
L
T
G
E
P
M
P
T
I
R
W
S
C
N
G
K
N
N
E
N
Y
S
T
I
S
D
D
G
I
C
R
E
G
S
T
A
T
I
T
C
E
F
N
N
F
F
E
D
C
N
N
T
I
T
F
N
R
T
P
E
Y
G
R
Y
Y
V
T
I
T
N
E
H
G
I
T
K
A
R
E
S
T
I
T
N
P
L
E
D
E
L
H
A
S
S
N
T
K
A
R
E
S
T
I
T
I
L
P
Q
K
I
T
K
E
K
Q
E
V
D
I
S
I
F
V
T
F
T
K
K
G
D
V
A
H
Q
C
D
A
V
M
R
T
K
L
I
T
F
P
E
M
T
I
I
G
N
A
T
K
R
R
Q
N
F
A
K
F
E
G
A
P
E
T
L
E
K
R
Q
M
R
P
S
M
T
Y
T
L
S
E
T
E
T
E
V
R
V
L
A
T
D
T
L
T
L
V
V
E
V
A
E
S
I
V
L
T
L
V
A
Q
Q
D
H


T
D
T
E
E
N
F
G
G
E
K
F
D
V
I
E
Q
D
T
A
A
E
S
S
A
K
F
K
E
N
S
N
D
S
K
S
L
D
S
K
I
M
E
K
L
K
L
E
K
R
E
T
K
I
T
V
D
L
Q
L
K
F
S
R
K
K
L
H
E
T
V
A
L
K
A
K
T
K
K
M
I
E
M
M
H
S
N
Q
N
K
L
C
G
L
K
G
K
E
V
S
A
K
L
N
D
F
S
G
S
E
K
H
S
E
K
K
M
R
H
H
D
R
L
E
E
N
I
T
Q
K
N
P
T
D
E
L
S
R
K
K
L
L
K
R
K
R
Y
G
G
R
E
R
K
K
E
L
L
K
R
K
W
E
E
K
I
M
E
K
L
K
R
N
S
S
D
D
S
E
R
L
K
N
R
R
K
R
R
S
E
S
G
H
R
G
Y
I
K
S
K
V
T
V
S
N
S
A
T
T
I
N
K
S
A
K
Y
A
P
L
P
Q
Q
P
K
N
P
T
D
E
L
E
F
I
T
E
E
G
N
T
N
T
S
K
L
E
K
R
E
T
K
I
T
P
E
N
A
G
K
L
E
S
P
K
E
F
P
S
S
D
A
S
N
N
L
I
L
E
L
Q
Q
R
I
D
L
I
D
E
V
L
K
V
P
S
D
N
T
N
N
E
N
A
L
A
L
L
E
Q
L
K
K
L
S
E
E
T
D
L
N
L
N
T
S
L
S
H
S
L
A
D
V
P
L
P
K
G
V
Q
M
T
V
E
E
Y
Q
Y
S
L
Q
W
N
D
Y
Y
S
R
R
R
L
M
N
D
L
R
A
E
R
R
R
L
K
R
L
K
R
E
R
L
R
A
T
I
A
M
R
R
F
K
T
S
S
I
L
S
E
E
T
D
L
N
L
L
N
L
N
T
S
L
S
H
L
A
V
Q
P
E
L
I
L
T
E
E
G
R
E
A
V
I
M
H
S
N
Q
N
K
L
E
Q
K
A
K
N
M
K
L
L
K
K
V
R
P
V
D
A
N
N
M
P
I
F
P
F
M
M
M
M
M
M
M
Q
K
N
D
G
N
E
M
K
D
V
S
P
K
M
R
E
I
M
V
N
Y
A
L
R
A
T
I
A
M
R
R
A


Q
H
I
M
K
D
P
A
F
G
Y
V
N
A
F
G
Y
V
N
E
T
N
S
T
L
P
R
N
M
L
L
F
A
C
H
F
T
S
A
F
F
M
R
F
A
C
R
V
Q
P
R
Y
F
C
S
T
H
F
W
G
P
V
A
N
M
N
T
I
L
K
G
L
S
R
Q
S
L
N
H
N
Y
L
H
I
M
K
D
P
A
Q
K
N
P
D
L
I
S
G
P
M
T
T
A
M
Q
M
A
R
F
L
N
H
N
Y
L
H
I
M
M
M
M
M
M
M
M
N
T
I
L
K
N
I
T
L
Q
L
M
Q
M
A
R
F
L
N
H
F
A
C
H
F
T
N
I
T
L
Q
L
M
Q
F
W
G
P
V
A
N
W
G
I
P
I
A
F
A
C
R
V
Q
P
R
N
M
L
A
A
L
A
D
L
Q
K
N
P
D
L
I
S
T
A
L
C
I
Y
S
A
F
F
M
R
F
A
C
I
S
G
P
M
T
T
A
L
C
I
Y
S
A
F
G
L
S
R
Q
S
T
K
E
W
I
S
Y
F
C
T
K
E
W
I
S
Y
F
C
S
T
H
F
E
I
P
L
Q
A
G
T
N
K
F
A
S
G
T
N
K
F
A
S
Q
K
G
M
T
G
F
L
L
L
L
L
L
L
Q
A
V
K
K
L
S
W
L
Q
A
V
K
K
L
S
W
T
F
K
M
Y
H
V
H
H
G
L
S
G
N
V
I
E
S
L
F
N
V
Q
Y
A
A
F
V
H
H
G
L
S
G
N
V
A
G
E
R
G
V
C
P
K
Y
C
A
I
D
G
E
R
G
V
C
P
K
Y
C
A
I
D
G
G
C
E
P
V
T
A
P
P
C
K
P
C
F
L
S
L
V
T
D
I
N
E
H
R
H
R
L
A
F
L
S
L
V
T
D
I
N
P
S
K
S
F
N
F
L
S
T
N
R
V
D
N
P
P
R
P
V
S
H
R
H
R
L
A
F
L
S
W
S
T
P
I
E
K
V
I
E
G
Q
P
P
R
Q
Q
C
I
T
R
N
D
D
T
L
G
V
I
V
D
N
N
K
S
N
F
A
E
S
S
S
I
D
S
N
N
K
S
K
T
Q
L
L
P
P
P
L
P
E
N
P


S
K
G
Q
F
L
K
E
L
P
A
D
V
Y
R
G
G
K
G
F
Q
A
Q
V
R
A
L
P
Y
E
S
T
I
Q
H
G
R
R
T
K
R
K
F
A
E
R
C
P
K
H
I
G
T
I
Q
E
N
R
I
I
F
T
D
V
D
G
C
A
E
R
R
D
D
E
I
Q
E
N
R
I
I
F
T
L
E
H
W
H
N
I
T
C
L
N
F
R
V
I
E
L
A
L
E
H
W
H
N
I
T
C
R
W
E
N
I
D
R
D
S
K
G
Q
F
L
H
T
R
R
E
Q
R
V
I
E
L
A
L
E
H
E
Q
A
Y
Q
I
Y
E
S
T
I
Q
H
G
R
E
N
L
D
L
L
F
L
M
S
Y
D
F
H
A
T
K
A
K
E
A
A
C
S
A
T
D
A
N
L
P
S
Q
A
L
L
K
E
M
N
Q
A
D
S
A
N
L
P
S
D
A
I
D
E
A
A
T
K
A
K
E
A
A
C
R
Y
I
V
K
T
D
A
I
D
E
A
A
T
K
A
A
C
S
A
T
D
L
I
K
R
K
I
S
Q
Q
G
V
E
L
E
K
K
M
L
F
I
L
S
S
F
H
F
Y
K
H
F
I
P
F
H
F
Y
K
H
F
I
P
E
I
S
N
E
K
K
M
M
E
I
L
M
N
T
K
L
E
I
D
L
I
K
R
K
I
S
T
V
S
L
R
K
I
L
M
N
T
K
L
E
I
D
V
Q
N
S
V
Q
I
W
N
N
I
L
F
I
L
S
S
F
H
R
K
T
S
N
L
M
N
Q
A
D
S
A
M
M
M
M
M
M
M
T
F
E
Q
Y
R
Y
I
F
I
P
E
I
S
N
S
F
F
F
S
N
N
S
F
F
F
S
N
S
K
N
F
R
L
E
I
D
V
Q
N
S
V
L
P
K
Y
Y
Y
P
L
R
R
Q
I
W
N
A
L
L
K
E
T
Q
Q
G
V
I
S
T
V
S
L
R
K
T
S
N
L
M
M
T
F
E
Q
Y
R
Y
I
V
K
T
D
A
I
N
S
V
L
P
K
Y
Y
P
L
R
R
K
T
A
G
T
D
A
G
K
S
W
W
G
G
N
M
V
G
E
V
I
Y
Q
R
G
N
P
M
I
L
F


S
Q
F
F
I
T
L
K
K
T
P
W
M
M
M
M
M
M
M
V
W
Q
C
E
S
L
T
V
K
N
F
I
Q
L
S
K
N
T
K
G
E
E
G
M
K
V
V
R
K
M
E
N
T
K
T
M
V
W
Q
C
E
S
L
L
L
L
I
L
C
V
G
P
D
S
N
G
S
Q
F
F
I
T
L
K
G
K
T
V
P
E
T
V
K
N
F
I
Q
L
S
L
D
G
R
H
V
V
F
G
K
V
L
E
G
I
I
K
D
F
M
V
Q
G
G
D
F
I
N
H
Y
G
A
G
W
V
S
M
A
N
A
G
P
D
E
P
K
G
P
K
V
T
D
K
V
I
F
T
I
D
E
E
I
G
K
V
T
I
G
L
F
G
K
T
Q
P
T
A
P
S
A
A
P
P
A
A
S
S
P
M
S
A
R
S
S
P
S
V
M
V
R
R
R
M
A
S
P
K
P
S
P
P
P
P
V
H
R
R
A
S
S
P
M
S
Q
Q
Y
R
N
P
C
E
F
E
W
K
Q
F
V
Q
F
V
E
C
T
E
T
Q
H
D
L
S
L
A
A
P
P
A
Q
F
G
M
A
P
P
I
G
S
A
V
G
H
A
V
G
S
M
L
T
G
L
T
G
G
S
G
H
G
N
Q
D
G
I
A
L
H
G
N
Q
D
G
I
A
L
P
E
K
Q
T
A
G
G
V
A
I
G
S
A
V
G
H
A
V
H
A
V
G
S
M
L
T
G
G
S
G
H
G
S
R
G
P
G
L
M
G
Q
M
A
A
T
A
M
M
M
M
M
M
M
V
R
R
R
M
A
S
L
C
Q
S
F
N
E
V
F
K
D
A
S
P
K
P
S
P
P
V
H
R
R
A
S
I
A
L
P
E
K
Q
E
M
Q
Q
C
E
F
E
W
K
Q
F
V
E
C
T
E
T
Q
V
P
M
Q
T
R
Q
P
T
A
P
Q
E
M
Q
Q
Q
Q
Q
Y
R
N
P
C
E
T
Q
H
D
L
S
L
C
Q
S
F
N
E
V
F
G
M
A
P
P
S
R
G
P
G
L
M
G
M
G
Q
M
A
A
T
A
G
G
V
A
I
G
A
R
S
S
P
S
V
P
M
Q
T
R
Q
P
P
D
E
M
S


N
E
L
S
S
A
I
R
D
I
D
L
Q
A
M
A
A
R
D
T
T
S
E
N
K
E
G
E
E
R
A
T
H
I
Q
R
R
K
M
R
E
E
C
Q
A
L
M
V
E
T
L
Q
E
E
I
D
F
I
R
R
V
Q
D
Q
Q
D
Q
E
I
R
D
L
Q
A
M
A
A
R
D
P
E
Q
L
I
F
E
A
E
E
S
F
G
M
G
V
A
L
S
N
I
E
A
E
I
N
L
E
K
V
R
F
L
E
A
Q
N
R
K
L
N
A
N
T
H
R
S
K
E
E
L
I
G
E
W
K
L
G
K
L
A
D
L
E
G
R
N
S
L
L
E
K
K
Q
R
D
E
L
E
K
Q
I
K
K
R
L
A
D
Y
I
E
K
V
R
F
L
E
A
Q
L
E
K
Q
I
Q
E
L
N
F
Q
L
E
D
D
N
Q
R
L
V
S
E
L
Q
R
A
R
T
D
L
A
R
D
T
T
S
E
N
R
E
F
F
K
G
L
K
Q
L
V
E
Q
V
V
R
T
I
R
D
A
R
E
R
E
K
K
E
M
G
N
V
S
I
Q
E
T
S
P
D
G
K
F
I
Y
R
K
M
L
E
G
E
G
D
G
P
G
I
W
G
R
G
Q
G
G
V
H
A
P
P
E
M
L
R
G
R
M
G
K
D
T
S
S
I
K
G
E
G
D
G
P
G
L
K
Q
L
E
A
E
E
S
F
G
M
G
S
N
V
Q
T
Y
T
R
S
E
K
G
N
V
S
I
Q
E
T
S
S
G
M
S
P
F
G
Q
N
A
A
S
T
I
R
R
Q
T
M
E
Q
G
Y
Q
R
E
E
V
K
R
W
K
L
K
R
K
I
D
G
K
R
E
I
V
K
L
K
V
Q
E
I
Q
T
Q
S
A
R
I
D
G
K
R
E
I
V
Y
T
L
P
R
P
S
K
T
V
K
I
W
G
R
G
Q
G
G
E
T
S
P
D
G
K
F
I
V
L
E
N
T
H
D
M
E
S
W
Y
K
L
K
V
Q
E
I
Q
E
G
R
N
S
L
L
E
K
Q
I
Q
E
L
N
L
R
R
R
I
A
L
L
E
E
E
A
E
I
N
L
L
R
R
R
I
E
K
Q
I
K
K
M
Q
E
E


W
Y
G
Y
D
N
E
E
T
I
K
I
G
C
G
V
A
T
Q
C
D
G
G
R
T
F
P
A
I
V
V
A
V
T
G
Y
N
C
P
L
H
F
T
Q
M
A
W
G
K
T
Y
K
I
G
V
G
E
N
V
Y
A
Y
W
S
S
V
A
V
T
G
Y
N
C
P
G
G
K
L
T
A
G
N
P
C
K
V
D
K
D
C
Y
T
K
K
E
V
I
Y
H
R
G
N
P
C
K
V
D
K
D
A
T
Q
C
D
G
G
R
T
L
I
V
I
C
H
G
R
T
L
I
V
I
C
H
Y
S
P
G
G
N
R
W
A
N
Q
C
I
F
G
H
S
P
A
W
G
K
T
Y
K
I
G
C
G
V
A
T
Q
D
K
D
C
Y
T
K
K
C
L
S
K
L
I
N
G
K
L
K
N
R
N
G
T
Y
S
V
E
G
L
K
K
T
A
G
T
L
T
W
D
C
K
L
E
S
S
A
Q
R
V
A
G
Q
G
V
L
H
F
T
Q
M
A
W
G
Y
E
N
N
P
S
N
N
M
T
W
I
F
G
H
S
P
R
Q
Q
R
E
G
V
G
E
L
E
S
S
A
Q
R
W
A
N
Q
C
I
A
Y
W
S
S
V
S
V
E
G
C
P
G
G
K
L
T
A
L
E
R
K
K
I
V
N
N
M
T
W
K
V
A
G
Q
G
V
L
H
F
R
Q
Q
R
E
G
V
G
E
N
V
Y
A
Y
W
S
K
L
P
K
L
Y
E
N
N
P
S
W
G
Y
S
Y
Y
G
G
Q
Q
D
F
G
I
G
G
G
Q
Q
D
F
G
I
G
D
Y
P
T
F
N
M
E
N
M
K
N
I
L
V
L
L
I
V
M
M
M
M
M
M
M
E
N
M
K
N
I
L
V
V
L
L
A
I
A
S
V
E
S
F
G
W
G
Y
I
L
V
L
L
I
V
L
L
A
I
A
S
V
E
S
V
E
S
F
G
W
G
Y
S
Y
Y
G
G
Q
D
V
V
E
M
T
V
N
E
I
I
N
E
V
F
N
T
S
A
A
L
F
A
I
I
S
A
Q
N
W
A
D
Q
C
I
F
G
Y
S
H
C
D
D
G
E
A
F
I
V
V
C
H
F
P
A
I
I
V
A
V
T


N
V
L
S
T
F
T
T
K
F
V
K
S
N
Q
M
R
K
K
A
A
Q
E
D
C
L
S
N
D
E
E
L
N
N
A
D
E
K
N
E
N
Y
S
L
N
I
K
L
G
L
K
P
Q
N
F
A
K
M
E
N
M
N
E
N
Y
S
L
N
I
K
K
M
D
A
S
V
I
C
S
F
F
N
K
F
A
V
F
L
G
L
A
S
K
Y
L
S
N
L
L
K
Q
H
N
F
A
K
M
E
N
M
F
T
S
F
H
I
T
E
D
S
S
S
A
P
A
V
A
E
I
G
E
G
D
K
C
E
S
P
K
Q
E
V
N
A
H
I
K
F
C
S
L
I
P
S
L
E
T
E
Y
L
Q
K
E
N
S
A
P
P
T
E
I
I
E
D
R
I
Y
S
A
E
Q
D
D
H
S
D
A
D
L
V
I
R
I
Y
S
A
E
L
H
L
M
L
M
R
K
L
T
F
K
S
A
R
C
A
A
S
D
L
R
L
L
P
I
G
Y
K
P
F
L
D
R
R
V
L
P
R
R
R
K
L
T
F
K
S
A
R
L
E
V
W
R
V
L
P
R
R
S
A
R
S
A
A
D
I
Y
Q
D
E
S
A
V
K
K
K
A
I
Q
K
L
Y
S
D
P
S
F
A
V
I
C
A
S
K
E
I
D
S
N
N
K
F
K
N
P
I
Y
G
Y
L
H
T
T
V
A
T
K
L
A
I
M
R
K
K
A
A
V
E
E
E
S
K
K
S
D
P
S
F
A
V
I
C
S
F
F
N
K
F
S
A
R
L
E
V
W
E
K
Y
L
L
K
R
L
L
P
I
G
Y
D
K
E
G
L
A
Y
L
Y
L
Q
L
E
R
Y
G
Y
L
H
T
T
V
A
P
S
L
E
T
E
Y
L
Q
L
E
R
Y
G
Y
M
K
N
L
I
Q
A
L
I
L
I
G
L
T
L
E
I
D
G
D
C
E
I
H
V
S
I
I
G
D
L
V
N
L
R
L
N
N
V
W
G
R
E
E
R
E
G
T
L
P
I
L
P
I
V
K
G
E
I
A
D
I
T
V
I
N
L
H
L
S
I
R
F
D
E
G
K
M
V
K
R
I
D
I
N
F
H
K
G
P
S
R
N
L
Q
I


G
P
D
L
S
I
Q
N
A
V
D
C
T
W
T
L
G
T
T
D
F
K
K
A
V
K
K
Y
G
D
T
L
G
N
Y
G
C
R
G
G
Y
M
N
G
V
Y
S
N
R
D
C
G
D
L
N
H
A
V
K
K
Y
G
D
G
K
E
S
Q
K
T
G
K
L
V
D
L
S
I
Q
N
A
V
D
E
V
L
G
T
F
D
Q
D
Y
K
R
G
A
A
I
G
A
L
E
A
Y
N
K
K
K
S
F
L
K
K
M
E
D
N
G
E
L
R
A
M
Y
T
Y
D
P
N
E
K
L
P
V
S
V
A
I
Q
R
G
D
E
L
G
L
M
H
A
V
A
I
F
E
S
N
E
L
M
T
E
A
T
K
H
Y
D
S
N
E
S
N
L
R
M
A
R
A
M
E
K
L
E
T
E
W
N
D
Y
V
Q
V
Q
Q
L
R
E
V
L
G
T
F
D
Q
D
V
D
W
R
K
K
G
M
V
T
P
V
K
D
Q
D
Y
K
R
G
N
M
T
R
L
T
K
N
Q
G
V
C
G
S
C
Y
A
F
A
A
I
F
K
F
Y
K
S
G
V
Y
S
N
R
D
C
G
H
A
V
L
L
V
G
Y
G
K
H
K
T
Y
D
C
G
D
L
N
H
A
V
L
L
V
G
T
R
R
Q
T
R
H
T
V
G
Q
K
Y
G
T
D
W
G
K
K
G
Y
A
Y
M
A
R
N
Y
V
M
A
L
G
K
H
Y
D
S
N
E
S
G
M
V
T
P
V
K
N
Q
G
V
C
G
D
G
K
E
S
Q
K
S
T
V
L
Q
S
F
L
E
A
Y
N
K
K
K
T
G
K
L
V
D
L
S
A
R
N
K
G
N
M
C
H
I
A
T
N
G
L
R
F
P
N
E
T
H
L
R
K
K
F
K
M
M
N
G
L
R
F
P
C
R
G
G
Y
M
N
P
I
F
Y
Y
A
T
N
R
K
Y
E
Q
G
L
I
S
Y
T
V
T
L
N
A
Q
V
Q
Q
L
R
E
V
L
E
D
N
G
E
L
R
A
M
E
K
L
E
T
E
K
S
T
V
L
Q
S
F
L
K
K
M
E
D
N
E
E
N
A
Y
A
T
D
K
G
Y
A
A
I
Q
A
V
D
C
T


F
T
Q
F
M
E
E
V
K
K
A
R
A
M
M
M
M
M
M
M
K
F
V
I
L
V
I
D
E
F
Y
N
L
L
K
T
T
T
R
Q
K
S
Q
Q
I
Q
A
I
M
A
I
P
Q
R
R
Q
Q
Q
R
R
Q
Q
Q
Q
Q
Q
Q
Q
Q
Q
Q
Q
Q
Q
R
D
E
R
E
V
V
A
A
I
P
Q
R
R
E
N
K
T
D
Q
Q
T
E
A
D
V
E
A
F
A
R
A
D
Y
E
R
I
H
Q
Q
A
V
A
M
D
S
L
S
E
S
V
R
R
E
I
I
N
A
D
S
P
H
L
T
T
R
Q
K
S
R
I
H
Q
Q
A
V
A
R
F
S
P
A
A
E
G
A
P
P
S
V
I
D
E
F
Y
L
G
G
S
Y
K
V
R
F
T
Q
F
M
E
E
L
T
I
G
L
L
V
V
A
A
I
M
E
T
R
V
A
G
Q
G
Q
P
K
R
V
G
S
Q
A
G
W
N
K
G
D
S
Q
K
L
M
T
R
V
G
R
W
T
L
A
Q
L
R
K
G
D
S
Q
K
L
M
T
N
F
G
T
P
A
Q
A
N
A
T
M
E
T
R
V
A
G
Q
G
G
Q
G
Q
P
K
R
V
G
R
W
T
L
A
Q
T
D
G
I
I
P
S
Q
A
G
W
N
K
G
D
L
A
Q
L
R
Q
T
D
G
I
I
P
L
M
T
N
F
G
T
P
R
N
T
Q
T
R
V
T
P
R
N
T
Q
T
R
V
K
S
E
N
L
L
N
A
K
T
I
A
I
Q
G
S
G
W
G
W
W
G
W
L
A
Y
D
K
E
M
K
R
L
Q
I
P
L
F
C
I
D
V
W
E
H
A
Y
Y
L
L
P
D
L
P
Y
D
Y
G
A
L
E
P
P
N
Q
D
L
L
E
P
T
T
G
L
I
K
D
F
G
S
L
E
T
M
I
D
K
L
N
A
K
E
A
L
A
K
G
D
T
Q
A
A
V
A
G
T
A
V
K
N
S
G
E
P
N
S
E
M
N
F
N
T
G
G
H
I
N
H
I
M
Q
V
H
H
G
K
H
H
A
A
Y
V
V
K
A
I
W
K
I
A
N
W
K
I
I
S
D
M
M
M
M
N
L
I
I
G
V
A
G


0        None
1        None
2        None
3        None
4        None
         ... 
88837    None
88838    None
88839    None
88840    None
88841    None
Length: 88842, dtype: object

In [169]:
for letter in valid_letters:
    df['feat_perc_{}'.format(letter)] = 0
    
df.columns

<pandas.core.indexing._iLocIndexer at 0x216081656d8>

In [174]:
peptides_feature_generator(df.iloc[0])

L
L
L
L
L
L
L
L
D
V
H
I
E
S
G
{'A': 0, 'C': 0, 'D': 1, 'E': 1, 'F': 0, 'G': 1, 'H': 1, 'I': 1, 'K': 0, 'L': 1, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 1, 'T': 0, 'V': 1, 'W': 0, 'Y': 0}
15


In [133]:
df = pd.DataFrame([[1,2,3],[4,5,6]], columns=['a','b','c'])

def rowFunc(row):
    return row['a'] + row['b'] * row['c']

def rowIndex(row):
    return row.name

df['d'] = df.apply(rowFunc, axis=1)

df['rowIndex'] = df.apply(rowIndex, axis=1)
df


Unnamed: 0,a,b,c,d,rowIndex
0,1,2,3,7,0
1,4,5,6,34,1


In [143]:
df['Info_window_seq'].value_counts()

KTHFEVQSEASYSDA    7
KRDVSKTPQQETETS    6
AFIGQPAPNFKTTAV    6
EENSYIKTHFEVQSE    6
DFKEISLNQFKGKYV    6
                  ..
RAAXEKDTVEVVAVN    1
SSSSSENLAEIPEEV    1
MMMMMMDAPPAPPPP    1
PSRKKLLKRKRYGGR    1
MMNGLRFPNETHLRT    1
Name: Info_window_seq, Length: 39755, dtype: int64

In [144]:
df[df['Info_window_seq'] == 'KTHFEVQSEASYSDA']

Unnamed: 0,Info_epitope_id,Info_sourceOrg_id,Info_protein_id,Info_host_id,Info_n_Positive,Info_n_Negative,Info_TSeq_accver,Info_TSeq_taxid,Info_TSeq_orgname,Info_center_pos,Info_window_seq,Class
12450,852162,6282,A0A044RXP7,96069606,1,1,,6282,Onchocerca volvulus,58,KTHFEVQSEASYSDA,1
12594,856598,6282,A0A044RXP7,96069606,2,0,,6282,Onchocerca volvulus,58,KTHFEVQSEASYSDA,1
34144,852324,6282,A0A044UK00,96069606,0,2,,6282,Onchocerca volvulus,5454,KTHFEVQSEASYSDA,-1
41785,856067,6282,A0A044UK00,96069606,0,2,,6282,Onchocerca volvulus,5337,KTHFEVQSEASYSDA,-1
41800,856068,6282,A0A044UK00,96069606,2,0,,6282,Onchocerca volvulus,5571,KTHFEVQSEASYSDA,1
43618,856597,6282,A0A044UK00,96069606,1,1,,6282,Onchocerca volvulus,5454,KTHFEVQSEASYSDA,1
45750,857471,6282,A0A044UK00,96069606,0,2,,6282,Onchocerca volvulus,5337,KTHFEVQSEASYSDA,-1


array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])