**Data set:**

Ov_data.csv: this is a sample data set related to a specific organism. The important column in this data set is named "Info_window_seq". It contains 15-character strings representing fixed-length protein fragments (called "peptides") generated by running a sliding window over the organism's protein strings.

**Initial features to calculate:**

Proteins are composed of 20 amino acids, each represented by a different letter. Valid letters are: A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y.

1) We need to calculate the percentage of each amino acid in the peptides.
2) This will result in 20 features (columns), which should be called

feat_perc_A, feat_perc_C, ..., feat_perc_Y.

**Notes:**

1) The frequency is calculated as (number_of_occurrences_of_letter / length_of_peptide)

2) If a letter does not occur in a given peptide, it should have a frequency of zero.

3) If an invalid letter appears in the sequence, it should be removed before the frequencies are calculated.


In [208]:
import numpy as np
import pandas as pd

In [387]:
# Load the peptide data set; the column of interest is 'Info_window_seq'.
df = pd.read_csv('Ov_data.csv')

# The 20 valid amino-acid letters. Any other character in a peptide is invalid.
valid_letters = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

# One percentage column per amino acid, initialised with a float (0.0, not 0).
# These columns later receive fractional percentages; writing floats into
# int64 columns relies on silent upcasting, which pandas deprecated in 2.1.
for letter in valid_letters:
    df['feat_perc_{}'.format(letter)] = 0.0

df.head()

Unnamed: 0,Info_epitope_id,Info_sourceOrg_id,Info_protein_id,Info_host_id,Info_n_Positive,Info_n_Negative,Info_TSeq_accver,Info_TSeq_taxid,Info_TSeq_orgname,Info_center_pos,...,feat_perc_M,feat_perc_N,feat_perc_P,feat_perc_Q,feat_perc_R,feat_perc_S,feat_perc_T,feat_perc_V,feat_perc_W,feat_perc_Y
0,854007,6282,A0A044QLL7,96069606,1,1,,6282,Onchocerca volvulus,1,...,0,0,0,0,0,0,0,0,0,0
1,854007,6282,A0A044QLL7,96069606,1,1,,6282,Onchocerca volvulus,2,...,0,0,0,0,0,0,0,0,0,0
2,854007,6282,A0A044QLL7,96069606,1,1,,6282,Onchocerca volvulus,3,...,0,0,0,0,0,0,0,0,0,0
3,854007,6282,A0A044QLL7,96069606,1,1,,6282,Onchocerca volvulus,4,...,0,0,0,0,0,0,0,0,0,0
4,854007,6282,A0A044QLL7,96069606,1,1,,6282,Onchocerca volvulus,5,...,0,0,0,0,0,0,0,0,0,0


**Task 1**
#First check: if an invalid letter appears in the sequence, it should be removed before the frequencies are calculated.
- My guess: exclude the invalid letters from length_of_peptide, i.e. divide by the number of valid letters only.

In [397]:


def peptides_feature_generator(peptide_row, target=None):
    """Compute the percentage of each valid amino acid in one peptide row.

    Characters that are not one of the 20 valid amino-acid letters are dropped
    before the percentages are calculated, so each percentage is
    ``count / number_of_valid_letters * 100``. The whole sequence is scanned,
    whatever its length. A sequence with no valid letters yields all zeros.

    Parameters
    ----------
    peptide_row : pandas.Series
        One row of a frame, as passed by ``DataFrame.apply(..., axis=1)``.
        It must contain ``'Info_window_seq'``; ``peptide_row.name`` is used as
        the row label to write to.
    target : pandas.DataFrame, optional
        Frame whose ``feat_perc_<letter>`` columns receive the results.
        Defaults to the notebook-global ``df`` (the original behaviour), so
        pass the frame explicitly when applying to any other frame, e.g.
        ``df_test.apply(peptides_feature_generator, axis=1, target=df_test)``.

    Returns
    -------
    pandas.Series
        The 20 percentages, indexed by ``feat_perc_<letter>``.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    counts = dict.fromkeys(amino_acids, 0)

    sequence = peptide_row['Info_window_seq']
    if not isinstance(sequence, str):
        # Missing value (e.g. NaN): there are no letters to count.
        sequence = ''

    # 1) Count every valid letter; invalid characters are simply skipped, which
    #    removes them from both the numerator and the denominator.
    for letter in sequence:
        if letter in counts:
            counts[letter] += 1

    # 2) Convert counts to percentages of the valid letters only. Guard the
    #    all-invalid / empty case so it gives zeros instead of dividing by zero.
    n_valid = sum(counts.values())
    percentages = {
        'feat_perc_{}'.format(letter): (count / n_valid) * 100 if n_valid else 0.0
        for letter, count in counts.items()
    }

    # 3) Store the results in the requested frame. Falling back to the global
    #    `df` keeps existing calls that rely on that side effect working.
    if target is None:
        target = df
    for column, value in percentages.items():
        target.loc[peptide_row.name, column] = value

    return pd.Series(percentages)

    
    
    

In [391]:
# Run the generator on the first ten peptides only (quick smoke run); it
# writes the percentages into `df` as a side effect.
first_ten = range(10)
df.loc[first_ten].apply(peptides_feature_generator, axis=1)

# Show just the 20 percentage columns for those rows.
percentage_columns = ['feat_perc_{}'.format(letter) for letter in 'ACDEFGHIKLMNPQRSTVWY']
df.loc[first_ten, percentage_columns]

Unnamed: 0,feat_perc_A,feat_perc_C,feat_perc_D,feat_perc_E,feat_perc_F,feat_perc_G,feat_perc_H,feat_perc_I,feat_perc_K,feat_perc_L,feat_perc_M,feat_perc_N,feat_perc_P,feat_perc_Q,feat_perc_R,feat_perc_S,feat_perc_T,feat_perc_V,feat_perc_W,feat_perc_Y
0,0.0,0.0,6.666667,6.666667,0.0,6.666667,6.666667,6.666667,0.0,53.333333,0.0,0.0,0.0,0.0,0.0,6.666667,0.0,6.666667,0.0,0.0
1,0.0,0.0,6.666667,13.333333,0.0,6.666667,6.666667,6.666667,0.0,46.666667,0.0,0.0,0.0,0.0,0.0,6.666667,0.0,6.666667,0.0,0.0
2,0.0,0.0,6.666667,13.333333,0.0,6.666667,6.666667,6.666667,0.0,40.0,0.0,0.0,0.0,0.0,0.0,6.666667,0.0,13.333333,0.0,0.0
3,0.0,6.666667,6.666667,13.333333,0.0,6.666667,6.666667,6.666667,0.0,33.333333,0.0,0.0,0.0,0.0,0.0,6.666667,0.0,13.333333,0.0,0.0
4,6.666667,6.666667,6.666667,13.333333,0.0,6.666667,6.666667,6.666667,0.0,26.666667,0.0,0.0,0.0,0.0,0.0,6.666667,0.0,13.333333,0.0,0.0
5,6.666667,6.666667,6.666667,13.333333,0.0,6.666667,6.666667,6.666667,0.0,20.0,6.666667,0.0,0.0,0.0,0.0,6.666667,0.0,13.333333,0.0,0.0
6,6.666667,6.666667,6.666667,13.333333,0.0,6.666667,6.666667,6.666667,0.0,13.333333,6.666667,0.0,0.0,0.0,0.0,6.666667,0.0,20.0,0.0,0.0
7,6.666667,6.666667,6.666667,13.333333,6.666667,6.666667,6.666667,6.666667,0.0,6.666667,6.666667,0.0,0.0,0.0,0.0,6.666667,0.0,20.0,0.0,0.0
8,6.666667,6.666667,6.666667,13.333333,6.666667,6.666667,6.666667,6.666667,0.0,0.0,6.666667,0.0,0.0,0.0,6.666667,6.666667,0.0,20.0,0.0,0.0
9,6.666667,6.666667,0.0,13.333333,6.666667,6.666667,6.666667,6.666667,6.666667,0.0,6.666667,0.0,0.0,0.0,6.666667,6.666667,0.0,20.0,0.0,0.0


In [347]:
# Sanity check: list any non-zero percentage that is smaller than one letter
# out of fifteen (about 6.67); an empty result means none was found.
name = 'feat_perc_Y'  # try any of A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y
column = df[name]
df.loc[column.ne(0) & column.lt(6.666666), name]



Series([], Name: feat_perc_Y, dtype: float64)

In [380]:
def test_for_invalidletters(peptide):
    if len(list({'B','J','O','U','X', 'Z'} - set(list(peptide)))) < 6:
        return True
    else:
        return False

**Test case: peptides of varying length and peptides containing invalid letters, to check that the correct result is produced in both situations.**

In [398]:
# Small hand-made frame: one clean 15-mer, one 15-mer with invalid letters
# (B, O) and one longer sequence with a run of invalid letters (Z).
data = {'Info_window_seq': ['KTHFEVQSEASYSDA', 'BBHFEVQSEOSYSDA', 'PSRKKLLKRKRZZZZZGGR']}
df_test = pd.DataFrame(data)

valid_letters = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

# Initialise with a float (0.0, not 0): the columns are meant to hold
# fractional percentages, and writing floats into int64 columns relies on
# silent upcasting, which pandas deprecated in 2.1.
for letter in valid_letters:
    df_test['feat_perc_{}'.format(letter)] = 0.0


df_test


Unnamed: 0,Info_window_seq,feat_perc_A,feat_perc_C,feat_perc_D,feat_perc_E,feat_perc_F,feat_perc_G,feat_perc_H,feat_perc_I,feat_perc_K,...,feat_perc_M,feat_perc_N,feat_perc_P,feat_perc_Q,feat_perc_R,feat_perc_S,feat_perc_T,feat_perc_V,feat_perc_W,feat_perc_Y
0,KTHFEVQSEASYSDA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,BBHFEVQSEOSYSDA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,PSRKKLLKRKRZZZZZGGR,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [399]:
# NOTE: df_test stays all-zero after this call. As called here,
# peptides_feature_generator writes its results into the notebook-global `df`
# (at the row labels of df_test, i.e. rows 0-2), not into the frame it is
# applied to.
df_test.apply(peptides_feature_generator, axis='columns')

df_test

Unnamed: 0,Info_window_seq,feat_perc_A,feat_perc_C,feat_perc_D,feat_perc_E,feat_perc_F,feat_perc_G,feat_perc_H,feat_perc_I,feat_perc_K,...,feat_perc_M,feat_perc_N,feat_perc_P,feat_perc_Q,feat_perc_R,feat_perc_S,feat_perc_T,feat_perc_V,feat_perc_W,feat_perc_Y
0,KTHFEVQSEASYSDA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,BBHFEVQSEOSYSDA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,PSRKKLLKRKRZZZZZGGR,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [143]:
# How often does each distinct peptide occur in the data set?
df.loc[:, 'Info_window_seq'].value_counts()

KTHFEVQSEASYSDA    7
KRDVSKTPQQETETS    6
AFIGQPAPNFKTTAV    6
EENSYIKTHFEVQSE    6
DFKEISLNQFKGKYV    6
                  ..
RAAXEKDTVEVVAVN    1
SSSSSENLAEIPEEV    1
MMMMMMDAPPAPPPP    1
PSRKKLLKRKRYGGR    1
MMNGLRFPNETHLRT    1
Name: Info_window_seq, Length: 39755, dtype: int64

In [144]:
# Inspect every row that carries the peptide 'KTHFEVQSEASYSDA'.
df.loc[df['Info_window_seq'].eq('KTHFEVQSEASYSDA')]

Unnamed: 0,Info_epitope_id,Info_sourceOrg_id,Info_protein_id,Info_host_id,Info_n_Positive,Info_n_Negative,Info_TSeq_accver,Info_TSeq_taxid,Info_TSeq_orgname,Info_center_pos,Info_window_seq,Class
12450,852162,6282,A0A044RXP7,96069606,1,1,,6282,Onchocerca volvulus,58,KTHFEVQSEASYSDA,1
12594,856598,6282,A0A044RXP7,96069606,2,0,,6282,Onchocerca volvulus,58,KTHFEVQSEASYSDA,1
34144,852324,6282,A0A044UK00,96069606,0,2,,6282,Onchocerca volvulus,5454,KTHFEVQSEASYSDA,-1
41785,856067,6282,A0A044UK00,96069606,0,2,,6282,Onchocerca volvulus,5337,KTHFEVQSEASYSDA,-1
41800,856068,6282,A0A044UK00,96069606,2,0,,6282,Onchocerca volvulus,5571,KTHFEVQSEASYSDA,1
43618,856597,6282,A0A044UK00,96069606,1,1,,6282,Onchocerca volvulus,5454,KTHFEVQSEASYSDA,1
45750,857471,6282,A0A044UK00,96069606,0,2,,6282,Onchocerca volvulus,5337,KTHFEVQSEASYSDA,-1


array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])