In [1]:
# import libraries
import re
import pandas as pd
import csv
import numpy as np

In [2]:
# read only the peptide sequence and its classification from the WaltzDB export
waltz_data = pd.read_csv(
    'waltzdb_export.csv',
    usecols=['Sequence', 'Classification'],
)

# read the AAIndex1 flat file into a list of lines (line endings are kept)
with open('AAIndex1.txt') as aaindex_file:
    AAIndex1 = list(aaindex_file)

In [3]:
# empty accumulators that are filled later while reading AAIndex1:
# the property headers and, per property, the list of its values
headers, properties = [], []

# one-letter codes of the 20 natural amino acids, in the same
# order they appear in AAIndex1
amino_acids = list('ARNDCQEGHILKMFPSTWYV')

In [4]:
# Split every peptide into one column per residue position, e.g. 'pos0' = 'A'.
# Each column is built in one vectorised step from the 'Sequence' strings.
# Writing strings cell-by-cell into float NaN columns is slow and is deprecated
# in recent pandas versions as an incompatible-dtype assignment.
longest = waltz_data['Sequence'].str.len().max()
# always create the six hexapeptide positions; add more only if a longer sequence exists
n_positions = 6 if pd.isna(longest) else max(6, int(longest))

for j in range(n_positions):
    # .str[j] yields NaN for sequences that have no residue at position j
    waltz_data['pos' + str(j)] = waltz_data['Sequence'].str[j]
waltz_data

Unnamed: 0,Classification,Sequence,pos0,pos1,pos2,pos3,pos4,pos5
0,non-amyloid,AAAQAA,A,A,A,Q,A,A
1,non-amyloid,AAELRN,A,A,E,L,R,N
2,non-amyloid,AAIDWF,A,A,I,D,W,F
3,non-amyloid,AAIGWG,A,A,I,G,W,G
4,non-amyloid,AALQSS,A,A,L,Q,S,S
5,non-amyloid,AAPKPK,A,A,P,K,P,K
6,non-amyloid,AAQAAL,A,A,Q,A,A,L
7,non-amyloid,AARRFF,A,A,R,R,F,F
8,non-amyloid,AAVDQT,A,A,V,D,Q,T
9,non-amyloid,ACGVIG,A,C,G,V,I,G


In [5]:
# binary (one-hot) encoding of one-letter amino acid abbreviations:
# each abbreviation maps to a 22-character string of '0's with a single '1'
# at the abbreviation's index in the letter sequence below.
amino_dict = {
    letter: '0' * position + '1' + '0' * (21 - position)
    for position, letter in enumerate('ABCDEFGHIKLMNPQRSTVWYZ')
}


def abbrev_to_binary(str):
    """Return the concatenated binary codes of every abbreviation in `str`.

    `str` is iterated character by character and each character is looked up
    in `amino_dict`; an empty input gives an empty string.

    Raises:
        KeyError: if a character is not a key of `amino_dict`.
    """
    return ''.join(amino_dict[s] for s in str)

In [6]:
# Add one 'posN_orth' column per residue position, holding the binary string
# that abbrev_to_binary returns for the residue at that position.
# Each column is computed in one vectorised step. Writing strings cell-by-cell
# into float NaN columns is slow and is deprecated in recent pandas versions
# as an incompatible-dtype assignment.
longest = waltz_data['Sequence'].str.len().max()
# always create the six hexapeptide positions; add more only if a longer sequence exists
n_positions = 6 if pd.isna(longest) else max(6, int(longest))

for j in range(n_positions):
    # residue at position j of every sequence (NaN where the sequence is shorter)
    residues = waltz_data['Sequence'].str[j]
    # na_action='ignore' leaves those NaN entries untouched instead of encoding them;
    # unknown abbreviations still raise KeyError inside abbrev_to_binary
    waltz_data['pos' + str(j) + '_orth'] = residues.map(abbrev_to_binary, na_action='ignore')
waltz_data

Unnamed: 0,Classification,Sequence,pos0,pos1,pos2,pos3,pos4,pos5,pos0_orth,pos1_orth,pos2_orth,pos3_orth,pos4_orth,pos5_orth
0,non-amyloid,AAAQAA,A,A,A,Q,A,A,1000000000000000000000,1000000000000000000000,1000000000000000000000,0000000000000010000000,1000000000000000000000,1000000000000000000000
1,non-amyloid,AAELRN,A,A,E,L,R,N,1000000000000000000000,1000000000000000000000,0000100000000000000000,0000000000100000000000,0000000000000001000000,0000000000001000000000
2,non-amyloid,AAIDWF,A,A,I,D,W,F,1000000000000000000000,1000000000000000000000,0000000010000000000000,0001000000000000000000,0000000000000000000100,0000010000000000000000
3,non-amyloid,AAIGWG,A,A,I,G,W,G,1000000000000000000000,1000000000000000000000,0000000010000000000000,0000001000000000000000,0000000000000000000100,0000001000000000000000
4,non-amyloid,AALQSS,A,A,L,Q,S,S,1000000000000000000000,1000000000000000000000,0000000000100000000000,0000000000000010000000,0000000000000000100000,0000000000000000100000
5,non-amyloid,AAPKPK,A,A,P,K,P,K,1000000000000000000000,1000000000000000000000,0000000000000100000000,0000000001000000000000,0000000000000100000000,0000000001000000000000
6,non-amyloid,AAQAAL,A,A,Q,A,A,L,1000000000000000000000,1000000000000000000000,0000000000000010000000,1000000000000000000000,1000000000000000000000,0000000000100000000000
7,non-amyloid,AARRFF,A,A,R,R,F,F,1000000000000000000000,1000000000000000000000,0000000000000001000000,0000000000000001000000,0000010000000000000000,0000010000000000000000
8,non-amyloid,AAVDQT,A,A,V,D,Q,T,1000000000000000000000,1000000000000000000000,0000000000000000001000,0001000000000000000000,0000000000000010000000,0000000000000000010000
9,non-amyloid,ACGVIG,A,C,G,V,I,G,1000000000000000000000,0010000000000000000000,0000001000000000000000,0000000000000000001000,0000000010000000000000,0000001000000000000000


In [7]:
# regular expression for an optionally negative decimal number; the dot is
# escaped so that it matches only a literal decimal point, not any character
float_regex = r'(-?[0-9]+\.?[0-9]*)'

# start from empty accumulators so that re-running this cell does not
# append the same entries a second time
headers = []
properties = []

# read through AAIndex, and accumulate lists of headers and contents
for i, line in enumerate(AAIndex1):
    # each property header begins with H; drop the two leading characters
    # and the line ending to keep only the header text
    if line.startswith('H'):
        headers.append(line[2:].rstrip('\n'))
    # each property content begins with I; its values are read from the two
    # lines that follow (slicing avoids an IndexError at the end of the file)
    if line.startswith('I'):
        value_lines = AAIndex1[i + 1:i + 3]
        # convert while collecting, so that every stored value really is a float
        properties.append([
            float(value)
            for value_line in value_lines
            for value in re.findall(float_regex, value_line)
        ])

In [8]:
# one dictionary per property, mapping each amino acid abbreviation to the
# value at the same index in that property's value list
property_dict_list = [dict(zip(amino_acids, values)) for values in properties]

# key every property dictionary by its header, giving a dictionary of
# dictionaries, e.g. {'AURR980108' : {'A' : ..., 'R' : ..., ...}}
properties = dict(zip(headers, property_dict_list))

In [9]:
# collect the headers of all properties whose dictionary does not hold
# exactly 20 entries (one per amino acid)
incomplete_list = [
    header
    for header, values in properties.items()
    if len(values) != 20
]

# delete those properties; done in a second pass because a dictionary
# must not change size while it is being iterated
for header in incomplete_list:
    del properties[header]

In [10]:
# select columns of interest from waltz_data
chosen_headers = ['Sequence', 'Classification']
listofLists = waltz_data[chosen_headers].values.tolist()

# create one header for each property in each hexapeptide position
AAIndex_data = []
# copy the list: appending to an alias would also grow chosen_headers
final_headers = list(chosen_headers)
for property_name in properties:
    # positions are numbered 0..5, matching the 'pos0'..'pos5' columns of waltz_data
    for i in range(0, 6):
        # add on to list of headers, e.g. pos2_ARGP820101
        final_headers.append('pos' + str(i) + '_' + property_name)

# add headers to final AAIndex data
AAIndex_data.append(final_headers)

In [11]:
# Create one NaN-filled column for each property in each hexapeptide position,
# e.g. pos2_ARGP820101. All columns are built as a single block and attached
# with one concat: inserting a large number of columns one at a time fragments
# the DataFrame and makes pandas emit a PerformanceWarning.
new_columns = [
    'pos' + str(i) + '_' + property_name
    for property_name in properties
    for i in range(0, 6)
]
nan_block = pd.DataFrame(np.nan, index=waltz_data.index, columns=new_columns)

# drop same-named columns left over from an earlier run, so that re-running
# this cell resets them to NaN instead of duplicating them
waltz_data = pd.concat(
    [waltz_data.drop(columns=new_columns, errors='ignore'), nan_block],
    axis=1,
)

waltz_data

Unnamed: 0,Classification,Sequence,pos0,pos1,pos2,pos3,pos4,pos5,pos0_orth,pos1_orth,...,pos2_KARS160121,pos3_KARS160121,pos4_KARS160121,pos5_KARS160121,pos0_KARS160122,pos1_KARS160122,pos2_KARS160122,pos3_KARS160122,pos4_KARS160122,pos5_KARS160122
0,non-amyloid,AAAQAA,A,A,A,Q,A,A,1000000000000000000000,1000000000000000000000,...,,,,,,,,,,
1,non-amyloid,AAELRN,A,A,E,L,R,N,1000000000000000000000,1000000000000000000000,...,,,,,,,,,,
2,non-amyloid,AAIDWF,A,A,I,D,W,F,1000000000000000000000,1000000000000000000000,...,,,,,,,,,,
3,non-amyloid,AAIGWG,A,A,I,G,W,G,1000000000000000000000,1000000000000000000000,...,,,,,,,,,,
4,non-amyloid,AALQSS,A,A,L,Q,S,S,1000000000000000000000,1000000000000000000000,...,,,,,,,,,,
5,non-amyloid,AAPKPK,A,A,P,K,P,K,1000000000000000000000,1000000000000000000000,...,,,,,,,,,,
6,non-amyloid,AAQAAL,A,A,Q,A,A,L,1000000000000000000000,1000000000000000000000,...,,,,,,,,,,
7,non-amyloid,AARRFF,A,A,R,R,F,F,1000000000000000000000,1000000000000000000000,...,,,,,,,,,,
8,non-amyloid,AAVDQT,A,A,V,D,Q,T,1000000000000000000000,1000000000000000000000,...,,,,,,,,,,
9,non-amyloid,ACGVIG,A,C,G,V,I,G,1000000000000000000000,0010000000000000000000,...,,,,,,,,,,


In [12]:
# Fill every 'posN_<property>' column by looking up the residue stored in
# 'posN' in that property's {amino acid: value} dictionary.
# The lookup is done per column with Series.map rather than per cell:
# DataFrame.get_value no longer exists in current pandas, and a
# properties x rows x positions Python loop of .loc writes is very slow.
longest = waltz_data['Sequence'].str.len().max()
# cover the six hexapeptide positions; add more only if a longer sequence exists
n_positions = 6 if pd.isna(longest) else max(6, int(longest))

looked_up = {}
for property_name, value_by_residue in properties.items():
    for j in range(n_positions):
        # residues missing from the dictionary (or NaN) map to NaN
        looked_up['pos' + str(j) + '_' + property_name] = (
            waltz_data['pos' + str(j)].map(value_by_residue)
        )
looked_up = pd.DataFrame(looked_up, index=waltz_data.index)

# replace any existing placeholder columns of the same name in one concat,
# which avoids fragmenting the DataFrame with many single-column inserts
waltz_data = pd.concat(
    [waltz_data.drop(columns=looked_up.columns, errors='ignore'), looked_up],
    axis=1,
)
waltz_data

Unnamed: 0,Classification,Sequence,pos0,pos1,pos2,pos3,pos4,pos5,pos0_orth,pos1_orth,...,pos2_KARS160121,pos3_KARS160121,pos4_KARS160121,pos5_KARS160121,pos0_KARS160122,pos1_KARS160122,pos2_KARS160122,pos3_KARS160122,pos4_KARS160122,pos5_KARS160122
0,non-amyloid,AAAQAA,A,A,A,Q,A,A,1000000000000000000000,1000000000000000000000,...,6.00,10.50,6.00,6.00,0.00,0.00,0.00,1.849,0.00,0.00
1,non-amyloid,AAELRN,A,A,E,L,R,N,1000000000000000000000,1000000000000000000000,...,10.667,9.60,10.667,10.00,0.00,0.00,1.822,3.113,4.20,3.00
2,non-amyloid,AAIDWF,A,A,I,D,W,F,1000000000000000000000,1000000000000000000000,...,9.60,10.40,12.75,12.00,0.00,0.00,3.373,2.969,2.044,2.026
3,non-amyloid,AAIGWG,A,A,I,G,W,G,1000000000000000000000,1000000000000000000000,...,9.60,3.50,12.75,3.50,0.00,0.00,3.373,0.00,2.044,0.00
4,non-amyloid,AALQSS,A,A,L,Q,S,S,1000000000000000000000,1000000000000000000000,...,9.60,10.50,8.667,8.667,0.00,0.00,3.113,1.849,6.00,6.00
5,non-amyloid,AAPKPK,A,A,P,K,P,K,1000000000000000000000,1000000000000000000000,...,12.00,10.167,12.00,10.167,0.00,0.00,12.00,1.372,12.00,1.372
6,non-amyloid,AAQAAL,A,A,Q,A,A,L,1000000000000000000000,1000000000000000000000,...,10.50,6.00,6.00,9.60,0.00,0.00,1.849,0.00,0.00,3.113
7,non-amyloid,AARRFF,A,A,R,R,F,F,1000000000000000000000,1000000000000000000000,...,10.667,10.667,12.00,12.00,0.00,0.00,4.20,4.20,2.026,2.026
8,non-amyloid,AAVDQT,A,A,V,D,Q,T,1000000000000000000000,1000000000000000000000,...,9.00,10.40,10.50,9.00,0.00,0.00,6.00,2.969,1.849,6.00
9,non-amyloid,ACGVIG,A,C,G,V,I,G,1000000000000000000000,0010000000000000000000,...,3.50,9.00,9.60,3.50,0.00,6.00,0.00,6.00,3.373,0.00


In [13]:
# write the combined table as comma-separated text, without the row index
waltz_data.to_csv(
    'Waltz_and_AAIndex1',
    index=False,
    sep=',',
)