In [9]:
# import basic python libraries
import numpy as np
import pandas as pd

# Read the Waltz table from the 'Waltz_Data_Filtered' file (parsed as CSV).
waltz_data = pd.read_csv('Waltz_Data_Filtered')

# Keep the AAIndex1 text as a list of lines; every line retains its
# trailing newline, which the parsing code further down relies on.
with open('AAIndex1.txt') as aaindex_file:
    AAIndex1 = list(aaindex_file)

In [2]:
# Accumulators filled while parsing AAIndex1: one header string and one
# list of values per property entry.
headers, properties = [], []

# One-letter codes of the 20 natural amino acids, in the same order they
# appear in AAIndex1 (two rows of ten).
amino_acids = [
    'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
    'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V',
]

In [4]:
# import regular expressions library
import re

# Regular expression for a single numeric value.
# The decimal point is escaped: an unescaped '.' matches ANY character, so
# single-space-separated integers such as '5 6' were captured as one token.
# A leading-dot form ('.5', '-.5') and an optional exponent are accepted too,
# so the sign of values like '-.5' is no longer dropped.
float_regex = r'(-?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?)'

# Start from empty accumulators so that re-running this cell does not append
# every entry a second time (and still works after `properties` has been
# rebound to a dictionary further down the notebook).
headers = []
properties = []

# Read through AAIndex1 and accumulate the header and the values of each entry.
for line_number, line in enumerate(AAIndex1):
    # each property header begins with H; drop the 2-character prefix and the
    # trailing newline
    if line.startswith('H'):
        headers.append(line[2:].rstrip('\n'))
    # each property content begins with I; the values are read from the two
    # lines that follow it. Slicing (instead of indexing i + 1 / i + 2) cannot
    # raise IndexError when an 'I' line sits at the very end of the file.
    if line.startswith('I'):
        tokens = []
        for value_line in AAIndex1[line_number + 1:line_number + 3]:
            tokens.extend(re.findall(float_regex, value_line))
        # Store real floats. The previous conversion loop only rebound its
        # loop variable and left every stored value as a string.
        # Tokens that are not numeric (they do not match float_regex) are
        # skipped, so such entries end up with fewer values.
        properties.append([float(token) for token in tokens])

In [5]:
# Build one {amino acid: value} dictionary per parsed property by pairing the
# amino-acid letters with that property's values, position by position.
# zip() stops at the shorter input, so a property with fewer values than
# amino acids yields a shorter dictionary.
property_dict_list = [dict(zip(amino_acids, values)) for values in properties]

# Link each header to its dictionary, giving a dictionary of dictionaries,
# e.g. {'AURR980108' : {'A' : ..., 'R' : ..., ...}}.
# NOTE: this rebinds `properties` from a list of value lists to that mapping.
properties = {
    header: residue_values
    for header, residue_values in zip(headers, property_dict_list)
}

In [6]:
# Collect the headers of properties whose dictionary does not hold exactly
# 20 entries (one per amino acid).
incomplete_list = [name for name in properties if len(properties[name]) != 20]

# Drop those properties; the names were collected first because a dictionary
# must not change size while it is being iterated.
for header in incomplete_list:
    properties.pop(header)

In [7]:
# select columns of interest from waltz_data
chosen_headers = ['Sequence', 'Classification']
listofLists = waltz_data[chosen_headers].values.tolist()

# create one header for each property in each hexapeptide position
AAIndex_data = []
# Copy the list: `final_headers = chosen_headers` would only alias it, so every
# append below would also grow chosen_headers.
final_headers = list(chosen_headers)
for property_name in properties:
    # Positions are numbered 0..5 so the headers agree with the pos0_..pos5_
    # column names used for the data frame columns.
    for position in range(0, 6):
        # add on to list of headers, e.g. pos2_ARGP820101
        final_headers.append('pos' + str(position) + '_' + property_name)

# add headers to final AAIndex data
AAIndex_data.append(final_headers)

In [11]:
# Names of the per-position property columns, e.g. pos2_ARGP820101:
# positions 0..5 for every property, grouped by property.
placeholder_columns = [
    'pos' + str(position) + '_' + property_name
    for property_name in properties
    for position in range(0, 6)
]

# Add each column to waltz_data (or reset it, if present) filled with NaN.
for column_name in placeholder_columns:
    waltz_data[column_name] = np.nan

In [13]:
# Assign a property value to every pos<j>_<property> column by looking up the
# residue at position j of each sequence in the dictionary of dictionaries.
#
# This is vectorised per column. DataFrame.get_value() no longer exists in
# current pandas, and writing one cell at a time through .loc is very slow
# and only works with a default 0..n-1 index.
sequences = waltz_data['Sequence']

# The longest sequence decides how many positions are filled; an empty table
# (or one without any sequence) has none.
longest_sequence = sequences.str.len().max()
position_count = 0 if pd.isna(longest_sequence) else int(longest_sequence)

# The residue letter at each position is the same for every property, so it is
# extracted once. Sequences shorter than the position give NaN.
residues_by_position = [sequences.str[position] for position in range(position_count)]

for property_name, residue_values in properties.items():
    for position, residues in enumerate(residues_by_position):
        # Residues missing from the property's dictionary (and NaN residues)
        # map to NaN.
        waltz_data['pos' + str(position) + '_' + property_name] = residues.map(residue_values)

Unnamed: 0,Classification,Sequence,pos0_orth_0,pos0_orth_1,pos0_orth_2,pos0_orth_3,pos0_orth_4,pos0_orth_5,pos0_orth_6,pos0_orth_7,...,pos2_NAKH900105,pos3_NAKH900105,pos4_NAKH900105,pos5_NAKH900105,pos0_MIYS990105,pos1_MIYS990105,pos2_MIYS990105,pos3_MIYS990105,pos4_MIYS990105,pos5_MIYS990105
0,0,AAAQAA,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.88,2.30,5.88,5.88,-0.02,-0.02,-0.02,0.15,-0.02,-0.02
1,0,AAELRN,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.60,16.52,1.54,4.38,-0.02,-0.02,0.21,-0.32,0.08,0.10
2,0,AAIDWF,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.78,1.70,2.89,6.58,-0.02,-0.02,-0.28,0.19,-0.27,-0.33
3,0,AALQSS,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,16.52,2.30,7.68,7.68,-0.02,-0.02,-0.32,0.15,0.11,0.11
4,0,AAPKPK,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.29,2.58,5.29,2.58,-0.02,-0.02,0.11,0.30,0.11,0.30
5,0,AAQAAL,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.30,5.88,5.88,16.52,-0.02,-0.02,0.15,-0.02,-0.02,-0.32
6,0,AARRFF,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.54,1.54,6.58,6.58,-0.02,-0.02,0.08,0.08,-0.33,-0.33
7,0,AAVDQT,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.66,1.70,2.30,8.38,-0.02,-0.02,-0.23,0.19,0.15,0.05
8,0,ACGVIG,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.29,4.66,8.78,5.29,-0.02,-0.32,-0.02,-0.23,-0.28,-0.02
9,0,ADVGQG,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.66,5.29,2.30,5.29,-0.02,0.19,-0.23,-0.02,0.15,-0.02


In [14]:
# Write the table as comma-separated text, leaving out the row index.
output_path = 'Waltz_and_AAIndex1_Data_Filtered'
waltz_data.to_csv(output_path, sep=',', index=False)