In [1]:
# import basic python libraries
import numpy as np
import pandas as pd

# load sequence data
sequence_data = pd.read_csv('Waltz_and_AAIndex1_Data_Filtered')

features = sequence_data.drop(['Classification', 'Sequence'], axis = 1)
targets = sequence_data['Classification']

In [2]:
# remove orthogonal vectors and save them to a dataframe called removed
removed = pd.DataFrame()
for i in range(0, 6):
    for j in range(0, 20):
        removed['pos' + str(i) + '_orth_' + str(j)] = features['pos' + str(i) + '_orth_' + str(j)]
        features = features.drop(['pos' + str(i) + '_orth_' + str(j)], axis = 1)

In [3]:
# get list of unique property indexes
properties = set([x[5:] for x in features])

# take the mean, max, min and range of each property for each hexapeptide
for index in properties:
    six_features = [features['pos' + str(x) + '_' + index] for x in range(0, 6)]
    features[index + '_mean'] = np.mean(six_features, axis = 0)
    features[index + '_max'] = np.max(six_features, axis = 0)
    features[index + '_min'] = np.min(six_features, axis = 0)
    features[index + '_range'] = features[index + '_max'] - features[index + '_min']
    features = features.drop(['pos' + str(x) + '_' + index for x in range(0, 6)], axis = 1)

In [4]:
# import feature selection libraries
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

# calculate 20 best features
selected_features = SelectKBest(mutual_info_classif, k = 50)
selected_features.fit(features, targets)

SelectKBest(k=50,
      score_func=<function mutual_info_classif at 0x0000028282291488>)

In [5]:
# list features in order of importance
features_and_scores = list(zip(features.columns.values, list(selected_features.scores_)))
features_and_scores = sorted(features_and_scores, key = lambda x: x[1], reverse = True)
features_and_scores

[('BASU050102_mean', 0.12266488490257643),
 ('ROBB760106_mean', 0.12044400113392273),
 ('KRIW790101_mean', 0.11587806183599003),
 ('CORJ870108_mean', 0.11177882755099433),
 ('JACR890101_range', 0.11122659883469077),
 ('CORJ870101_mean', 0.11111514385607535),
 ('BULH740101_mean', 0.11099566458142318),
 ('KRIW790102_mean', 0.11064620907290079),
 ('HOPT810101_mean', 0.10900668228797694),
 ('QIAN880120_mean', 0.10743898149199937),
 ('CORJ870103_mean', 0.10632347729667058),
 ('PONP930101_mean', 0.10626715999428749),
 ('CORJ870104_mean', 0.10612032645143743),
 ('FUKS010102_mean', 0.1061116263558497),
 ('CASG920101_mean', 0.10595383689796023),
 ('ZHOH040103_mean', 0.10546854999386013),
 ('WERD780101_mean', 0.10513866871730015),
 ('MIYS990104_mean', 0.10494962662600282),
 ('GEIM800107_mean', 0.10460343421844098),
 ('GUYH850101_mean', 0.10456930394692132),
 ('ZHOH040101_mean', 0.10398999418649457),
 ('LEVM780106_mean', 0.10338599308671226),
 ('CORJ870106_mean', 0.10196220127447053),
 ('MUNV9401

In [6]:
chosen = ['VENT840101_mean', 'ROBB760106_mean',  'BASU050102_mean', 'GUYH850101_mean',
          'CASG920101_mean', 'CORJ870108_mean',  'KRIW790101_mean', 'CORJ870101_mean',  
          'BULH740101_mean', 'PONP800101_mean',  'KANM800102_mean', 'ZHOH040103_mean',  
          'ZHOH040101_mean', 'ENGD860101_range', 'FUKS010102_mean', 'RACS770103_mean',  
          'MIYS990104_mean', 'CORJ870103_mean',  'GEIM800107_mean', 'LEVM780106_mean', 
         ]

# filter out unwanted columns so only 20 best remain
chosen_features = features[chosen]

# add back orthogonal vectors and classifications
chosen_features = removed.join(chosen_features)
chosen_features = pd.DataFrame(targets).join(chosen_features)

In [7]:
# save to new .csv file
chosen_features.to_csv('Waltz_and_AAIndex1_Data_Filtered_Best_Features', sep = ',', index = False)