In [1]:
# import basic python libraries
import numpy as np
import pandas as pd

# load sequence data
sequence_data = pd.read_csv('Waltz_and_AAIndex1_Data_Filtered')

features = sequence_data.drop(['Classification', 'Sequence'], axis = 1)
targets = sequence_data['Classification']

In [2]:
# remove orthogonal vectors and save them to a dataframe called removed
removed = pd.DataFrame()
for i in range(0, 6):
    for j in range(0, 20):
        removed['pos' + str(i) + '_orth_' + str(j)] = features['pos' + str(i) + '_orth_' + str(j)]
        features = features.drop(['pos' + str(i) + '_orth_' + str(j)], axis = 1)

In [3]:
# get list of unique property indexes
properties = set([x[5:] for x in features])

# take the mean, max, min and range of each property for each hexapeptide
for index in properties:
    six_features = [features['pos' + str(x) + '_' + index] for x in range(0, 6)]
    features[index + '_mean'] = np.mean(six_features, axis = 0)
    features[index + '_max'] = np.max(six_features, axis = 0)
    features[index + '_min'] = np.min(six_features, axis = 0)
    features[index + '_range'] = features[index + '_max'] - features[index + '_min']
    features = features.drop(['pos' + str(x) + '_' + index for x in range(0, 6)], axis = 1)

In [4]:
# import feature selection libraries
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

# calculate 20 best features
selected_features = SelectKBest(mutual_info_classif, k = 50)
selected_features.fit(features, targets)

SelectKBest(k=50,
      score_func=<function mutual_info_classif at 0x00000189BC66DA60>)

In [5]:
# list features in order of importance
features_and_scores = list(zip(features.columns.values, list(selected_features.scores_)))
features_and_scores = sorted(features_and_scores, key = lambda x: x[1], reverse = True)
features_and_scores

[('GUYH850101_mean', 0.12307204858446874),
 ('BASU050102_mean', 0.12295275774895464),
 ('ROBB760106_mean', 0.11833745124466155),
 ('KRIW790101_mean', 0.11514571428045994),
 ('BULH740101_mean', 0.11405084330063486),
 ('CORJ870108_mean', 0.112115649769015),
 ('HOPT810101_mean', 0.1120186734424502),
 ('CORJ870101_mean', 0.11092770792366791),
 ('MIYS990105_mean', 0.11087195451529785),
 ('WERD780101_mean', 0.11037251805127912),
 ('MIYS990104_mean', 0.11001543823228377),
 ('ZHOH040101_mean', 0.10933676954054894),
 ('EISD860101_mean', 0.10904564167207642),
 ('FUKS010102_mean', 0.10879412733299376),
 ('ROSG850102_mean', 0.10840861575257099),
 ('KANM800102_mean', 0.1079113130016196),
 ('ZHOH040103_mean', 0.10469802079199342),
 ('CORJ870103_mean', 0.10435957762328685),
 ('PRAM900101_range', 0.10278855385790187),
 ('MIYS990102_mean', 0.10201937037051412),
 ('PONP930101_mean', 0.10194663342407506),
 ('CASG920101_mean', 0.1018250494512134),
 ('BASU050101_mean', 0.10122290399984357),
 ('CORJ870106_r

In [7]:
chosen_Best = ['VENT840101_mean', 'ROBB760106_mean',  'BASU050102_mean', 'GUYH850101_mean',
          'CASG920101_mean', 'CORJ870108_mean',  'KRIW790101_mean', 'CORJ870101_mean',  
          'BULH740101_mean', 'PONP800101_mean',  'KANM800102_mean', 'ZHOH040103_mean',  
          'ZHOH040101_mean', 'ENGD860101_range', 'FUKS010102_mean', 'RACS770103_mean',  
          'MIYS990104_mean', 'CORJ870103_mean',  'GEIM800107_mean', 'LEVM780106_mean', 
         ]

chosen_Waltz = ['pos3_NOZY710101', 'pos4_NOZY710101', 'pos4_VASM830103', 'pos1_PALJ810104', 
          'pos2_PALJ810104', 'pos0_CHOP780206', 'pos1_ROBB760102', 'pos0_GEIM800107', 
          'pos1_GEIM800107', 'pos2_GEIM800107', 'pos2_GARJ730101', 'pos0_FAUJ880110', 
          'pos1_FAUJ880110', 'pos2_FAUJ880110', 'pos3_FAUJ880110', 'pos4_FAUJ880110',
          'pos0_VENT840101', 'pos0_RACS820114', 'pos2_RACS820114', 'pos4_RACS820114', 
          'pos3_ONEK900102', 'pos4_ONEK900102', 'pos5_ONEK900102', 'pos1_PTIO830102', 
          'pos2_PTIO830102', 'pos5_FINA910102', 'pos5_MAXF760104', 'pos0_ZIMJ680103', 
          'pos5_ZIMJ680103', 'pos1_QIAN880123', 'pos5_AURR980106', 'pos0_FINA910102', 
          'pos1_FINA910102', 'pos2_FINA910102', 'pos3_FINA910102', 'pos4_FINA910102'
         ]

# filter out unwanted columns so only 20 best remain
chosen_features = features[chosen_Best.append(chosen_Waltz)]

# add back orthogonal vectors and classifications
chosen_features = removed.join(chosen_features)
chosen_features = pd.DataFrame(targets).join(chosen_features)

ValueError: cannot label index with a null key

In [None]:
# save to new .csv file
chosen_features.to_csv('Waltz_and_AAIndex1_Data_Filtered_Best_Features', sep = ',', index = False)