# Feature selection - Antimalarial Drug-Decorated Nanoparticles

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

Read the pool dataset with all the features for drugs and nanoparticles:

In [2]:
# Load the pooled dataset; the "Class" column is the target and every other
# column is used as a feature.
df = pd.read_csv('./NanoDrugsMalaria/datasets/ds.raw.csv')

# Feature matrix and target vector as plain NumPy arrays
X = df.drop("Class", axis=1).values
y = df["Class"].values

# define the classes weights
# `classes` and `y` are passed by keyword: recent scikit-learn releases make
# them keyword-only, and the keyword form is also accepted by older releases.
classes = np.unique(y)
cw = compute_class_weight('balanced', classes=classes, y=y)
cw_dic = dict(zip(classes, cw))

# use Standard scaler before classification
scaler = StandardScaler()

# define a classifier as RF (seeded so that re-running the notebook
# reproduces the same forest)
clf = RandomForestClassifier(n_estimators=50, class_weight=cw_dic, random_state=42)

# create a pipeline: scaler + classifier
pipe = Pipeline([('Scaler', scaler),
                 ('Classifier', clf)])

In [None]:
# define 10-fold cross-validation
# One RandomState instance is shared by the splitter and the selector: a full
# re-run of the notebook reproduces the same numbers, while each repetition
# below still draws different folds and different trees.
rng = np.random.RandomState(42)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=rng)

# select Extra Trees classifier for feature selection
alg = ExtraTreesClassifier(n_estimators=100, class_weight=cw_dic, random_state=rng)

# one importance vector per fitted fold; stacked once after the loop instead
# of re-allocating the result array on every iteration
importances = []

for _ in range(3):  # repeat the 10-fold CV three times
    for i_train, _i_test in cv.split(X, y):
        # scale features using statistics of the training fold only; the test
        # fold is not needed because only the importances are collected
        X_train = scaler.fit_transform(X[i_train, :])

        # select the features
        alg.fit(X_train, y[i_train])
        importances.append(alg.feature_importances_)

# shape: (number of fits, number of features)
resultados = np.vstack(importances)
        

Summary of the selected features:

In [None]:
# Average the importances over all cross-validation fits and keep the
# features whose mean importance is at least the overall mean importance.
feature_imp = np.mean(resultados, axis=0)

# Take the names the same way the feature matrix was built (every column
# except "Class"), so they line up with feature_imp wherever "Class" sits
# in the file.
feature_names = np.array(df.drop("Class", axis=1).columns)
seleccion = feature_names[feature_imp >= np.mean(feature_imp)]
print(seleccion)

Pure importance-based selection could exclude every feature of a specific experimental condition, or the probability feature (`prob`).
Therefore, we use a custom feature selection method that adds the probability feature and at least one feature for each experimental condition whenever they are missing.

In [None]:
def _missing_conditions(features, prefix, n_conditions):
    """Return the condition tags ('c0', 'c1', ...) not covered by `features`.

    A condition is covered when at least one name in `features` starts with
    `prefix` and contains the condition tag as a substring.

    Parameters
    ----------
    features : iterable of str
        Names of the currently selected features.
    prefix : str
        Name prefix identifying the feature family.
    n_conditions : int
        Number of conditions to check, tagged 'c0' .. 'c<n_conditions - 1>'.

    Returns
    -------
    list of str
        Tags of the conditions for which no feature was found.
    """
    missing = []
    for i in range(n_conditions):
        tag = 'c' + str(i)
        if not any(name.startswith(prefix) and tag in name for name in features):
            missing.append(tag)
    return missing


# the probability feature must always be part of the selection
if 'prob' not in seleccion:
    print('-> Probability is not presented in the selected features; it will be added to the list!')
    seleccion = np.append(seleccion, 'prob')

# there are 7 experimental conditions for drugs (c0-c6) and 5 in the case of nanoparticles (c0-c4)
falta_np = _missing_conditions(seleccion, 'np', 5)
falta_d = _missing_conditions(seleccion, 'd', 7)

print('* Checking missing experimental conditions in the automaticaly selected features:')
print('For nanoparticles these experimental conditions are missing: {}'.format(falta_np if falta_np else 'None'))
print('For drugs these experimental conditions are missing: {}'.format(falta_d if falta_d else 'None' ))

Append features for the missing experimental conditions:

In [None]:
# Feature names in the same column order as feature_imp (every column except
# "Class", exactly as the feature matrix was built).
feature_names = list(df.drop("Class", axis=1).columns)

# For every uncovered experimental condition, add the single most important
# feature of the corresponding family. Both families are handled: missing
# drug conditions (falta_d) must be filled in just like nanoparticle ones.
# Candidates are matched with the same rule used to detect the missing
# conditions (name prefix + condition tag), so an added feature is guaranteed
# to cover the condition it was added for.
for prefix, faltantes in (('np', falta_np), ('d', falta_d)):
    for feat in faltantes:
        candidatos = [(indice, nombre)
                      for indice, nombre in enumerate(feature_names)
                      if nombre.startswith(prefix) and feat in nombre]
        if not candidatos:
            # the dataset has no feature at all for this condition
            print('-> No "{}" feature found for condition {}; nothing added.'.format(prefix, feat))
            continue
        # max() keeps the first candidate on ties, like np.argmax
        _, mejor = max(candidatos, key=lambda par: feature_imp[par[0]])
        seleccion = np.append(seleccion, mejor)

List the full set of selected features using our custom selection method:

In [None]:
# Rebuild the feature matrix so that it holds only the finally selected
# columns, then report their names as a comma-separated list.
selected_names = ', '.join(seleccion)
X = df[seleccion].values
print('Selected features: \n{}'.format(selected_names))

In [None]:
# ROC AUC of the scaler + classifier pipeline on each cross-validation fold,
# using the reduced feature matrix
scores = cross_val_score(estimator=pipe, X=X, y=y, scoring='roc_auc', cv=cv)

In [9]:
# Report the average AUC over the folds, followed by the per-fold values
mean_auc = np.mean(scores)
print('Mean test AUC for 10-fold CV: {:4.3f}: '.format(mean_auc))
print(scores)

Mean test AUC for 10-fold CV: 0.992: 
[0.99043778 0.99234321 0.99263544 0.99206482 0.99072553 0.99168723
 0.99290171 0.99194402 0.99199546 0.99176663]
