## Feature Selection

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

In [None]:
# Load the raw dataset. The target is the "Class" column; every other column
# is used as a feature.
df = pd.read_csv('./datasets/ds.raw.csv')

y = df["Class"].values
X = df.drop("Class", axis=1).values

# 'balanced' class weights, one per distinct label in y.
# The arguments are passed by keyword: recent scikit-learn releases declare
# them keyword-only, so the positional form raises a TypeError there.
classes = np.unique(y)
cw = compute_class_weight(class_weight='balanced', classes=classes, y=y)
# Map each class label to its weight, in the form RandomForestClassifier expects.
cw_dic = dict(zip(classes, cw))
scaler = StandardScaler()

# Scaler + random forest pipeline, evaluated later with cross-validation.
clf = RandomForestClassifier(n_estimators=100, class_weight=cw_dic)
pipe = Pipeline([('Scaler', scaler),
                 ('Classifier', clf)])

In [None]:
# Estimate feature importances with a random forest fitted on the training
# part of every fold, over 3 repetitions of a shuffled stratified 10-fold
# split (30 fits in total).
#
# A RandomState *instance* (not an int) seeds the splitter and the forest:
# the run becomes reproducible after a kernel restart, while each call to
# cv.split() still draws a different shuffle. An int seed would make the
# three repetitions reuse exactly the same folds.
rng = np.random.RandomState(42)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=rng)

alg = RandomForestClassifier(n_estimators=100, class_weight=cw_dic, random_state=rng)

# One importance vector per fit; stacked once at the end instead of
# re-allocating the whole array with np.vstack on every iteration.
importancias = []

for _ in range(3):
    for i_train, _i_test in cv.split(X, y):
        # Only the training indices are needed: the forest is fitted to read
        # its importances, it is not scored on the held-out part here.
        X_train = scaler.fit_transform(X[i_train, :])
        y_train = y[i_train]

        alg.fit(X_train, y_train)
        importancias.append(alg.feature_importances_)

# Shape: (number of fits, number of features).
resultados = np.vstack(importancias)
        

In [None]:
# Drugs have 10 experimental conditions (c0 -> c9), nanoparticles have 6 (c0 -> c5).
# Average the importances over all fits and keep the features whose mean
# importance is at least the average importance across features.
feature_imp = np.mean(resultados,axis=0)
# Feature names are obtained by dropping "Class" by name, the same way X was
# built, instead of assuming "Class" is the first column (df.columns[1:]).
# This keeps the boolean mask aligned with the columns of X.
nombres_features = np.array(df.columns.drop("Class"))
seleccion = nombres_features[feature_imp >= np.mean(feature_imp)]
print(seleccion)

En `Notas Humberto.txt` se indica que hay diez condiciones experimentales para los fármacos (c0–c9) y seis para las nanopartículas (c0–c5). Sin embargo, no todas aparecen en el dataset `ds.raw.csv`: para los fármacos solo hay hasta la condición 6 (faltan las condiciones 7, 8 y 9). Lo mismo ocurre con las nanopartículas: falta la última condición experimental (la 5).


In [None]:
# Drugs have 10 experimental conditions (c0 -> c9), nanoparticles have 6 (c0 -> c5).
# 'prob' is always kept in the selection, whatever its importance.
if 'prob' not in seleccion:
    print('prob no presente en la selección, se va a añadir')
    seleccion = np.append(seleccion,'prob')


def _condiciones_ausentes(prefijo, n_condiciones):
    """Return the 'c<k>' labels (0 <= k < n_condiciones) missing from the selection.

    A label is considered present when at least one name in ``seleccion``
    starts with ``prefijo`` and contains the label as a substring.
    """
    ausentes = []
    for k in range(n_condiciones):
        etiqueta = 'c' + str(k)
        presente = any(
            nombre.startswith(prefijo) and etiqueta in nombre
            for nombre in seleccion
        )
        if not presente:
            ausentes.append(etiqueta)
    return ausentes


# Original note: experimental condition 5 is missing, so only c0..c4 are checked.
falta_np = _condiciones_ausentes('np', 5)

# Original note: experimental conditions 7, 8 and 9 are missing, so only c0..c6 are checked.
falta_d = _condiciones_ausentes('d', 7)

print('En nanoparticulas faltan las condiciones experimentales: {}'.format(falta_np or 'Ninguna'))
print('En farmacos faltan las condiciones experimentales: {}'.format(falta_d or 'Ninguna'))


In [None]:
# For every nanoparticle condition not represented in the selection, add the
# matching nanoparticle feature with the highest mean importance.
# NOTE(review): falta_d is computed in the previous cell but is not used
# here; confirm whether missing drug conditions should be filled the same way.

# Drop "Class" by name so positions line up with feature_imp, instead of
# assuming "Class" is the first column.
nombres_features = df.columns.drop("Class")

for feat in falta_np:
    candidatos = [(indice, nombre)
                  for indice, nombre in enumerate(nombres_features)
                  if feat in nombre and 'np' in nombre]
    if not candidatos:
        # Without this guard, zip(*[]) fails to unpack with a ValueError.
        print('No hay features de nanoparticulas para la condicion {}'.format(feat))
        continue

    indices, nombres = zip(*candidatos)
    mejor = np.argmax(feature_imp[np.array(indices)])
    # Skip names already selected so that re-running this cell does not
    # append the same column twice.
    if nombres[mejor] not in seleccion:
        seleccion = np.append(seleccion, nombres[mejor])


In [None]:
# Rebuild the feature matrix with only the selected columns.
X = df[seleccion].values

# Report the final list of selected feature names.
lista_features = ', '.join(seleccion)
print('Los features seleccionados son: \n{}'.format(lista_features))

In [None]:
# ROC AUC of the scaler + classifier pipeline on the selected features,
# evaluated with the `cv` splitter.
scores = cross_val_score(
    estimator=pipe,
    X=X,
    y=y,
    scoring='roc_auc',
    cv=cv,
)

In [None]:
# Mean score first, then the individual scores rounded to 4 decimals.
media = np.mean(scores)
print('Valor medio del test {:4.3f}: '.format(media))

scores_txt = [str(valor) for valor in np.round(scores, 4)]
print(', '.join(scores_txt))