# Seleção de _Features_
Este _notebook_ detalha a otimização dos _datasets_, utilizando o `SelectKBest` para selecionar as _features_ mais relevantes

In [1]:
import pandas as pd
import numpy as np

features = pd.read_csv("../Dataset/UCI HAR Dataset/UCI HAR Dataset/features.txt", header=None, sep="\s+")
X_train = pd.read_csv("../Dataset/UCI HAR Dataset/UCI HAR Dataset/train/X_train.txt", header=None, sep="\s+")
y_train = pd.read_csv("../Dataset/UCI HAR Dataset/UCI HAR Dataset/train/y_train.txt", header=None, sep="\s+")

y_train = np.reshape(y_train, (y_train.shape[0]))

X_train.shape, y_train.shape

((7352, 561), (7352,))

## Seleção das _features_ a partir dos _datasets_ de treino
### Usando a função de seleção `f_regression`

In [8]:
from sklearn.feature_selection import SelectKBest, f_regression

k = 60

selector = SelectKBest(score_func=f_regression, k=k)
X_selecionado = selector.fit_transform(X_train, y_train)

indices_selecionados = np.argsort(selector.scores_)[::-1][:k]
features_selecionadas = X_train.columns[indices_selecionados]
resultado = features.loc[features[0].isin(features_selecionadas)]

resultado

Unnamed: 0,0,1
3,4,tBodyAcc-std()-X
6,7,tBodyAcc-mad()-X
7,8,tBodyAcc-mad()-Y
14,15,tBodyAcc-min()-Z
19,20,tBodyAcc-iqr()-X
20,21,tBodyAcc-iqr()-Y
83,84,tBodyAccJerk-std()-X
86,87,tBodyAccJerk-mad()-X
91,92,tBodyAccJerk-max()-Z
92,93,tBodyAccJerk-min()-X


### Usando a função de seleção `r_regression`

In [9]:
from sklearn.feature_selection import r_regression

k = 60

selector = SelectKBest(score_func=r_regression, k=k)
X_selecionado = selector.fit_transform(X_train, y_train)

indices_selecionados = np.argsort(selector.scores_)[::-1][:k]
features_selecionadas = X_train.columns[indices_selecionados]
resultado = features.loc[features[0].isin(features_selecionadas)]

resultado

Unnamed: 0,0,1
11,12,tBodyAcc-max()-Z
12,13,tBodyAcc-min()-X
13,14,tBodyAcc-min()-Y
24,25,tBodyAcc-entropy()-Z
26,27,"tBodyAcc-arCoeff()-X,2"
28,29,"tBodyAcc-arCoeff()-X,4"
32,33,"tBodyAcc-arCoeff()-Y,4"
34,35,"tBodyAcc-arCoeff()-Z,2"
36,37,"tBodyAcc-arCoeff()-Z,4"
40,41,tGravityAcc-mean()-X
