# FEATURE SELECTION

***

# FILTER METHODS

In [1]:
#libs

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold


## REMOVING CONSTANT FEATURES

In [2]:
# Load the dataset from the CSV file in the working directory and preview its first rows.
df1 = pd.read_csv('dataset_1.csv')
df1.head()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,...,var_292,var_293,var_294,var_295,var_296,var_297,var_298,var_299,var_300,target
0,0,0,0.0,0.0,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0
1,0,0,0.0,3.0,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0
2,0,0,0.0,5.88,0.0,0,0,0,0,0,...,0.0,0,0,3,0,0,0,0.0,67772.7216,0
3,0,0,0.0,14.1,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0
4,0,0,0.0,5.76,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0


In [3]:
# (rows, columns) of the full dataset, label column included
df1.shape

(50000, 301)

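In every feature selection procedure, it is good practice to select the features by examining only the training set. This keeps information from the test set from leaking into the selection, which would lead to overfitting and overly optimistic performance estimates.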

In [4]:

# Hold out 30% of the rows for testing; the features are every column except the label.
X_train, X_test, y_train, y_test = train_test_split(
    df1.drop(columns=['target']),
    df1['target'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((35000, 300), (15000, 300))

## Using VarianceThreshold from Scikit-learn

The `VarianceThreshold` class from scikit-learn provides a simple baseline approach to feature selection. It removes all features whose variance doesn’t meet a given threshold. By default, it removes all zero-variance features, i.e., features that have the same value in all samples.

In [5]:
# Fitting on the training split only learns which features have zero variance there.
sel = VarianceThreshold(threshold=0).fit(X_train)
sel

VarianceThreshold(threshold=0)

`get_support()` returns a boolean mask that is `True` for every feature the selector keeps, so summing over it gives the number of features that are not constant.

In [6]:
# number of retained (non-constant) features: count the True entries of the support mask
sel.get_support().sum()

266

In [7]:
# number of constant features: the entries of the support mask that are False
(~sel.get_support()).sum()

34

In [8]:
# names of the constant variables: select the columns the selector did NOT keep
constant = X_train.columns[np.logical_not(sel.get_support())]
constant

Index(['var_23', 'var_33', 'var_44', 'var_61', 'var_80', 'var_81', 'var_87',
       'var_89', 'var_92', 'var_97', 'var_99', 'var_112', 'var_113', 'var_120',
       'var_122', 'var_127', 'var_135', 'var_158', 'var_167', 'var_170',
       'var_171', 'var_178', 'var_180', 'var_182', 'var_195', 'var_196',
       'var_201', 'var_212', 'var_215', 'var_225', 'var_227', 'var_248',
       'var_294', 'var_297'],
      dtype='object')

In [9]:
# checking: every flagged column should hold exactly one distinct value
for name, column in X_train[constant].items():
    print(name, column.unique())

var_23 [0]
var_33 [0]
var_44 [0]
var_61 [0]
var_80 [0]
var_81 [0]
var_87 [0]
var_89 [0.]
var_92 [0]
var_97 [0]
var_99 [0]
var_112 [0]
var_113 [0]
var_120 [0]
var_122 [0]
var_127 [0]
var_135 [0]
var_158 [0]
var_167 [0]
var_170 [0]
var_171 [0]
var_178 [0.]
var_180 [0.]
var_182 [0]
var_195 [0]
var_196 [0]
var_201 [0]
var_212 [0]
var_215 [0]
var_225 [0]
var_227 [0.]
var_248 [0]
var_294 [0]
var_297 [0]


In [10]:
# non-constant feature names, looked up by the integer positions of the kept columns
non_const = X_train.columns[sel.get_support(indices=True)]
non_const

Index(['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_6', 'var_7', 'var_8',
       'var_9', 'var_10',
       ...
       'var_289', 'var_290', 'var_291', 'var_292', 'var_293', 'var_295',
       'var_296', 'var_298', 'var_299', 'var_300'],
      dtype='object', length=266)

In [11]:
# Apply the fitted selector to both splits so they keep the same set of columns.
X_train, X_test = (sel.transform(split) for split in (X_train, X_test))

X_train.shape, X_test.shape

((35000, 266), (15000, 266))

In [12]:
# Back to DataFrames: by this point X_train / X_test are bare arrays without labels.
# Re-attach the surviving column names AND the original row labels taken from
# y_train / y_test; without `index=` the rows would be renumbered 0..n-1 and would
# no longer line up with the targets.
X_train = pd.DataFrame(X_train, columns=non_const, index=y_train.index)
X_test = pd.DataFrame(X_test, columns=non_const, index=y_test.index)

# Seeded so the preview shows the same rows on every run.
X_train.sample(3, random_state=0)


Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,...,var_289,var_290,var_291,var_292,var_293,var_295,var_296,var_298,var_299,var_300
19269,0.0,0.0,0.0,2.97,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33584,0.0,0.0,0.0,10.92,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0
14858,0.0,0.0,0.0,2.85,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## STD from PANDAS

In [13]:
# Rebuild the train and test splits from the untouched dataframe (same 70/30 split, same seed).
X_train, X_test, y_train, y_test = train_test_split(
    df1.drop(columns=['target']),
    df1['target'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((35000, 300), (15000, 300))

In [14]:
# A column whose standard deviation is exactly zero holds one repeated value.
# A single vectorised DataFrame.std() call replaces a Python-level loop that
# computed the statistic one column at a time.
# NOTE(review): assumes every column is numeric and relies on an exact == 0
# comparison - confirm both hold before reusing this on other data.
column_std = X_train.std()
const_features = column_std[column_std == 0].index.tolist()

# number of constant features found
len(const_features)


34

In [15]:
# drops these variables from our datasets
X_train = X_train.drop(labels = const_features, axis = 1)
X_test = X_test.drop(labels = const_features, axis = 1)

print(X_train.shape, X_test.shape)

(35000, 266) (15000, 266)


## MANUAL CODE - CATEGORICAL VARIABLES

In [16]:
# Fresh 70/30 split from the untouched dataframe, using the same seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    df1.drop(columns=['target']),
    df1['target'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((35000, 300), (15000, 300))

In [17]:
# Cast every training feature to object dtype so the columns are treated as
# categorical in this section; the dtypes listing below confirms the cast.
X_train = X_train.astype('O')
X_train.dtypes

var_1      object
var_2      object
var_3      object
var_4      object
var_5      object
            ...  
var_296    object
var_297    object
var_298    object
var_299    object
var_300    object
Length: 300, dtype: object

In [19]:
# using nunique from pandas: a column with exactly one distinct value is constant
distinct_counts = X_train.nunique()
const_features = distinct_counts[distinct_counts == 1].index.tolist()

len(const_features)

34

Note that, by default, `nunique()` ignores missing values. If your variables contain missing values, pass `dropna=False` to `nunique()` so that they are counted as a distinct value.

In [20]:
# Drop the constant columns from both splits. Reassigning the result instead of
# using inplace=True matches how the std-based section drops its columns and
# avoids mutating the split frames in place.
X_train = X_train.drop(columns=const_features)
X_test = X_test.drop(columns=const_features)

X_train.shape, X_test.shape

((35000, 266), (15000, 266))