## Filter Methods - Basics


In [6]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold


### load dataset

In [None]:
# load the dataset from the data folder
# only a random 35% of the rows is kept, to speed up the demonstration
# the sample is seeded (like the train/test split in this notebook) so that
# every run works on the same rows and ends up with the same features
# NOTE: read_pickle can execute arbitrary code, only load files you trust
data = pd.read_pickle('../../data/features/features.pkl').sample(
    frac=0.35, random_state=0)
data.shape

In [None]:
# preview the first rows of the sampled dataset
data.head()

In [None]:
# split into train and test sets: 30% of the rows are held out for testing
# and the split is seeded so that it is repeatable
# the 'target' column is the label, every other column is a feature
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['target']),
    data['target'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

### remove constant features


In [None]:
# a column whose standard deviation is exactly zero does not vary at all
# in the training set
constant_features = [
    name for name in X_train.columns if X_train[name].std() == 0
]

# the list is derived from the training set only and applied to both sets
X_train.drop(columns=constant_features, inplace=True)
X_test.drop(columns=constant_features, inplace=True)

X_train.shape, X_test.shape

### remove quasi-constant features


In [None]:
# selector for low-variance (quasi-constant) features, threshold 0.01
# for a 0/1 feature a variance of 0.01 means that roughly 99% of the
# observations share the same value
sel = VarianceThreshold(threshold=0.01)

# fit learns the variance of every feature from the training set
sel.fit(X_train)

# number of features that are kept, i.e. that are not quasi-constant
sel.get_support().sum()

In [None]:
# names of the columns that the fitted selector keeps; they are captured here
# because X_train.columns is indexed with the selector's boolean support mask
features_to_keep = X_train.columns[sel.get_support()]

In [None]:
# apply the fitted selector to both sets, which removes the
# quasi-constant features
X_train, X_test = sel.transform(X_train), sel.transform(X_test)

X_train.shape, X_test.shape

In [None]:
# sklearn transformations lead to numpy arrays, so the arrays are wrapped
# in dataframes again
# the column names must be the ones selected above, in the same order
# NOTE: a frame built from a bare array gets a fresh 0..n-1 index, so the
# rows no longer share index labels with y_train / y_test

X_train = pd.DataFrame(X_train, columns=features_to_keep)
X_test = pd.DataFrame(X_test, columns=features_to_keep)

### remove duplicate features

In [None]:
# check for duplicated features in the training set
#
# every column is compared with all the columns to its right, and the
# right-hand column of an identical pair is recorded as the duplicate
#
# a column that is already recorded as a duplicate is skipped: its own copies
# were recorded together with it (they all equal the same earlier column), so
# comparing it again would only append the same names a second time and
# inflate the count shown at the end of the cell
duplicated_feat = []
for i, col_1 in enumerate(tqdm(X_train.columns)):

    if col_1 in duplicated_feat:
        continue

    reference = X_train[col_1]  # looked up once per outer iteration

    for col_2 in X_train.columns[i + 1:]:
        if reference.equals(X_train[col_2]):
            duplicated_feat.append(col_2)

len(duplicated_feat)

In [None]:
# remove the duplicated columns found in the training set from both sets
for frame in (X_train, X_test):
    frame.drop(columns=duplicated_feat, inplace=True)

X_train.shape, X_test.shape

In [None]:
# names of the columns left in the training set after all the filters above
features_to_keep=X_train.columns.tolist()

### save features

In [None]:
# write the list of selected feature names to disk as a .npy file
np.save('../features/featuresFromBasicMethods.npy',features_to_keep)