# Basics:
* Constant features (only one value for all obs in the dataset)
* Quasi-constant features (the vast majority of observations show the same value)
* Duplicated features (two features in the dataset show the same value for all obs. "Redundant features")
    * may arise after one hot encoding of categorical variables

### Getting rid of Constant Features


#### Three methods
* VarianceThreshold -sklearn
* std() - pandas
* nunique() - pandas

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import VarianceThreshold

In [2]:
# load our first dataset

data = pd.read_csv('precleaned-datasets/dataset_1.csv')
data.shape

(50000, 301)

In [4]:
# separate dataset into train and test

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target'], axis=1),  # drop the target
    data['target'],  # just the target
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((35000, 300), (15000, 300))

In [5]:
sel = VarianceThreshold(threshold=0)

sel.fit(X_train)  # fit finds the features with zero variance

VarianceThreshold(threshold=0)

VarianceThreshold removes all features whose variance doesn't meet a certain threshold. By default, it removes all zero-variance features, i.e., features that have the same value in all samples.

In [6]:
# get_support() returns a boolean vector that indicates which features are retained
# True: feature is retained (it is NOT constant)
# False: feature is dropped (it is constant)
# summing over get_support() therefore gives the number of features that are not constant

# (go ahead and print the result of sel.get_support() to understand its output)

sel.get_support().sum()

266

In [7]:
# now let's count the constant features: these are the columns
# that the selector does NOT retain, hence the negated mask

is_constant = ~sel.get_support()
constant = X_train.columns[is_constant]

len(constant)

34

In [8]:
constant

Index(['var_23', 'var_33', 'var_44', 'var_61', 'var_80', 'var_81', 'var_87',
       'var_89', 'var_92', 'var_97', 'var_99', 'var_112', 'var_113', 'var_120',
       'var_122', 'var_127', 'var_135', 'var_158', 'var_167', 'var_170',
       'var_171', 'var_178', 'var_180', 'var_182', 'var_195', 'var_196',
       'var_201', 'var_212', 'var_215', 'var_225', 'var_227', 'var_248',
       'var_294', 'var_297'],
      dtype='object')

In [10]:
#Print unique values for each of these constant features
for col in constant:
    print(col, X_train[col].unique())

var_23 [0]
var_33 [0]
var_44 [0]
var_61 [0]
var_80 [0]
var_81 [0]
var_87 [0]
var_89 [0.]
var_92 [0]
var_97 [0]
var_99 [0]
var_112 [0]
var_113 [0]
var_120 [0]
var_122 [0]
var_127 [0]
var_135 [0]
var_158 [0]
var_167 [0]
var_170 [0]
var_171 [0]
var_178 [0.]
var_180 [0.]
var_182 [0]
var_195 [0]
var_196 [0]
var_201 [0]
var_212 [0]
var_215 [0]
var_225 [0]
var_227 [0.]
var_248 [0]
var_294 [0]
var_297 [0]


In [11]:
# capture non-constant feature names
feat_names = X_train.columns[sel.get_support()]

In [12]:
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

X_train.shape, X_test.shape

((35000, 266), (15000, 266))

In [13]:
# reconstitute the dataframe

X_train = pd.DataFrame(X_train, columns=feat_names)
X_train.head()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,...,var_289,var_290,var_291,var_292,var_293,var_295,var_296,var_298,var_299,var_300
0,0.0,0.0,0.0,2.79,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,2.97,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,2.79,85435.2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,5.7,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Method 1: std() - only works with numerical variables

In [14]:
# separate train and test (again, as we transformed the previous ones)

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target'], axis=1),
    data['target'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((35000, 300), (15000, 300))

In [15]:
# every feature in this dataset is numeric, so a standard deviation
# of exactly zero is enough to flag a feature as constant

constant_features = [
    name for name, column in X_train.items() if column.std() == 0
]

len(constant_features)

34

In [16]:
# drop these columns from the train and test sets:
X_train.drop(labels=constant_features, axis=1, inplace=True)
X_test.drop(labels=constant_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((35000, 266), (15000, 266))

### Method 2: nunique() - also works with categorical variables

In [17]:
# separate train and test (again, as we transformed the previous ones)

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target'], axis=1),
    data['target'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((35000, 300), (15000, 300))

In [18]:
# cast all the numeric features as object, to simulate that they are categorical
X_train = X_train.astype('O')
X_train.dtypes

var_1      object
var_2      object
var_3      object
var_4      object
var_5      object
            ...  
var_296    object
var_297    object
var_298    object
var_299    object
var_300    object
Length: 300, dtype: object

In [19]:
# nunique() from pandas returns the number of different values in a variable,
# so a variable with exactly 1 distinct label/value is constant.

constant_features = [
    name for name, column in X_train.items() if column.nunique() == 1
]

len(constant_features) #result is same as before - we found 34 features that are constant

34

In [20]:
X_train.drop(labels=constant_features, axis=1, inplace=True)
X_test.drop(labels=constant_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((35000, 266), (15000, 266))

### Getting rid of Quasi - Constant Features

#### Two methods
* VarianceThreshold -sklearn (change the threshold)
* value_counts() - pandas
    * numerical and categorical variables

In [22]:
#sel = VarianceThreshold(threshold=0.01) #change the threshold here, instead of 0
#everything else follows the same as before.

#### Value Counts method

In [21]:
# separate train and test
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target'], axis=1),
    data['target'],
    test_size=0.3,
    random_state=0)

# remove constant features
# using the code from the previous lecture

constant_features = [
    feat for feat in X_train.columns if X_train[feat].std() == 0
]

X_train.drop(labels=constant_features, axis=1, inplace=True)
X_test.drop(labels=constant_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((35000, 266), (15000, 266))

In [23]:
# share of observations that a single value must exceed for the
# feature to be considered quasi-constant (99.8%)
quasi_constant_threshold = 0.998

# create an empty list
quasi_constant_feat = []

# iterate over every feature
for feature in X_train.columns:

    # find the share of observations taken by the predominant value,
    # that is, the value that is shared by most observations.
    # max() picks the largest share directly, so no sorting is needed,
    # and it does not fail on a column without any non-missing value
    predominant = X_train[feature].value_counts(normalize=True).max()

    # evaluate the predominant value: do more than 99.8% of the
    # observations show that 1 value?
    if predominant > quasi_constant_threshold:

        # if yes, add the variable to the list
        quasi_constant_feat.append(feature)

len(quasi_constant_feat)

108

In [24]:
# Found that 108 features show predominantly 1 value for the majority of the observations

#Variable 3 for example had 0's for most of them.
X_train['var_3'].value_counts(normalize=True)


0.0000         0.999629
207901.3365    0.000029
15028.0560     0.000029
25905.4866     0.000029
35685.9459     0.000029
3583.3941      0.000029
52105.7901     0.000029
86718.0000     0.000029
861.0900       0.000029
2641.0164      0.000029
5209.9500      0.000029
10281.6000     0.000029
12542.3100     0.000029
27.3000        0.000029
Name: var_3, dtype: float64

In [25]:
# finally, let's drop the quasi-constant features:

X_train.drop(labels=quasi_constant_feat, axis=1, inplace=True)
X_test.drop(labels=quasi_constant_feat, axis=1, inplace=True)

X_train.shape, X_test.shape #Remaining number of features: 158

((35000, 158), (15000, 158))

### Getting rid of Duplicated Features


In [26]:
# look for duplicated features in the training set

# maps every feature that is kept to the list of later features
# that are identical to it (an empty list means "no duplicates")
duplicated_feat_pairs = {}

# flat record of every feature already recognised as a duplicate
# of an earlier one, in the order in which they were found
_duplicated_feat = []


# walk through the features together with their position:
for idx, feat_1 in enumerate(X_train.columns):

    # progress indicator, printed every 10 features:
    if idx % 10 == 0:
        print(idx)

    # a feature that was already matched to an earlier one needs no
    # entry of its own, so we move straight on to the next feature:
    if feat_1 in _duplicated_feat:
        continue

    # register the feature in the dictionary with no duplicates so far:
    matches = []
    duplicated_feat_pairs[feat_1] = matches

    # the column every later feature is compared against:
    reference = X_train[feat_1]

    # only the features located after feat_1 need to be compared:
    for feat_2 in X_train.columns[idx + 1:]:

        # identical columns: store the match in the dictionary entry
        # and in the monitor list of duplicated variables
        if reference.equals(X_train[feat_2]):
            matches.append(feat_2)
            _duplicated_feat.append(feat_2)
                
                # done!

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150


In [27]:
# let's explore our list of duplicated features
len(_duplicated_feat)

6

In [28]:
# these are the ones:

_duplicated_feat

['var_148', 'var_199', 'var_296', 'var_250', 'var_232', 'var_269']

In [29]:
# let's explore the dictionary we created:

duplicated_feat_pairs
#We see that for every feature, if it had duplicates, we have entries in the list, 
#otherwise, we have empty lists. Let's explore those features with duplicates now:



{'var_4': [],
 'var_5': [],
 'var_8': [],
 'var_13': [],
 'var_15': [],
 'var_17': [],
 'var_18': [],
 'var_19': [],
 'var_21': [],
 'var_22': [],
 'var_25': [],
 'var_26': [],
 'var_27': [],
 'var_29': [],
 'var_30': [],
 'var_31': [],
 'var_35': [],
 'var_37': ['var_148'],
 'var_38': [],
 'var_41': [],
 'var_46': [],
 'var_47': [],
 'var_49': [],
 'var_50': [],
 'var_51': [],
 'var_52': [],
 'var_54': [],
 'var_55': [],
 'var_57': [],
 'var_58': [],
 'var_62': [],
 'var_63': [],
 'var_64': [],
 'var_68': [],
 'var_70': [],
 'var_74': [],
 'var_75': [],
 'var_76': [],
 'var_79': [],
 'var_82': [],
 'var_83': [],
 'var_84': ['var_199'],
 'var_85': [],
 'var_86': [],
 'var_88': [],
 'var_91': [],
 'var_93': [],
 'var_94': [],
 'var_96': [],
 'var_100': [],
 'var_101': [],
 'var_103': [],
 'var_105': [],
 'var_107': [],
 'var_108': [],
 'var_109': [],
 'var_110': [],
 'var_114': [],
 'var_117': [],
 'var_118': [],
 'var_119': [],
 'var_121': [],
 'var_123': [],
 'var_128': [],
 'var_131'

In [30]:
# let's explore the number of keys in our dictionary

# we see it is 152, because 6 of the 158 were duplicates,
# so they were not included as keys

print(len(duplicated_feat_pairs.keys()))

152


In [31]:
# show every feature that has duplicates, next to those duplicates

# walk through the dictionary entries (feature -> list of duplicates):
for feat, duplicates in duplicated_feat_pairs.items():

    # an empty list means the feature has no duplicates, so skip it:
    if duplicates:

        # the feature and its duplicates, followed by a blank line:
        print(feat, duplicates)
        print()

var_37 ['var_148']

var_84 ['var_199']

var_143 ['var_296']

var_177 ['var_250']

var_226 ['var_232']

var_229 ['var_269']



In [32]:
# let's check that indeed those features are duplicated
# I select a pair from above

X_train[['var_37', 'var_148']].head(10)

Unnamed: 0,var_37,var_148
17967,0,0
32391,0,0
9341,0,0
7929,0,0
46544,0,0
4149,0,0
33426,0,0
3002,0,0
6974,0,0
16864,0,0


In [33]:
X_train['var_37'].unique()

array([ 0,  3,  6,  9, 12, 21, 33, 15], dtype=int64)

In [34]:
X_train['var_148'].unique()

array([ 0,  3,  6,  9, 12, 21, 33, 15], dtype=int64)

In [35]:
# let's explore parts of the dataframe where the values in
# these features are different from 0:

X_train[X_train['var_37'] != 0][['var_37', 'var_148']].head(10)

Unnamed: 0,var_37,var_148
37493,3,3
20251,6,6
4264,6,6
48480,3,3
31607,3,3
41172,3,3
13502,3,3
7759,3,3
46118,3,3
2638,3,3


In [36]:
# finally, to remove the duplicates, we retain only the keys of the dictionary

# do you understand why? if not, go back to the loop in cell 26: a feature
# becomes a key only if it was not already identified as a duplicate of an
# earlier feature, so the keys hold one feature from each group of
# duplicates plus all the features that have no duplicates

# pass an explicit list of column names instead of a dictionary view
retained_features = list(duplicated_feat_pairs)

X_train = X_train[retained_features]
X_test = X_test[retained_features]

X_train.shape, X_test.shape

((35000, 152), (15000, 152))

### Getting rid of Constant Features (using Feature_Engine)


In [17]:
from feature_engine.selection import DropDuplicateFeatures, DropConstantFeatures

In [39]:
# separate dataset into train and test

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target'], axis=1), # drop the target
    data['target'], # just the target
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((35000, 300), (15000, 300))

In [40]:
sel = DropConstantFeatures(tol=1, variables=None, missing_values='raise')

sel.fit(X_train)

In [41]:
# list of constant features

sel.features_to_drop_

['var_23',
 'var_33',
 'var_44',
 'var_61',
 'var_80',
 'var_81',
 'var_87',
 'var_89',
 'var_92',
 'var_97',
 'var_99',
 'var_112',
 'var_113',
 'var_120',
 'var_122',
 'var_127',
 'var_135',
 'var_158',
 'var_167',
 'var_170',
 'var_171',
 'var_178',
 'var_180',
 'var_182',
 'var_195',
 'var_196',
 'var_201',
 'var_212',
 'var_215',
 'var_225',
 'var_227',
 'var_248',
 'var_294',
 'var_297']

In [42]:
# number of constant features

len(sel.features_to_drop_)

34

In [43]:
# remove constant features from the data

X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

X_train.shape, X_test.shape

((35000, 266), (15000, 266))

### Getting rid of Quasi-Constant Features (using Feature_Engine)


In [44]:
sel = DropConstantFeatures(tol=0.998, variables=None, missing_values='raise')

sel.fit(X_train)

In [45]:
# number of quasi-constant features

len(sel.features_to_drop_)

108

In [46]:
# percentage of observations showing each of the different values
# of the variable

var = sel.features_to_drop_[0]

X_train[var].value_counts(normalize=True)

0    0.999629
3    0.000200
6    0.000143
9    0.000029
Name: var_1, dtype: float64

In [47]:
# let's explore another one

var = sel.features_to_drop_[2]

X_train[var].value_counts(normalize=True)

0.0000         0.999629
207901.3365    0.000029
15028.0560     0.000029
25905.4866     0.000029
35685.9459     0.000029
3583.3941      0.000029
52105.7901     0.000029
86718.0000     0.000029
861.0900       0.000029
2641.0164      0.000029
5209.9500      0.000029
10281.6000     0.000029
12542.3100     0.000029
27.3000        0.000029
Name: var_3, dtype: float64

In [48]:
#remove the quasi-constant features

X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

X_train.shape, X_test.shape

((35000, 158), (15000, 158))

### Getting rid of Duplicated Features (using Feature_Engine)


In [49]:
# set up the selector
sel = DropDuplicateFeatures(variables=None, missing_values='raise')

# find the duplicate features, this might take a while
sel.fit(X_train)

In [50]:
sel.duplicated_feature_sets_

[{'var_148', 'var_37'},
 {'var_199', 'var_84'},
 {'var_143', 'var_296'},
 {'var_177', 'var_250'},
 {'var_226', 'var_232'},
 {'var_229', 'var_269'}]

In [51]:
# these are the features that will be dropped
# 1 from each of the pairs above

sel.features_to_drop_

{'var_148', 'var_199', 'var_232', 'var_250', 'var_269', 'var_296'}

In [52]:
# let's explore our list of duplicated features

len(sel.features_to_drop_)

6

In [53]:
# remove the duplicated features

X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

X_train.shape, X_test.shape

((35000, 152), (15000, 152))

### Stacking Feature Selection in a Pipeline

In [56]:
from sklearn.pipeline import Pipeline

In [57]:
# separate dataset into train and test

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target'], axis=1), # drop the target
    data['target'], # just the target
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((35000, 300), (15000, 300))

In [58]:
# tol=0.998 removes constant AND quasi-constant features in a single step
constant_step = DropConstantFeatures(tol=0.998)

# removes the duplicated features
duplicated_step = DropDuplicateFeatures()

pipe = Pipeline(steps=[
    ('constant', constant_step),
    ('duplicated', duplicated_step),
])

pipe.fit(X_train)

In [59]:
# remove features

X_train = pipe.transform(X_train)
X_test = pipe.transform(X_test)

X_train.shape, X_test.shape

((35000, 152), (15000, 152))

In [60]:
# we can navigate the pipeline transformers

len(pipe.named_steps['constant'].features_to_drop_)

142

In [61]:
pipe.named_steps['duplicated'].features_to_drop_

{'var_148', 'var_199', 'var_232', 'var_250', 'var_269', 'var_296'}