# Variance Threshold (filter method)

In [1]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

# Toy dataset: ten rows, five integer features.
# feature3 and feature5 hold a single repeated value, so their variance is 0.
data = dict(
    feature1=[11, 21, 31, 41, 51, 61, 71, 81, 91, 99],
    feature2=list(range(1, 11)),
    feature3=[2] * 10,
    feature4=[15, 25, 35, 45, 55, 65, 75, 85, 95, 10],
    feature5=[0] * 10,
)

df = pd.DataFrame(data)

**The 3rd and 5th columns (`feature3` and `feature5`) contain only constant values, i.e. their variance is 0.**

In [2]:
# Preview the first five rows of the toy dataset.
df.head(n=5)

Unnamed: 0,feature1,feature2,feature3,feature4,feature5
0,11,1,2,15,0
1,21,2,2,25,0
2,31,3,2,35,0
3,41,4,2,45,0
4,51,5,2,55,0


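**`VarianceThreshold` takes a variance `threshold` as its parameter and removes every column whose variance does not exceed that threshold. With a threshold of 0, only constant columns are removed.**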

In [5]:
# threshold=0 keeps every column with non-zero variance.
# fit() returns the estimator itself, so creation and fitting can be chained.
var_threshold = VarianceThreshold(threshold=0).fit(df)

var_threshold

VarianceThreshold(threshold=0)

**`var_threshold.get_support()` returns a boolean array containing `True` for the columns that are kept (the "optimal" columns) and `False` for the columns whose variance does not exceed the threshold.**

In [6]:
# The selector was already created and fitted on `df` in the previous cell,
# so it is reused here instead of being rebuilt and refitted.
# get_support() returns a boolean mask over the columns of `df`:
# True for columns that are kept, False for columns that are removed.
var_threshold.get_support()

array([ True,  True, False,  True, False])

In [7]:
# Boolean mask from the fitted selector: True for columns that were kept.
# Computed once and reused for both the kept and the removed column sets.
support_mask = var_threshold.get_support()

# Kept columns as a pandas Index.
optimal_columns = df.columns[support_mask]

# Removed (zero-variance) columns: the complement of the mask, as a plain
# list of column names in their original order.
constant_columns = df.columns[~support_mask].tolist()

In [9]:
# Split the original frame into two new frames; `df` itself is left untouched.
# Dropping the constant columns leaves only the columns the selector kept.
df_optimal = df.drop(columns=constant_columns)
# Dropping the kept columns leaves only the constant ones.
df_constants = df.drop(columns=optimal_columns)

df_constants.head(n=5)

Unnamed: 0,feature3,feature5
0,2,0
1,2,0
2,2,0
3,2,0
4,2,0


In [10]:
# Preview the first five rows of the frame with the constant columns removed.
df_optimal.head(n=5)

Unnamed: 0,feature1,feature2,feature4
0,11,1,15
1,21,2,25
2,31,3,35
3,41,4,45
4,51,5,55


**Note: This feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be used for unsupervised learning.**