-------------------------------
#### Variance threshold
---------------------------

In [30]:
# Importing libraries
import numpy as np
np.set_printoptions(suppress=True, precision=4)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# for feature selection
from sklearn.feature_selection import VarianceThreshold

from sklearn import datasets

In [31]:
# Load the iris dataset from sklearn.datasets; the returned object exposes
# .data, .target and .feature_names, which the following cells use.
iris = datasets.load_iris()

In [32]:
# Show the feature names so the column indices reported later can be interpreted.
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [33]:
# Split the dataset into the feature matrix (X) and the target vector (y)
X, y = iris.data, iris.target

In [34]:
# Build a VarianceThreshold selector with a variance cut-off of 0.5 ...
thresholder = VarianceThreshold(threshold=0.5)

# ... and fit it so the per-feature variances are computed from X
thresholder.fit(X)

In [35]:
# Inspect the selector's parameters (here only the threshold).
thresholder.get_params()

{'threshold': 0.5}

In [48]:
# Integer indices of the columns that pass the variance threshold.
thresholder.get_support(indices=True)

array([0, 2, 3], dtype=int64)

In [37]:
# Variances of the individual features as computed by the fitted selector;
# compare each value against the 0.5 threshold to see why a column is kept or dropped.
thresholder.variances_

array([0.6811, 0.1887, 3.0955, 0.5771])

In [38]:
# Conduct variance thresholding: transform() returns X reduced to the columns
# selected by the fitted thresholder. (This step was previously commented out,
# so the selection was inspected but never actually applied.)
X_high_variance = thresholder.transform(X)

#### One more dataset: California housing

In [41]:
from sklearn.datasets import fetch_california_housing

In [42]:
# Fetch the California Housing dataset and wrap it in pandas containers:
# X is a DataFrame with named feature columns, y is a Series holding the target.
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

In [43]:
# Apply variance thresholding
# NOTE: the threshold is an absolute variance, so it is sensitive to each
# feature's scale; adjust it (or rescale the features) to suit the data.
threshold = 0.6
selector = VarianceThreshold(threshold=threshold)

# fit() returns the estimator itself, not the reduced data, so use
# fit_transform() to both fit `selector` (needed by the cells below) and
# obtain the feature matrix restricted to the selected columns.
X_high_variance = selector.fit_transform(X)

In [49]:
# Integer indices of the columns that pass the variance threshold.
selector.get_support(indices=True)

array([0, 1, 2, 4, 5, 6, 7], dtype=int64)

In [44]:
# Per-feature variances from the fitted selector, in the column order of X.
selector.variances_

array([      3.6091,     158.3886,       6.1212,       0.2246,
       1282408.322 ,     107.8648,       4.5621,       4.0139])

In [50]:
# Translate the selector's boolean support mask into the matching column names
support_mask = selector.get_support()
selected_features = X.columns[support_mask]

In [51]:
# Display the results: the number of columns in the original frame,
# followed by the names of the columns retained by the selector.
print(f"Original number of features: {X.shape[1]}")

print(f"Selected features: {selected_features.tolist()}")

Original number of features: 8
Selected features: ['MedInc', 'HouseAge', 'AveRooms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
