In [4]:
import numpy as np
import pandas as pd

from sklearn.feature_selection import VarianceThreshold

In [6]:
from sklearn.datasets import load_boston

In [7]:
boston_data = load_boston()
boston_data.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [8]:
boston_data.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [9]:
bd = pd.DataFrame(boston_data.data, columns=boston_data.feature_names)
bd.rename(columns = {
                     'CRIM':'crime rate',
                     'ZN':'residential land zone',
                     'INDUS':'business acres',
                     'CHAS':'Charles River',
                     'NOX':'nitric oxides',
                     'RM':'number of rooms',
                     'AGE':'age',
                     'DIS':'distances to employment centres',
                     'RAD':'accessibility to highways',
                     'TAX':'property-tax',
                     'PTRATIO':'pupil-teacher ratio',
                     'B':'lower income levels',
                     'LSTAT':'Median Home Value',
                    }, inplace = True)

bd.sample(10)
bd.to_csv('Data/boston.csv', index=False)

In [10]:
boston_df = pd.read_csv('Data/boston.csv')
boston_df.sample(10)

Unnamed: 0,crime rate,residential land zone,business acres,Charles River,nitric oxides,number of rooms,age,distances to employment centres,accessibility to highways,property-tax,pupil-teacher ratio,lower income levels,Median Home Value
103,0.21161,0.0,8.56,0.0,0.52,6.137,87.4,2.7147,5.0,384.0,20.9,394.47,13.44
99,0.0686,0.0,2.89,0.0,0.445,7.416,62.5,3.4952,2.0,276.0,18.0,396.9,6.19
231,0.46296,0.0,6.2,0.0,0.504,7.412,76.9,3.6715,8.0,307.0,17.4,376.14,5.25
192,0.08664,45.0,3.44,0.0,0.437,7.178,26.3,6.4798,5.0,398.0,15.2,390.49,2.87
394,13.3598,0.0,18.1,0.0,0.693,5.887,94.7,1.7821,24.0,666.0,20.2,396.9,16.35
299,0.05561,70.0,2.24,0.0,0.4,7.041,10.0,7.8278,5.0,358.0,14.8,371.58,4.74
64,0.01951,17.5,1.38,0.0,0.4161,7.104,59.5,9.2229,3.0,216.0,18.6,393.24,8.05
20,1.25179,0.0,8.14,0.0,0.538,5.57,98.1,3.7979,4.0,307.0,21.0,376.57,21.02
41,0.12744,0.0,6.91,0.0,0.448,6.77,2.9,5.7209,3.0,233.0,17.9,385.41,4.84
17,0.7842,0.0,8.14,0.0,0.538,5.99,81.7,4.2579,4.0,307.0,21.0,386.75,14.67


In [20]:
boston_df['number of rooms'] = boston_df['number of rooms'].astype(int)

In [21]:
boston_df.dtypes

crime rate                         float64
residential land zone              float64
business acres                     float64
Charles River                      float64
nitric oxides                      float64
number of rooms                      int32
age                                float64
distances to employment centres    float64
accessibility to highways          float64
property-tax                       float64
pupil-teacher ratio                float64
lower income levels                float64
Median Home Value                  float64
dtype: object

In [37]:
fsVt = VarianceThreshold(threshold=0.9)

In [38]:
# Select the numerical columns only
boston_df_num = boston_df[boston_df.select_dtypes([np.number]).columns]
boston_df_num

Unnamed: 0,crime rate,residential land zone,business acres,Charles River,nitric oxides,number of rooms,age,distances to employment centres,accessibility to highways,property-tax,pupil-teacher ratio,lower income levels,Median Home Value
0,0.00632,18.0,2.31,0.0,0.538,6,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [39]:
# fit the object to the data
fsVt.fit(boston_df_num)

VarianceThreshold(threshold=0.9)

In [40]:
# get the constant columns
const_num_columns = [column for column in boston_df_num.columns
                     if column in boston_df_num.columns[fsVt.get_support()]]

In [41]:
const_num_columns

['crime rate',
 'residential land zone',
 'business acres',
 'age',
 'distances to employment centres',
 'accessibility to highways',
 'property-tax',
 'pupil-teacher ratio',
 'lower income levels',
 'Median Home Value']

In [42]:
# detect constant categorical variables
const_cat_columns = [column for column in boston_df.columns
                     if (boston_df[column].dtype == 'O' and len(boston_df[column].unique()) == 1)]
const_cat_columns

[]