In [1]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
# Seed NumPy's global RNG once, up front, so every random draw below is
# reproducible on a fresh "Restart & Run All".
SEED = 333
np.random.seed( SEED )

##### a1 is a binary variable

In [2]:
# a1: 10,000 independent Bernoulli(0.5) draws (0 or 1), stored as the first
# column of the working frame.
DATA = pd.Series( np.random.binomial( n=1, p=0.5, size=10000 ), name='a1' ).to_frame()

##### "label" is a continuous target variable

In [3]:
# Target: one standard-normal draw (mean 0, std 1) per existing row.
DATA[ 'label' ] = np.random.normal( loc=0, scale=1, size=len( DATA ) )

##### b1 is closely related to a1, in a way similar to the logical XOR relationship. When a1 and b1 are equal (0 and 0, or 1 and 1), the target (label) takes low values. When they differ (0 and 1, or 1 and 0), label takes high values. This kind of relationship is usually hard to capture with simple modeling techniques unless it is known explicitly.

In [4]:
# b1 copies a1 on rows where label is negative and is the flipped bit
# (1 - a1) everywhere else, so "a1 == b1" holds exactly when label < 0.
DATA[ 'b1' ] = np.where( DATA[ 'label' ].lt( 0 ), DATA[ 'a1' ], 1 - DATA[ 'a1' ] )

In [5]:
# Peek at five random rows.
# NOTE: no random_state is passed, so sample() draws from NumPy's global RNG;
# this cell therefore advances the seeded stream that later cells rely on.
DATA.loc[ :, [ 'a1', 'b1', 'label' ] ].sample( n=5 )

Unnamed: 0,a1,b1,label
6357,1,0,0.903201
2150,1,0,1.643511
1609,1,1,-0.897972
9972,0,1,1.051045
1582,0,0,-0.912961


##### Taken separately, a1 and b1 have essentially no linear correlation with label

In [6]:
# Pairwise Pearson correlation matrix of the two binary features and the target.
DATA.loc[ :, [ 'a1', 'b1', 'label' ] ].corr( method='pearson' )

Unnamed: 0,a1,b1,label
a1,1.0,-0.018989,0.006929
b1,-0.018989,1.0,0.002214
label,0.006929,0.002214,1.0


##### But the right combination of the two (an indicator of whether a1 equals b1) is strongly (negatively) correlated with label

In [7]:
# Indicator that is 1 where a1 and b1 agree and 0 where they differ.
a1_equals_b1 = ( DATA[ 'a1' ] == DATA[ 'b1' ] ).astype( int ).values

# pearsonr returns (r, p-value); only the correlation coefficient is shown.
pearsonr( a1_equals_b1, DATA[ 'label' ] )[ 0 ]

-0.7998492144038691

##### a2 and b2 are weaker versions of a1 and b1. They would be redundant in the final set.

In [8]:
# Noisy copies: each base column plus independent N(0, 1) noise.
# The order (a2 first, then b2) fixes the order of the random draws.
for base_col, noisy_col in ( ( 'a1', 'a2' ), ( 'b1', 'b2' ) ):
    DATA[ noisy_col ] = DATA[ base_col ] + np.random.normal( loc=0, scale=1, size=len( DATA ) )

In [9]:
# Five random rows comparing the binary columns with their noisy copies.
# NOTE: sample() without random_state consumes NumPy's global RNG, so this
# cell shifts the random stream seen by the cells that follow.
DATA.loc[ :, [ 'a1', 'a2', 'b1', 'b2' ] ].sample( n=5 )

Unnamed: 0,a1,a2,b1,b2
5733,0,-0.225544,0,1.478414
2626,1,0.631186,1,1.65782
7522,1,-0.429607,0,0.02805
4768,0,-1.145667,1,1.25959
5927,1,2.292783,0,-0.955425


##### c1 is in a simple linear but weak relationship with label

In [10]:
# c1 = label plus N(0, 2) noise (noise std is twice the label's std).
DATA[ 'c1' ] = DATA[ 'label' ] + np.random.normal( loc=0, scale=2, size=len( DATA ) )

##### c2 and c3 are weaker duplicates of c1. They are also not welcome in the final set.

In [11]:
# c2 and c3 are each c1 plus their own independent N(0, 2) noise;
# c2 is drawn first, then c3.
for noisy_col in ( 'c2', 'c3' ):
    DATA[ noisy_col ] = DATA[ 'c1' ] + np.random.normal( loc=0, scale=2, size=len( DATA ) )

In [12]:
# Pearson correlations among the c-family columns and the target.
DATA.loc[ :, [ 'c1', 'c2', 'c3', 'label' ] ].corr( method='pearson' )

Unnamed: 0,c1,c2,c3,label
c1,1.0,0.74716,0.740863,0.43951
c2,0.74716,1.0,0.55741,0.32982
c3,0.740863,0.55741,1.0,0.32316
label,0.43951,0.32982,0.32316,1.0


##### d1, d2 and d3 are absolute garbage. We definitely do not want to select them.

In [13]:
# Pure-noise columns generated independently of label: d<k> ~ N(k, 1)
# for k = 1, 2, 3, drawn in that order.
for noise_mean in ( 1, 2, 3 ):
    DATA[ 'd%d' % noise_mean ] = np.random.normal( loc=noise_mean, scale=1, size=len( DATA ) )

##### Thus, the most important variables are a1 and b1. c1 brings some added value, too. So, ideally, we would like to select a1, b1 and c1 for fitting a model. We would like to skip the duplicate and garbage variables because they do not provide additional information and only make the model more complicated and harder to compute and optimize.

In [14]:
# Column order of the exported file: target first, then the feature families.
EXPORT_COLUMNS = [ 'label',
                   'a1', 'a2', 'b1', 'b2',
                   'c1', 'c2', 'c3',
                   'd1', 'd2', 'd3' ]

# Write the dataset without the row index.
# NOTE(review): the destination is a hard-coded path under the user's home
# directory; consider a configurable, project-relative output location.
DATA[ EXPORT_COLUMNS ].to_csv( '~/Documents/GitHub/feature-selection-spark/data.csv', index=False )