In [49]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
# Fix the state of NumPy's global random generator so the draws below repeat on re-run.
np.random.seed(seed=333)

##### A1 is a binary variable

In [3]:
# Single-column frame: 10000 Bernoulli(0.5) draws stored as column 'A1'.
DATA = pd.Series(
    np.random.binomial(n=1, p=0.5, size=10000),
    name='A1',
).to_frame()

##### Y is a continuous target variable

In [4]:
# Target column: 10000 draws from a normal distribution with mean 0 and standard deviation 1.
DATA['Y'] = np.random.normal(loc=0, scale=1, size=10000)

##### B1 is closely related to A1 in a way similar to the logical XOR relationship. When A1 and B1 are equal (0 and 0, 1 and 1), the target (Y) takes low values. When they are different (0 and 1, 1 and 0), Y takes high values. This kind of relationship is usually hard to capture with simple modeling techniques unless it is known explicitly.

In [5]:
# B1 copies A1 where Y is negative and is the complement of A1 everywhere else.
DATA['B1'] = np.where(
    DATA['Y'].lt(0),
    DATA['A1'],
    1 - DATA['A1'],
)

In [37]:
# Display five randomly picked rows of A1, B1 and Y.
DATA.loc[:, ['A1', 'B1', 'Y']].sample(n=5)

Unnamed: 0,A1,B1,Y
8041,1,0,0.535345
8515,1,1,-0.589059
741,0,1,0.628249
6758,1,1,-0.557248
8967,0,0,-1.17318


##### Separately, A1 and B1 have no linear correlation with Y

In [52]:
# Pairwise Pearson correlation matrix of A1, B1 and Y.
DATA.loc[:, ['A1', 'B1', 'Y']].corr(method='pearson')

Unnamed: 0,A1,B1,Y
A1,1.0,-0.018989,0.006929
B1,-0.018989,1.0,0.002214
Y,0.006929,0.002214,1.0


##### But the right combination of the two (an indicator of whether A1 equals B1) is strongly correlated with Y

In [14]:
# Pearson correlation between Y and the 0/1 indicator "A1 equals B1";
# index 0 of the pearsonr result is the correlation coefficient.
pearsonr(
    np.where(DATA['A1'].eq(DATA['B1']), 1, 0),
    DATA['Y'],
)[0]

-0.7998492144038691

##### A2 and B2 are weaker versions of A1 and B1. They would be redundant in the final set.

In [41]:
# A2 and B2 are A1 and B1 with independent normal(0, 1) noise added.
# A2 is drawn first, then B2, so the order of random draws is fixed.
DATA['A2'] = DATA['A1'] + np.random.normal(loc=0, scale=1, size=10000)
DATA['B2'] = DATA['B1'] + np.random.normal(loc=0, scale=1, size=10000)

In [44]:
# Display five randomly picked rows of the A and B columns side by side.
DATA.loc[:, ['A1', 'A2', 'B1', 'B2']].sample(n=5)

Unnamed: 0,A1,A2,B1,B2
9957,0,1.434441,1,-0.795032
3311,1,2.825751,1,-0.030232
7563,0,0.535326,0,-1.360884
3747,1,0.104514,0,-1.576355
5469,0,0.316567,0,-0.000818


##### C1 has a simple linear but weak relationship with Y

In [27]:
# C1 is Y plus normal noise with mean 0 and standard deviation 2.
DATA['C1'] = DATA['Y'] + np.random.normal(loc=0, scale=2, size=10000)

##### C2 and C3 are weaker duplicates of C1. They are also not welcome in the final set.

In [38]:
# C2 and C3 are C1 with independent normal(0, 1) noise added.
# C2 is drawn first, then C3, so the order of random draws is fixed.
DATA['C2'] = DATA['C1'] + np.random.normal(loc=0, scale=1, size=10000)
DATA['C3'] = DATA['C1'] + np.random.normal(loc=0, scale=1, size=10000)

In [39]:
# Pairwise Pearson correlation matrix of the C columns and Y.
DATA.loc[:, ['C1', 'C2', 'C3', 'Y']].corr(method='pearson')

Unnamed: 0,C1,C2,C3,Y
C1,1.0,0.911864,0.911564,0.439423
C2,0.911864,1.0,0.833848,0.399192
C3,0.911564,0.833848,1.0,0.397393
Y,0.439423,0.399192,0.397393,1.0


##### D1, D2 and D3 are absolute garbage. We definitely do not want to select them.

In [40]:
# D1, D2 and D3 are normal draws with means 1, 2 and 3 and standard deviation 1.
# They are generated without reference to any other column.
DATA['D1'] = np.random.normal(loc=1, scale=1, size=10000)
DATA['D2'] = np.random.normal(loc=2, scale=1, size=10000)
DATA['D3'] = np.random.normal(loc=3, scale=1, size=10000)

##### Thus, the most important variables are A1 and B1. C1 brings some added value, too. So, ideally, we would like to select A1, B1 and C1 for fitting a model. We would like to skip the duplicate and garbage variables because they do not provide additional information and only make the model more complicated and harder to compute and optimize.

In [54]:
# Write the feature columns followed by the target Y to CSV, without the row index.
# NOTE(review): the destination is a hard-coded path under the user's home directory.
DATA.to_csv(
    '~/Documents/GitHub/feature-selection-spark/data.csv',
    columns=['A1', 'A2', 'B1', 'B2', 'C1', 'C2', 'C3', 'D1', 'D2', 'D3', 'Y'],
    index=False,
)