In [1]:
%run data_package_loading.py # Code loads data as well as packages that are relevant across most project phases
%matplotlib inline

uci_features = ['28',  '48',  '64', '105', '128', '153', '241', '281', '318', '336', 
                '338', '378', '433', '442', '451', '453', '455', '472', '475', '493']

madelon_features = ['feat_257', 'feat_269', 'feat_308', 'feat_315', 'feat_336',
                   'feat_341', 'feat_395', 'feat_504', 'feat_526', 'feat_639',
                   'feat_681', 'feat_701', 'feat_724', 'feat_736', 'feat_769',
                   'feat_808', 'feat_829', 'feat_867', 'feat_920', 'feat_956']

Xuci_1 = Xuci_1[uci_features]
Xuci_2 = Xuci_2[uci_features]
Xuci_3 = Xuci_3[uci_features]

Xdb_1 = Xdb_1[madelon_features]
Xdb_2 = Xdb_2[madelon_features]
Xdb_3 = Xdb_3[madelon_features]

In [2]:
from sklearn.feature_selection import SelectKBest, RFE, SelectFromModel 
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV



## Trying SelectKBest()

In [3]:
def skb_features(X, y, k=5):
    skb = SelectKBest(k=k)
    skb.fit(X, y)
    
    ps = list(skb.pvalues_)
    features = skb.get_support()
    
    return ps, features

In [4]:
Xuci_1_pvals, Xuci_1_kfeatures = skb_features(Xuci_1, yuci_1)
Xuci_2_pvals, Xuci_2_kfeatures = skb_features(Xuci_2, yuci_2)
Xuci_3_pvals, Xuci_3_kfeatures = skb_features(Xuci_3, yuci_3)

Xdb_1_pvals, Xdb_1_kfeatures = skb_features(Xdb_1, ydb_1)
Xdb_2_pvals, Xdb_2_kfeatures = skb_features(Xdb_2, ydb_2)
Xdb_3_pvals, Xdb_3_kfeatures = skb_features(Xdb_3, ydb_3)


In [5]:
print(Xuci_1.columns[Xuci_1_kfeatures])
print(Xuci_2.columns[Xuci_2_kfeatures])
print(Xuci_3.columns[Xuci_3_kfeatures])

Index(['64', '128', '241', '336', '475'], dtype='object')
Index(['241', '338', '442', '472', '475'], dtype='object')
Index(['28', '64', '241', '336', '475'], dtype='object')


Similar but not identical results

In [6]:
print(Xdb_1.columns[Xdb_1_kfeatures])
print(Xdb_2.columns[Xdb_2_kfeatures])
print(Xdb_3.columns[Xdb_3_kfeatures])

Index(['feat_269', 'feat_341', 'feat_681', 'feat_769', 'feat_920'], dtype='object')
Index(['feat_269', 'feat_341', 'feat_681', 'feat_769', 'feat_920'], dtype='object')
Index(['feat_269', 'feat_341', 'feat_681', 'feat_701', 'feat_920'], dtype='object')


Very similar results

### SKB isn't returning consitent matches, but what if we bootstrap each sample?

In [24]:
type(yuci_1)

pandas.core.series.Series

In [23]:
test.values()

dict_values([[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []])

In [31]:
def boot_skb(X, y, k=5, boots=20):
    np.random.seed(42) # set that seed
    size = len(X) // 2 # number of cases to include in each bootstrap sample
    
    pvals = dict.fromkeys(X.columns, []) # store list of pvals for each feature
    votes = dict.fromkeys(X.columns, []) # store list of votes for each feature
    
    for i in range(boots):
        boot_index = np.random.choice(X.index.tolist(), size = size, replace=True)
        
        X_bs = X.iloc[boot_index, :]
        y_bs = y[boot_index]
        
        print(X_bs.shape, y_bs.shape)
        
        ps, features = skb_features(X_bs, y_bs, k=k)
        
        for i, col in enumerate(X_bs.columns):
            pvals[col].append(ps[i])
            votes[col].append(features[i])
            
    pvals_df = pd.DataFrame(pvals)
    votes_df = pd.DataFrame(votes)
    
#     mean_pvals = pvals_df.mean()
#     sum_votes = votes_df.sum()
    
#     return mean_pvals, sum_votes
    return pvals_df, votes_df

In [32]:
Xuci_1_bs_pvals, Xuci_1_bs_kfeatures = boot_skb(Xuci_1, yuci_1)


In [34]:
Xuci_1_bs_pvals.head()

Unnamed: 0,105,128,153,241,28,281,318,336,338,378,433,442,451,453,455,472,475,48,493,64
0,0.434379,0.434379,0.434379,0.434379,0.434379,0.434379,0.434379,0.434379,0.434379,0.434379,0.434379,0.434379,0.434379,0.434379,0.434379,0.434379,0.434379,0.434379,0.434379,0.434379
1,0.441944,0.441944,0.441944,0.441944,0.441944,0.441944,0.441944,0.441944,0.441944,0.441944,0.441944,0.441944,0.441944,0.441944,0.441944,0.441944,0.441944,0.441944,0.441944,0.441944
2,0.045172,0.045172,0.045172,0.045172,0.045172,0.045172,0.045172,0.045172,0.045172,0.045172,0.045172,0.045172,0.045172,0.045172,0.045172,0.045172,0.045172,0.045172,0.045172,0.045172
3,0.907472,0.907472,0.907472,0.907472,0.907472,0.907472,0.907472,0.907472,0.907472,0.907472,0.907472,0.907472,0.907472,0.907472,0.907472,0.907472,0.907472,0.907472,0.907472,0.907472
4,0.945659,0.945659,0.945659,0.945659,0.945659,0.945659,0.945659,0.945659,0.945659,0.945659,0.945659,0.945659,0.945659,0.945659,0.945659,0.945659,0.945659,0.945659,0.945659,0.945659


In [35]:
Xuci_1_pvals

[0.85418478110638096,
 0.01591694893217465,
 0.00077199864134114738,
 0.0023714703575487659,
 0.0021655023778543764,
 0.17490184745780707,
 0.00010657986778399847,
 0.18900330082692063,
 0.74528193128166653,
 0.0011154458950320479,
 0.076050927383207276,
 0.023869593040519856,
 0.27553339989094505,
 0.28947115840947424,
 0.79353599411615794,
 0.2857258563381041,
 0.19600953292858772,
 0.30384586233707839,
 0.00026968129063236797,
 0.25753056628705001]