In [1]:
%run data_package_loading.py # Code loads data as well as packages that are relevant across most project phases
%matplotlib inline

# Shortlisted column names (20 per data source) that the rest of the notebook works with.
uci_features = ['28',  '48',  '64', '105', '128', '153', '241', '281', '318', '336', 
                '338', '378', '433', '442', '451', '453', '455', '472', '475', '493']

madelon_features = ['feat_257', 'feat_269', 'feat_308', 'feat_315', 'feat_336',
                   'feat_341', 'feat_395', 'feat_504', 'feat_526', 'feat_639',
                   'feat_681', 'feat_701', 'feat_724', 'feat_736', 'feat_769',
                   'feat_808', 'feat_829', 'feat_867', 'feat_920', 'feat_956']

# Keep only the shortlisted columns in each sample.
# NOTE(review): the Xuci_* / Xdb_* frames are presumably created by data_package_loading.py
# (run at the top of this cell) -- confirm. Because the same names are rebound here, the
# full-width frames are no longer reachable afterwards without re-running that script.
Xuci_1 = Xuci_1[uci_features]
Xuci_2 = Xuci_2[uci_features]
Xuci_3 = Xuci_3[uci_features]

Xdb_1 = Xdb_1[madelon_features]
Xdb_2 = Xdb_2[madelon_features]
Xdb_3 = Xdb_3[madelon_features]

# !conda install -y psycopg2

from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, RFECV, SelectFromModel, SelectKBest
from sklearn.linear_model import LogisticRegression
# GridSearchCV is imported from model_selection: the old sklearn.grid_search
# module was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Trying SelectKBest()

In [9]:
def skb_features(X, y, k=5):
    """Fit a ``SelectKBest`` selector and report its p-values and support mask.

    Parameters
    ----------
    X : feature matrix, passed unchanged to ``SelectKBest.fit``.
    y : target values, passed unchanged to ``SelectKBest.fit``.
    k : int, default 5
        Number of features the selector keeps. The selector's default scoring
        function is used.

    Returns
    -------
    tuple
        ``(p_values, support)``: the fitted selector's ``pvalues_`` converted
        to a plain list, and whatever ``get_support()`` returns.
    """
    selector = SelectKBest(k=k)
    selector.fit(X, y)
    return list(selector.pvalues_), selector.get_support()

In [10]:
# One SelectKBest fit (default k=5) per sample: three UCI samples, then three database samples.
# Each call yields the per-feature p-values and the selector's support mask.
Xuci_1_pvals, Xuci_1_kfeatures = skb_features(Xuci_1, yuci_1)
Xuci_2_pvals, Xuci_2_kfeatures = skb_features(Xuci_2, yuci_2)
Xuci_3_pvals, Xuci_3_kfeatures = skb_features(Xuci_3, yuci_3)

Xdb_1_pvals, Xdb_1_kfeatures = skb_features(Xdb_1, ydb_1)
Xdb_2_pvals, Xdb_2_kfeatures = skb_features(Xdb_2, ydb_2)
Xdb_3_pvals, Xdb_3_kfeatures = skb_features(Xdb_3, ydb_3)


In [11]:
# Column names kept by the SelectKBest mask of each UCI sample.
for _frame, _mask in ((Xuci_1, Xuci_1_kfeatures),
                      (Xuci_2, Xuci_2_kfeatures),
                      (Xuci_3, Xuci_3_kfeatures)):
    print(_frame.columns[_mask])

Index(['64', '128', '241', '336', '475'], dtype='object')
Index(['241', '338', '442', '472', '475'], dtype='object')
Index(['28', '64', '241', '336', '475'], dtype='object')


Similar but not identical results

In [12]:
# Column names kept by the SelectKBest mask of each database sample.
for _frame, _mask in ((Xdb_1, Xdb_1_kfeatures),
                      (Xdb_2, Xdb_2_kfeatures),
                      (Xdb_3, Xdb_3_kfeatures)):
    print(_frame.columns[_mask])

Index(['feat_269', 'feat_341', 'feat_681', 'feat_769', 'feat_920'], dtype='object')
Index(['feat_269', 'feat_341', 'feat_681', 'feat_769', 'feat_920'], dtype='object')
Index(['feat_269', 'feat_341', 'feat_681', 'feat_701', 'feat_920'], dtype='object')


Very similar results

### SKB isn't returning consistent matches, but what if we bootstrap each sample?

In [137]:
def boot_skb(X, y, k=5, boots=20):
    """Run ``skb_features`` on bootstrap resamples and aggregate per feature.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; ``X.columns`` label the returned Series.
    y : pandas.Series or array-like
        Targets, aligned with ``X`` row by row (by position).
    k : int, default 5
        Number of features ``skb_features`` keeps on each resample.
    boots : int, default 20
        Number of bootstrap resamples to draw.

    Returns
    -------
    tuple of pandas.Series
        ``(mean_pvals, sum_votes)``: the per-feature mean of the p-values over
        all resamples, and the per-feature sum of the support masks (i.e. how
        many resamples selected each feature).
    """
    np.random.seed(42)  # fixed seed: every call draws the same sequence of resamples
    n_rows = len(X)  # each resample has as many rows as the original data

    pvals = []  # one list of p-values per resample
    votes = []  # one support mask per resample

    for _ in range(boots):
        # Draw row *positions*, not index labels. The rows are taken with
        # .iloc (positional), so sampling labels would pick wrong rows or
        # raise IndexError whenever X does not carry a plain 0..n-1 index.
        rows = np.random.choice(n_rows, size=n_rows, replace=True)

        X_bs = X.iloc[rows, :]
        # Take y by position as well so features and targets stay aligned.
        y_bs = y.iloc[rows] if hasattr(y, "iloc") else np.asarray(y)[rows]

        ps, features = skb_features(X_bs, y_bs, k=k)

        pvals.append(ps)
        votes.append(features)

    pvals_df = pd.DataFrame(pvals, columns=X.columns)
    votes_df = pd.DataFrame(votes, columns=X.columns)

    mean_pvals = pvals_df.mean()
    sum_votes = votes_df.sum()

    return mean_pvals, sum_votes


In [139]:
# Bootstrapped SelectKBest (default k=5, 1000 resamples) for every sample.
# This rebinds the *_pvals / *_kfeatures names: they now hold the mean p-values and
# the summed support masks returned by boot_skb instead of a single fit's results.
Xuci_1_pvals, Xuci_1_kfeatures = boot_skb(Xuci_1, yuci_1, boots = 1000)
Xuci_2_pvals, Xuci_2_kfeatures = boot_skb(Xuci_2, yuci_2, boots = 1000)
Xuci_3_pvals, Xuci_3_kfeatures = boot_skb(Xuci_3, yuci_3, boots = 1000)

Xdb_1_pvals, Xdb_1_kfeatures = boot_skb(Xdb_1, ydb_1, boots = 1000)
Xdb_2_pvals, Xdb_2_kfeatures = boot_skb(Xdb_2, ydb_2, boots = 1000)
Xdb_3_pvals, Xdb_3_kfeatures = boot_skb(Xdb_3, ydb_3, boots = 1000)

In [140]:
# The five most-voted features of each UCI sample, shown as an (unordered) set.
for _votes in (Xuci_1_kfeatures, Xuci_2_kfeatures, Xuci_3_kfeatures):
    _ranked = _votes.sort_values(ascending=False)
    print(set(_ranked[:5].index))

{'336', '128', '475', '64', '241'}
{'472', '338', '475', '442', '241'}
{'28', '336', '475', '64', '241'}


In [141]:
# The five most-voted features of each database sample, shown as an (unordered) set.
for _votes in (Xdb_1_kfeatures, Xdb_2_kfeatures, Xdb_3_kfeatures):
    _ranked = _votes.sort_values(ascending=False)
    print(set(_ranked[:5].index))

{'feat_769', 'feat_920', 'feat_341', 'feat_269', 'feat_681'}
{'feat_769', 'feat_920', 'feat_341', 'feat_269', 'feat_681'}
{'feat_920', 'feat_341', 'feat_701', 'feat_269', 'feat_681'}


Neither set of samples is returning consistent sets of features when using 1000 bootstrap samples! Before moving on, let's check the top 10 features.

In [143]:
# Same bootstrap as before, but keeping the top 10 features per resample (k=10).
# The *_pvals / *_kfeatures names are rebound again, replacing the k=5 results.
Xuci_1_pvals, Xuci_1_kfeatures = boot_skb(Xuci_1, yuci_1, k=10, boots = 1000)
Xuci_2_pvals, Xuci_2_kfeatures = boot_skb(Xuci_2, yuci_2, k=10, boots = 1000)
Xuci_3_pvals, Xuci_3_kfeatures = boot_skb(Xuci_3, yuci_3, k=10, boots = 1000)

Xdb_1_pvals, Xdb_1_kfeatures = boot_skb(Xdb_1, ydb_1, k=10, boots = 1000)
Xdb_2_pvals, Xdb_2_kfeatures = boot_skb(Xdb_2, ydb_2, k=10, boots = 1000)
Xdb_3_pvals, Xdb_3_kfeatures = boot_skb(Xdb_3, ydb_3, k=10, boots = 1000)

In [148]:
# Ten most-voted features of each UCI sample, with the names sorted for easier comparison.
for _votes in (Xuci_1_kfeatures, Xuci_2_kfeatures, Xuci_3_kfeatures):
    _top10 = _votes.sort_values(ascending=False)[:10]
    print(_top10.index.sort_values())

Index(['105', '128', '153', '241', '336', '338', '378', '475', '48', '64'], dtype='object')
Index(['128', '241', '336', '338', '442', '453', '472', '475', '493', '64'], dtype='object')
Index(['241', '28', '318', '336', '451', '453', '475', '48', '493', '64'], dtype='object')


In [150]:
# Ten most-voted features of each database sample, with the names sorted for easier comparison.
for _votes in (Xdb_1_kfeatures, Xdb_2_kfeatures, Xdb_3_kfeatures):
    _top10 = _votes.sort_values(ascending=False)[:10]
    print(_top10.index.sort_values())

Index(['feat_269', 'feat_336', 'feat_341', 'feat_681', 'feat_701', 'feat_736',
       'feat_769', 'feat_808', 'feat_829', 'feat_920'],
      dtype='object')
Index(['feat_269', 'feat_315', 'feat_336', 'feat_341', 'feat_681', 'feat_701',
       'feat_769', 'feat_808', 'feat_829', 'feat_920'],
      dtype='object')
Index(['feat_269', 'feat_315', 'feat_341', 'feat_681', 'feat_701', 'feat_736',
       'feat_769', 'feat_808', 'feat_829', 'feat_920'],
      dtype='object')


# RFE

In [125]:
def get_rfe(X, y, estimator = DecisionTreeClassifier(max_depth=10), n_features_to_select=5):
    """Select features by recursive feature elimination (RFE).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; the returned labels come from ``X.columns``.
    y : array-like
        Target values passed to ``RFE.fit``.
    estimator : estimator handed to ``RFE``
        Defaults to a depth-10 decision tree. The default instance is created
        once, when this definition is executed, and is shared by every call
        that omits the argument.
    n_features_to_select : int, default 5
        How many features RFE keeps. Previously hard-coded to 5; exposed so
        callers can vary it the same way ``skb_features`` exposes ``k``.

    Returns
    -------
    pandas.Index
        Labels of the columns RFE retained.
    """
    # NOTE(review): the default tree has no random_state, so repeated runs may
    # keep different features -- confirm whether that is intended.
    rfe = RFE(estimator=estimator, n_features_to_select=n_features_to_select)
    rfe.fit(X, y)

    return X.columns[rfe.get_support()]
     


In [117]:
# RFE with the default decision-tree estimator on every sample.
# NOTE(review): these are bare expressions; all six results only render if the
# notebook is configured to display every expression in a cell -- confirm.
get_rfe(Xuci_1, yuci_1)
get_rfe(Xuci_2, yuci_2)
get_rfe(Xuci_3, yuci_3)

# Fixed: this line previously read `get_rfe(Xdb.sort_values())_1, ydb_1)`, a syntax error.
get_rfe(Xdb_1, ydb_1)
get_rfe(Xdb_2, ydb_2)
get_rfe(Xdb_3, ydb_3)

Index(['105', '153', '241', '442', '493'], dtype='object')
Index(['48', '105', '153', '338', '475'], dtype='object')
Index(['48', '153', '318', '338', '493'], dtype='object')
Index(['feat_257', 'feat_269', 'feat_341', 'feat_808', 'feat_920'], dtype='object')
Index(['feat_269', 'feat_395', 'feat_504', 'feat_769', 'feat_829'], dtype='object')
Index(['feat_269', 'feat_308', 'feat_724', 'feat_769', 'feat_829'], dtype='object')


In [118]:
# Same RFE selection, but driven by a fresh default LogisticRegression for each call.
get_rfe(Xuci_1, yuci_1, LogisticRegression())
get_rfe(Xuci_2, yuci_2, LogisticRegression())
get_rfe(Xuci_3, yuci_3, LogisticRegression())

get_rfe(Xdb_1, ydb_1, LogisticRegression())
get_rfe(Xdb_2, ydb_2, LogisticRegression())
get_rfe(Xdb_3, ydb_3, LogisticRegression())

Index(['28', '128', '153', '318', '433'], dtype='object')
Index(['28', '48', '128', '378', '451'], dtype='object')
Index(['28', '48', '128', '281', '378'], dtype='object')
Index(['feat_257', 'feat_269', 'feat_308', 'feat_681', 'feat_829'], dtype='object')
Index(['feat_257', 'feat_269', 'feat_308', 'feat_681', 'feat_829'], dtype='object')
Index(['feat_257', 'feat_269', 'feat_395', 'feat_681', 'feat_829'], dtype='object')


Also getting inconsistent results, particularly across estimators. Let's try bootstrapping here as well.

In [126]:
def boot_rfe(X, y, estimator = DecisionTreeClassifier(max_depth=10), boots=20):
    """Run ``get_rfe`` on bootstrap resamples and count how often each feature is kept.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series or array-like
        Targets, aligned with ``X`` row by row (by position).
    estimator : estimator forwarded to ``get_rfe``
        Defaults to a depth-10 decision tree (one shared instance, created
        when this definition is executed).
    boots : int, default 20
        Number of bootstrap resamples to draw.

    Returns
    -------
    dict
        Maps each feature label selected at least once to the number of
        resamples in which ``get_rfe`` selected it. Features that were never
        selected are absent.
    """
    # Seed the *global* NumPy RNG on purpose.
    # NOTE(review): estimators left at random_state=None presumably draw from
    # this same global stream, so this seed is also what makes their fits
    # repeatable -- confirm before switching to a local generator.
    np.random.seed(42)
    n_rows = len(X)  # each resample has as many rows as the original data

    votes = {}  # feature label -> number of resamples that selected it

    for _ in range(boots):
        # Draw row *positions*, not index labels. The rows are taken with
        # .iloc (positional), so sampling labels would pick wrong rows or
        # raise IndexError whenever X does not carry a plain 0..n-1 index.
        rows = np.random.choice(n_rows, size=n_rows, replace=True)

        X_bs = X.iloc[rows, :]
        # Take y by position as well so features and targets stay aligned.
        y_bs = y.iloc[rows] if hasattr(y, "iloc") else np.asarray(y)[rows]

        for fea in get_rfe(X_bs, y_bs, estimator):
            votes[fea] = votes.get(fea, 0) + 1

    return votes


In [133]:
# Bootstrapped RFE (default decision-tree estimator, 1000 resamples) for every sample.
# Each result is a dict of feature label -> number of resamples that selected it.
Xuci_1_rfe_features = boot_rfe(Xuci_1, yuci_1, boots = 1000)
Xuci_2_rfe_features = boot_rfe(Xuci_2, yuci_2, boots = 1000)
Xuci_3_rfe_features = boot_rfe(Xuci_3, yuci_3, boots = 1000)

Xdb_1_rfe_features = boot_rfe(Xdb_1, ydb_1, boots = 1000)
Xdb_2_rfe_features = boot_rfe(Xdb_2, ydb_2, boots = 1000)
Xdb_3_rfe_features = boot_rfe(Xdb_3, ydb_3, boots = 1000)


In [134]:
# Five most frequently selected features per UCI sample, most-voted first.
for _counts in (Xuci_1_rfe_features, Xuci_2_rfe_features, Xuci_3_rfe_features):
    _ranked = pd.Series(_counts).sort_values(ascending = False)
    print(_ranked[:5].index)

Index(['338', '48', '105', '442', '241'], dtype='object')
Index(['338', '48', '475', '105', '241'], dtype='object')
Index(['338', '378', '318', '241', '442'], dtype='object')


In [135]:
# Five most frequently selected features per database sample, most-voted first.
for _counts in (Xdb_1_rfe_features, Xdb_2_rfe_features, Xdb_3_rfe_features):
    _ranked = pd.Series(_counts).sort_values(ascending = False)
    print(_ranked[:5].index)

Index(['feat_269', 'feat_808', 'feat_341', 'feat_829', 'feat_504'], dtype='object')
Index(['feat_269', 'feat_808', 'feat_681', 'feat_724', 'feat_395'], dtype='object')
Index(['feat_269', 'feat_808', 'feat_724', 'feat_829', 'feat_769'], dtype='object')


Blast! Still inconsistent results

### Let's take a look at RandomForest's feature importance

In [3]:
from sklearn.ensemble import RandomForestClassifier
# sklearn.grid_search was removed in scikit-learn 0.20; GridSearchCV lives in model_selection.
from sklearn.model_selection import GridSearchCV

In [4]:
def rfc_gridsearch(X, y, random_state=None):
    """Grid-search a random forest and report the best model's feature importances.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; ``X.columns`` supply the feature names in the result.
    y : array-like
        Target values.
    random_state : int or None, default None
        Forwarded to ``RandomForestClassifier``. ``None`` keeps the previous
        (unseeded) behaviour; pass an integer for repeatable importances.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Feature'`` and ``'Importance'``, one row per column of
        ``X`` in the original column order (not sorted by importance).
    """
    # 'auto' was dropped from max_features: for classifiers it is an alias of
    # 'sqrt' (so it only duplicated work) and scikit-learn >= 1.3 rejects it.
    rfc_param = {'n_estimators': [10, 50, 100, 500, 1000],
                 'max_depth': [1, 5, 10, None],
                 'max_features': ['log2', 'sqrt']}

    rfc_gs = GridSearchCV(RandomForestClassifier(random_state=random_state),
                          rfc_param,
                          cv=5,       # 5-fold cross-validation per parameter combination
                          n_jobs=-1)  # use every available core

    rfc_gs.fit(X, y)

    importance_df = pd.DataFrame(list(zip(X.columns, rfc_gs.best_estimator_.feature_importances_)),
                                 columns=['Feature', 'Importance'])

    return importance_df

In [5]:
# One random-forest grid search per sample; each result is a Feature/Importance table.
uci1_rfgs_importance = rfc_gridsearch(Xuci_1, yuci_1)
# Fixed: this result was previously assigned to uci1_rfgs_importance, which overwrote
# sample 1 and left uci2_rfgs_importance undefined on a fresh kernel.
uci2_rfgs_importance = rfc_gridsearch(Xuci_2, yuci_2)
uci3_rfgs_importance = rfc_gridsearch(Xuci_3, yuci_3)
db1_rfgs_importance = rfc_gridsearch(Xdb_1, ydb_1)
db2_rfgs_importance = rfc_gridsearch(Xdb_2, ydb_2)
db3_rfgs_importance = rfc_gridsearch(Xdb_3, ydb_3)

In [6]:
# NOTE(review): scratch cell with no effect on the analysis -- looks like a leftover
# check that the kernel was responsive; presumably safe to delete.
2+2

4

In [16]:
# First five rows of uci1_rfgs_importance after sorting by descending importance,
# transposed so each feature is a column.
uci1_rfgs_importance.sort_values('Importance', ascending=False).head().T

Unnamed: 0,10,17,8,13,1
Feature,338.0,472.0,318.0,442.0,48.0
Importance,0.0652007,0.0619885,0.0619427,0.0605235,0.0569306


In [18]:
# First five rows of uci2_rfgs_importance after sorting by descending importance,
# transposed so each feature is a column.
uci2_rfgs_importance.sort_values('Importance', ascending=False).head().T

Unnamed: 0,10,3,4,1,18
Feature,338.0,105.0,128.0,48.0,475.0
Importance,0.0751968,0.0643718,0.0595751,0.0563328,0.0557993


In [17]:
# First five rows of uci3_rfgs_importance after sorting by descending importance,
# transposed so each feature is a column.
uci3_rfgs_importance.sort_values('Importance', ascending=False).head().T

Unnamed: 0,1,11,10,3,17
Feature,48.0,378.0,338.0,105.0,472.0
Importance,0.0625072,0.0604073,0.0590446,0.0587472,0.0562348


In [19]:
# First five rows of db1_rfgs_importance after sorting by descending importance,
# transposed so each feature is a column.
db1_rfgs_importance.sort_values('Importance', ascending=False).head().T

Unnamed: 0,1,15,18,5,3
Feature,feat_269,feat_808,feat_920,feat_341,feat_315
Importance,0.0690522,0.0619324,0.0541012,0.0540041,0.0532551


In [20]:
# First five rows of db2_rfgs_importance after sorting by descending importance,
# transposed so each feature is a column.
db2_rfgs_importance.sort_values('Importance', ascending=False).head().T

Unnamed: 0,10,1,12,6,3
Feature,feat_681,feat_269,feat_724,feat_395,feat_315
Importance,0.0635261,0.0619245,0.0603386,0.0565221,0.0543787


In [21]:
# First five rows of db3_rfgs_importance after sorting by descending importance,
# transposed so each feature is a column.
db3_rfgs_importance.sort_values('Importance', ascending=False).head().T

Unnamed: 0,1,12,10,15,14
Feature,feat_269,feat_724,feat_681,feat_808,feat_769
Importance,0.071768,0.0642979,0.0567224,0.0561686,0.0541231
