In [6]:
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pmlb as dsets
import numpy as np
import pickle as pkl
from os.path import join as oj
from copy import deepcopy

# sklearn models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# On which classification dsets does a random forest (rf) outperform logistic regression?
This code downloads and saves all the **classification** dsets, then fits two very quick models (logistic regression and a random forest) to each of them.

In [7]:
# data_dir: local cache directory handed to pmlb when fetching datasets.
# out_dir: directory where the pickled results are written.
data_dir = '/scratch/users/vision/data/pmlb'
out_dir = '/scratch/users/vision/chandan/pmlb'

# Copy the name list so pmlb's own module-level list is not mutated by the removals.
dset_names = deepcopy(dsets.classification_dataset_names)
for excluded_name in ('kddcup', 'mnist', 'poker'):
    # remove biggest dsets (raises ValueError if a name is missing, same as before)
    dset_names.remove(excluded_name)

dset_names = np.array(dset_names)

In [None]:
# Fit a quick logistic regression and a random forest on every dataset in
# dset_names, record each model's held-out accuracy, then save and plot the results.
logit_test_scores = []
rf_test_scores = []
rfs = []

for dset_name in tqdm(dset_names):
    X, y = dsets.fetch_data(dset_name, return_X_y=True,
                            local_cache_dir=data_dir)

    # NOTE: no random_state is passed here or to the forest, so the split and the
    # scores differ from run to run.
    train_X, test_X, train_y, test_y = train_test_split(X, y)

    # liblinear best for small dsets, otherwise lbfgs
    logit = LogisticRegression(solver='liblinear', multi_class='auto')
    rf = RandomForestClassifier(n_estimators=100)
    logit.fit(train_X, train_y)
    rf.fit(train_X, train_y)

    logit_test_scores.append(logit.score(test_X, test_y))
    rf_test_scores.append(rf.score(test_X, test_y))
    # rf is a brand-new estimator each iteration, so it can be stored without copying
    rfs.append(rf)

# save
logit_test_scores = np.array(logit_test_scores)
rf_test_scores = np.array(rf_test_scores)
classification_results = {'logit_test_scores': logit_test_scores,
                          'rf_test_scores': rf_test_scores,
                          'dset_names': dset_names,
                          'rfs': rfs}
# Context manager guarantees the file is flushed and closed even if pickling fails.
with open(oj(out_dir, 'classification_results.pkl'), 'wb') as results_file:
    pkl.dump(classification_results, results_file)

# plot
sns.boxplot(data=[logit_test_scores, rf_test_scores], notch=False)
plt.xticks([0, 1], ['Logistic Regression', 'Random Forest'])
plt.ylabel('Test Accuracy')
plt.show()

 44%|████▎     | 71/163 [00:24<00:28,  3.17it/s]

analyze results

In [8]:
# Reload the saved results and take the scores from the loaded dict, so this cell
# does not depend on variables left over from the (slow) training cell.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(oj(out_dir, 'classification_results.pkl'), 'rb') as results_file:
    classification_results = pkl.load(results_file)

rf_test_scores = classification_results['rf_test_scores']
logit_test_scores = classification_results['logit_test_scores']

# Boolean mask of dsets where rf test accuracy exceeds logistic's by more than 0.1;
# the sum counts how many such dsets there are.
idxs = rf_test_scores - logit_test_scores > 0.1
np.sum(idxs)

37

# rank features

look at one dset

In [None]:
# Fit both quick models on a single dataset (the first classification dset).
dset_name = dsets.classification_dataset_names[0]
# Fetch by dset_name; the previous argument name was never defined (NameError).
X, y = dsets.fetch_data(dset_name, return_X_y=True,
                        local_cache_dir=data_dir)
train_X, test_X, train_y, test_y = train_test_split(X, y)

# liblinear best for small dsets, otherwise lbfgs
logit = LogisticRegression(solver='liblinear', multi_class='auto')
rf = RandomForestClassifier(n_estimators=100)
print(dset_name, X.shape)
logit.fit(train_X, train_y)
rf.fit(train_X, train_y)

# Keep this dataset's scores in their own variables: the global score containers
# are numpy arrays after the training cell (no .append), and appending to them
# would also break their alignment with dset_names.
logit_test_score = logit.score(test_X, test_y)
rf_test_score = rf.score(test_X, test_y)
logit_test_score, rf_test_score

In [None]:
def rank_features_single(train_X, train_y):
    """Rank individual features by importance (stub - not implemented yet).

    Intended contract, as stated in the original notes:
        train_X: N x p
        train_y: N x 1
        returns: list of the p feature idxs ranked by how important each of
                 them is - each in the range [0, p)

    There is no implementation yet; the function currently returns None.
    """
    return None

def rank_pairwise_interactions(train_X, train_y):
    """Rank pairwise feature interactions by importance (stub - not implemented yet).

    Intended contract, as stated in the original notes:
        train_X: N x p
        train_y: N x 1
        returns: list of tuples of idxs ranking how important each of the
                 pairwise interactions is - each idx in the range [0, p)

    There is no implementation yet; the function currently returns None.
    """
    return None

def add_features_and_train_logistic(train_X, test_X, train_y, test_y, idxs):
    """Retrain logistic regression after adding features (stub - not implemented yet).

    Intended contract, as stated in the original notes:
        train_X: N x p
        test_X: N x p
        train_y: N x 1
        test_y: N x 1
        idxs: list of either individual features or pairs of features to add
              to the logistic regression
        returns: train + test accuracy after adding each of the idxs

    There is no implementation yet; the function currently returns None.
    (The def line previously lacked its trailing colon, which was a SyntaxError.)
    """
    return None