In [2]:
import numpy as np
import openml
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold
# from proglearn.forest import UncertaintyForest

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import adjusted_rand_score
from sklearn.cluster import KMeans

from graspologic.embed import AdjacencySpectralEmbed
from graspologic.simulations import sbm
from graspologic.plot import heatmap, pairplot

from sklearn.utils.extmath import svd_flip
from sklearn.decomposition._pca import _infer_dimension
from scipy import linalg

import warnings
# NOTE(review): this silences every warning raised after this point, including
# numerical warnings from the SVD / dimension-selection code -- confirm that is intended.
warnings.filterwarnings('ignore')
# Seed NumPy's global random number generator so random draws repeat across runs.
np.random.seed(8889)
%matplotlib inline

In [4]:
from sklearn.utils.extmath import svd_flip
from sklearn.decomposition._pca import _infer_dimension
from scipy import linalg

def minka(X):
    """Estimate the dimensionality of ``X`` with Minka's MLE.

    The singular values of ``X`` are turned into a variance spectrum,
    ``S ** 2 / (n_samples - 1)``, which is handed to scikit-learn's private
    ``_infer_dimension`` helper to pick the number of components.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data matrix; the first axis is taken as the sample axis. The matrix is
        decomposed exactly as given.

    Returns
    -------
    int
        The number of components selected by ``_infer_dimension``.

    Raises
    ------
    Exception
        Anything raised by ``scipy.linalg.svd`` or ``_infer_dimension`` is
        propagated unchanged (e.g. ``scipy.linalg.svd`` rejects non-finite
        input by default).

    Notes
    -----
    NOTE(review): ``X`` is not mean-centered before the SVD, whereas
    scikit-learn's ``PCA`` centers the data first -- confirm this is intended.
    """
    n_samples = X.shape[0]

    # Only the singular values feed the estimate, so the singular vectors are
    # not computed at all; this avoids allocating an (n_samples x k) matrix.
    singular_values = linalg.svd(X, compute_uv=False)

    # Variance carried by each singular direction.
    explained_variance = (singular_values ** 2) / (n_samples - 1)

    return _infer_dimension(explained_variance, n_samples)

In [5]:
from graspologic.embed import select_dimension

In [6]:
# Fetch the OpenML-CC18 benchmark suite; its task ids are what the benchmark loop iterates over.
suite_name = 'OpenML-CC18'
benchmark_suite = openml.study.get_suite(suite_name)

In [7]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from ya_pca.linalg_utils import svd_wrapper
from ya_pca.viz import scree_plot

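# Make a random array and then make it positive-definite
# (NOTE(review): nothing in this cell does that -- possibly a leftover comment.)
# Legend for the dimension estimates passed to scree_2:
# true -- where we expect the value to be (its axvline in scree_2 is currently commented out)
# d1 -- ZG (no hack), elbow 1
# d2 -- ZG (no hack), elbow 2
# d3 -- minka
# d4 -- ZG (hack), elbow 1
# d5 -- ZG (hack), elbow 2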

def scree_2(A, d1, d2, d3, d4, d5):
    """Draw a scree plot for ``A`` and mark five dimension estimates on it.

    Each of ``d1`` .. ``d5`` is collapsed to ``int(np.median(...))`` and drawn
    as a vertical line. When an estimate equals an earlier one, its line is
    placed 0.1 to the right of the most recent line with that value so the
    overlapping markers remain distinguishable.

    Parameters
    ----------
    A : passed straight to ``svd_wrapper`` to obtain the singular values.
    d1, d2 : estimates labelled ``ZG(1)`` and ``ZG(2)``.
    d3 : estimates labelled ``minka``.
    d4, d5 : estimates labelled ``ZG_hack(1)`` and ``ZG_hack(2)``.

    Returns
    -------
    None. The plot is drawn on a newly created 8x8 figure.
    """
    _, svals, _ = svd_wrapper(A)
    plt.figure(figsize=(8, 8))
    palette = sns.color_palette()
    scree_plot(svals, color=palette[4])

    ax = plt.gca()
    ax.set_xlim([.5, len(svals) + 1])
    ax.set_ylim([min(svals) - 2, max(svals) + 2])
    plt.title('Scree Plot')
    plt.xlabel('Principal Component')
    plt.ylabel('Eigenvalue')

    ds = [int(np.median(estimates)) for estimates in (d1, d2, d3, d4, d5)]

    # x-position of every line; latest_loc remembers where the most recent line
    # for a given value was put, so a repeat lands 0.1 to its right.
    locs = []
    latest_loc = {}
    for dim in ds:
        if dim in latest_loc:
            position = latest_loc[dim] + .1
        else:
            position = dim
        latest_loc[dim] = position
        locs.append(position)

    # plt.axvline(true, label='true dim:{}'.format(true), color=colors[3], linewidth = 8, alpha = .3, zorder =  0)
    # (label template, colour, line style) for d1 .. d5, in that order.
    line_specs = (
        ('ZG(1):{}', palette[0], '--'),
        ('ZG(2):{}', palette[0], ':'),
        ('minka:{}', palette[1], '--'),
        ('ZG_hack(1):{}', palette[2], '--'),
        ('ZG_hack(2):{}', palette[2], ':'),
    )
    for position, dim, (template, line_color, line_style) in zip(locs, ds, line_specs):
        plt.axvline(position, label=template.format(dim), color=line_color,
                    ls=line_style, zorder=1)
    plt.legend(bbox_to_anchor=(1, 1), loc='upper left')
    

In [8]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from ya_pca.linalg_utils import svd_wrapper
from ya_pca.viz import scree_plot
import random

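# Make a random array and then make it positive-definite
# (NOTE(review): nothing in this cell does that -- possibly a leftover comment.)
# NOTE(review): the legend below describes estimates that scree_basic does not take
# (it only receives A); it appears to be carried over from the scree_2 cell.
# true -- where we expect the value to be
# d1 -- ZG (no hack), elbow 1
# d2 -- ZG (no hack), elbow 2
# d3 -- minka
# d4 -- ZG (hack), elbow 1
# d5 -- ZG (hack), elbow 2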

def scree_basic(A):
    """Draw a plain scree plot for ``A`` on a new 8x8 figure.

    The singular values come from ``svd_wrapper(A)`` and are plotted with
    ``scree_plot``. The x-axis spans 0.5 to ``len(svals) + 1`` and the y-axis
    is padded by 2 beyond the smallest and largest singular value.

    Returns
    -------
    None.
    """
    _, svals, _ = svd_wrapper(A)
    plt.figure(figsize=(8, 8))
    scree_plot(svals, color=sns.color_palette()[5])

    ax = plt.gca()
    ax.set_xlim([.5, len(svals) + 1])
    ax.set_ylim([min(svals) - 2, max(svals) + 2])

    ax.set_xlabel('Principal Component')
    ax.set_ylabel('Eigenvalue')

In [9]:
from matplotlib import pyplot as plt
import os
# Accumulators filled by the benchmark loop below (one entry per OpenML task).
all_sims_dims = []
all_dims= []
all_data = []
all_ys = []
all_sims_colors = []
all_trues = []
dims_minka_all = []
dims_ZG_1_all = []
dims_ZG_2_all = []
dims_ZG_h_1_all = []
dims_ZG_h_2_all = []


def _first_two_elbows(elbows):
    """Return ``(first, second)`` from an elbow list; 0 stands in for a missing second elbow."""
    first = elbows[0]
    second = elbows[1] if len(elbows) > 1 else 0
    return first, second


for i, task_id in enumerate(benchmark_suite.tasks):  # iterate over all tasks
    task = openml.tasks.get_task(task_id)  # download the OpenML task
    A, y = task.get_X_and_y()  # get the data
#     if np.isnan(np.sum(A)) or A.shape[1] > 200:
#         continue
    print(i, A.shape)

    dims_minka = []
    dims_ZG_1 = []
    dims_ZG_2 = []
    dims_ZG_h_1 = []
    dims_ZG_h_2 = []
    for j in range(1):
        print(i, j)

        # Every estimator is best effort: on failure -1 is recorded so the lists
        # stay aligned across datasets. `Exception` (not a bare `except`) is
        # caught so that interrupting the kernel still stops the loop, and the
        # error is printed so the cause of a -1 is visible in the log.
        try:
            dims_minka.append(minka(A))
        except Exception as exc:
            print("minka", repr(exc))
            dims_minka.append(-1)

        # "full" ZG: select_dimension with n_components = n_features - 1.
        try:
            d = select_dimension(A, n_components = A.shape[1] - 1)
            # d[0] does not always hold two elbows; 0 is recorded when the second is missing.
            zg_first, zg_second = _first_two_elbows(d[0])
        except Exception as exc:
            print("full ZG", repr(exc))
            zg_first, zg_second = -1, -1
        dims_ZG_1.append(zg_first)
        dims_ZG_2.append(zg_second)

        # "hack" ZG: select_dimension with its default n_components.
        try:
            d_h = select_dimension(A)
            # d_h[0] does not always hold two elbows; 0 is recorded when the second is missing.
            zg_h_first, zg_h_second = _first_two_elbows(d_h[0])
        except Exception as exc:
            print("hack ZG", repr(exc))
            zg_h_first, zg_h_second = -1, -1
        dims_ZG_h_1.append(zg_h_first)
        dims_ZG_h_2.append(zg_h_second)

    # Freeze this dataset's estimates as arrays and file them under each estimator.
    dims_ZG_1 = np.asarray(dims_ZG_1)
    dims_ZG_1_all.append(dims_ZG_1)

    dims_minka = np.asarray(dims_minka)
    dims_minka_all.append(dims_minka)

    dims_ZG_2 = np.asarray(dims_ZG_2)
    dims_ZG_2_all.append(dims_ZG_2)

    dims_ZG_h_1 = np.asarray(dims_ZG_h_1)
    dims_ZG_h_1_all.append(dims_ZG_h_1)

    dims_ZG_h_2 = np.asarray(dims_ZG_h_2)
    dims_ZG_h_2_all.append(dims_ZG_h_2)

    # Per-dataset summary, ordered: ZG(1), ZG(2), minka, ZG_hack(1), ZG_hack(2).
    dims_ = [
        int(np.median(estimates))
        for estimates in (dims_ZG_1, dims_ZG_2, dims_minka, dims_ZG_h_1, dims_ZG_h_2)
    ]
    all_dims.append(dims_)
    all_data.append(A)
    all_ys.append(y)
    # scree_2(A,dims_ZG_1,dims_ZG_2 , dims_minka, dims_ZG_h_1, dims_ZG_h_2)
    #plt.title('Scree Plot: ' + str(i))

    # direc = r'D:\Hopkins\Hopkins_senior\Neurodata\ndd_prac\ndd_stuff\sprint2\scree_plts_no_dim_select'
    # full_dir = os.path.join(direc, 'fig' + str(i) + '.png')
    # plt.savefig(full_dir)
    # plt.show()

import pickle

# Persist the summaries together with the raw data and labels they were computed
# from. Note that all_data holds every downloaded dataset, so that file is large.
with open('surv_dims.pkl', 'wb') as f:
    pickle.dump(all_dims, f)

with open('all_data.pkl', 'wb') as f:
    pickle.dump(all_data, f)

with open('all_ys.pkl', 'wb') as f:
    pickle.dump(all_ys, f)

0 (3196, 36)
0 0
1 (20000, 16)
1 0
2 (625, 4)
2 0
3 (2000, 216)
3 0
4 (2000, 76)
4 0
5 (699, 9)
5 0
minka
6 (2000, 64)
6 0
7 (2000, 6)
7 0
8 (2000, 47)
8 0
9 (1473, 9)
9 0
10 (5620, 64)
10 0
11 (690, 15)
11 0
minka
12 (1000, 20)
12 0
13 (10992, 16)
13 0
14 (768, 8)
14 0
15 (4601, 57)
15 0
16 (3190, 60)
16 0
17 (958, 9)
17 0
18 (846, 18)
18 0
19 (45312, 8)
19 0
20 (6430, 36)
20 0
21 (736, 19)
21 0
minka
22 (3772, 29)
22 0
minka
23 (990, 12)
23 0
24 (7797, 617)
24 0
25 (841, 70)
25 0
26 (797, 4)
26 0
27 (70000, 784)
27 0
minka
28 (1458, 37)
28 0
minka
29 (1563, 37)
29 0
minka
30 (10885, 21)
30 0
minka
31 (522, 21)
31 0
32 (2109, 21)
32 0
33 (1109, 21)
33 0
34 (48842, 14)
34 0
minka
35 (3751, 1776)
35 0
minka
36 (569, 30)
36 0
37 (5404, 5)
37 0
38 (1055, 41)
38 0
39 (5456, 24)
39 0
40 (1593, 256)
40 0
41 (583, 10)
41 0
42 (2600, 500)
42 0
43 (34465, 118)
43 0
minka
44 (2534, 72)
44 0
45 (1080, 856)
45 0
minka
46 (6118, 51)
46 0
47 (1372, 4)
47 0
48 (748, 4)
48 0
49 (11055, 30)
49 0
50 (54

In [10]:
# Write each estimator's per-dataset estimates to its own pickle file.
for pkl_name, estimates in (
    ('dims_ZG_1_all.pkl', dims_ZG_1_all),
    ('dims_minka_all.pkl', dims_minka_all),
    ('dims_ZG_2_all.pkl', dims_ZG_2_all),
    ('dims_ZG_h_1_all.pkl', dims_ZG_h_1_all),
    ('dims_ZG_h_2_all.pkl', dims_ZG_h_2_all),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(estimates, f)
    



In [13]:
# Number of datasets collected in all_data (the benchmark loop appends one per task).
len(all_data)

72