In [39]:
import os, copy, sys
import numpy as np
import pandas as pd
from typing import List, Tuple
import matplotlib.pyplot as plt
from scipy import stats

In [40]:
# Run this cell to drop any cached `pool_alice_code` entry from `sys.modules` and
# import it again, so the most recent changes to the module are picked up.
# The `None` default makes the pop a no-op when the module was never imported.
sys.modules.pop("pool_alice_code",None)
import pool_alice_code

## Load data

In [41]:
# Show the working directory; the relative '../data' and '../results' paths
# used by other cells in this notebook are resolved against it.
os.getcwd()

'/home/ethan/02750-automation/automation_final_project/P_ALICE'

In [42]:
# Load the saved feature matrix and target vector.
X: np.ndarray = np.load('../data/abalone_age/X.npy')
y: np.ndarray = np.load('../data/abalone_age/y.npy')

# Labels applied to the columns of X when it is wrapped in a DataFrame.
columns = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Whole_weight',
    'Shucked_weight',
    'Viscera_weight',
    'Shell_weight',
]
X_df = pd.DataFrame(X, columns=columns)

# Width of the raw feature matrix.
num_original_features = X.shape[1]

In [43]:
# (rows, columns) of the feature DataFrame built from X.
X_df.shape

(4177, 8)

# constants

In [44]:
# Number of pool rows drawn per random batch.
BATCH_SIZE = 3
# Seed NumPy's global RNG so the np.random.choice draws in this notebook are repeatable
# when the cells are run in order from a fresh kernel.
np.random.seed(1234)

# P-ALICE, weighted linear regression

In [45]:
# Append a constant column of ones to the feature matrix
# (presumably the intercept term of the linear model -- confirm).
X_pool_linear = np.column_stack((X, np.ones(len(X))))
X_pool_linear

array([[0.    , 0.455 , 0.365 , ..., 0.101 , 0.15  , 1.    ],
       [0.    , 0.35  , 0.265 , ..., 0.0485, 0.07  , 1.    ],
       [1.    , 0.53  , 0.42  , ..., 0.1415, 0.21  , 1.    ],
       ...,
       [0.    , 0.6   , 0.475 , ..., 0.2875, 0.308 , 1.    ],
       [1.    , 0.625 , 0.485 , ..., 0.261 , 0.296 , 1.    ],
       [0.    , 0.71  , 0.555 , ..., 0.3765, 0.495 , 1.    ]])

In [46]:
# Row positions of every sample in the pool, 0 .. n_rows - 1.
idxs_samples = list(range(X_pool_linear.shape[0]))
# Display the last index as a quick check of the pool size.
idxs_samples[-1]

4176

In [47]:
# Draw BATCH_SIZE distinct row indices (replace=False) from the pool using the
# global NumPy RNG, then pull the corresponding rows out of the pool matrix.
idxs_random_batch = np.random.choice(idxs_samples,size=BATCH_SIZE,replace=False)
random_batch = X_pool_linear[idxs_random_batch]
# Displayed as (rows drawn, columns of X_pool_linear).
random_batch.shape

(3, 9)

In [48]:
def get_batch_correlation_v1(batch: np.ndarray)->np.float64:
    """Mean pairwise Spearman rank correlation between the rows of ``batch``.

    Each row of ``batch`` is treated as one variable and each column as one
    observation (``axis=1``). Self-correlations on the diagonal are masked
    with NaN so that only correlations between distinct rows are averaged.

    Parameters
    ----------
    batch : np.ndarray
        2-D array with at least two rows.

    Returns
    -------
    np.float64
        Mean of the off-diagonal Spearman coefficients, ignoring NaN entries.

    Raises
    ------
    ValueError
        If ``batch`` is not 2-D or has fewer than two rows.
    """
    n, _ = batch.shape
    if n < 2:
        raise ValueError('batch must contain at least two rows to correlate')

    # Element 0 of the spearmanr result is the correlation statistic; np.array
    # makes a private float copy that is safe to modify below.
    spearman_coefs = np.array(stats.spearmanr(a=batch, axis=1)[0], dtype=float)

    if spearman_coefs.ndim == 0:
        # With exactly two variables scipy returns a scalar instead of a 2x2
        # matrix; rebuild the matrix so the code below handles both cases.
        rho = float(spearman_coefs)
        spearman_coefs = np.array([[1.0, rho], [rho, 1.0]])

    assert spearman_coefs.shape == (n, n), 'NOT computing sample correlations!'

    # make the diagonal np.nan so it will not contribute to the mean
    np.fill_diagonal(spearman_coefs, np.nan, wrap=False)

    return np.nanmean(spearman_coefs)

def get_batch_correlation_v2(batch: np.ndarray, verbose: bool = True)->np.float64:
    """Mean pairwise Spearman rank correlation between the rows of ``batch``.

    Each row of ``batch`` is treated as one variable and each column as one
    observation (``axis=1``). Only the strict upper triangle of the
    correlation matrix is averaged, so every pair of distinct rows is counted
    once and the diagonal is excluded.

    Parameters
    ----------
    batch : np.ndarray
        2-D array with at least two rows.
    verbose : bool, default True
        When True, print the full coefficient matrix and the upper-triangle
        values that are averaged.

    Returns
    -------
    np.float64
        Mean of the upper-triangle Spearman coefficients, ignoring NaN entries.

    Raises
    ------
    ValueError
        If ``batch`` is not 2-D or has fewer than two rows.
    """
    n, _ = batch.shape
    if n < 2:
        raise ValueError('batch must contain at least two rows to correlate')

    # Element 0 of the spearmanr result is the correlation statistic.
    spearman_coefs = np.array(stats.spearmanr(a=batch, axis=1)[0], dtype=float)

    if spearman_coefs.ndim == 0:
        # With exactly two variables scipy returns a scalar instead of a 2x2
        # matrix; rebuild the matrix so the code below handles both cases.
        rho = float(spearman_coefs)
        spearman_coefs = np.array([[1.0, rho], [rho, 1.0]])

    assert spearman_coefs.shape == (n, n), 'NOT computing sample correlations!'

    # k=1 selects the entries strictly above the diagonal.
    iu1 = np.triu_indices(n=n, k=1)
    if verbose:
        print(f"spearmean_coefs=\n{spearman_coefs}")
        print(f"spearmean_coefs[iu1]=\n{spearman_coefs[iu1]}")

    return np.nanmean(spearman_coefs[iu1])

def get_batch_correlation_v3(batch: np.ndarray, verbose: bool = True)->np.float64:
    """Mean pairwise Pearson correlation between the rows of ``batch``.

    ``np.corrcoef`` is called with its default orientation, so each row of
    ``batch`` is one variable. Only the strict upper triangle of the
    correlation matrix is averaged, so every pair of distinct rows is counted
    once and the diagonal is excluded.

    Parameters
    ----------
    batch : np.ndarray
        2-D array with at least two rows.
    verbose : bool, default True
        When True, print the full coefficient matrix and the upper-triangle
        values that are averaged.

    Returns
    -------
    np.float64
        Mean of the upper-triangle Pearson coefficients. NaN coefficients are
        not filtered here (plain ``np.mean``), so they propagate to the result.
    """
    pearson_coefs = np.corrcoef(batch)
    n, _ = batch.shape

    # k=1 selects the entries strictly above the diagonal.
    iu1 = np.triu_indices(n=n, k=1)
    assert pearson_coefs.shape == (n, n), 'NOT computing sample correlations!'
    if verbose:
        print(f"pearson_coefs=\n{pearson_coefs}")
        print(f"pearson_coefs[iu1]=\n{pearson_coefs[iu1]}")

    return np.mean(pearson_coefs[iu1])

In [49]:
# Compare the three batch-correlation variants on the same random batch.
# v2 and v3 print their coefficient matrices while they run, so that output
# appears before the corresponding summary line below.
print(f"get_batch_correlation_v1(random_batch)={get_batch_correlation_v1(random_batch)}\n")
print(f"get_batch_correlation_v2(random_batch)={get_batch_correlation_v2(random_batch)}\n")
print(f"get_batch_correlation_v3(random_batch)={get_batch_correlation_v3(random_batch)}")

get_batch_correlation_v1(random_batch)=0.5222222222222223

spearmean_coefs=
[[1.         0.9        0.38333333]
 [0.9        1.         0.28333333]
 [0.38333333 0.28333333 1.        ]]
spearmean_coefs[iu1]=
[0.9        0.38333333 0.28333333]
get_batch_correlation_v2(random_batch)=0.5222222222222223

pearson_coefs=
[[ 1.          0.925346    0.1636757 ]
 [ 0.925346    1.         -0.01536752]
 [ 0.1636757  -0.01536752  1.        ]]
pearson_coefs[iu1]=
[ 0.925346    0.1636757  -0.01536752]
get_batch_correlation_v3(random_batch)=0.357884724448785


In [50]:
# Load a previously saved results array and inspect its dimensions.
# NOTE(review): the meaning of the two axes is not visible in this notebook section -- confirm
# against the code that wrote the file.
dummy = np.load("../results/abalone_age_linear_palice_16.npy")
dummy.shape

(5, 63)

In [51]:
# Repeat the comparison with a larger batch.
# NOTE(review): this reassigns the notebook-level BATCH_SIZE (set to 3 in the constants cell)
# and overwrites idxs_random_batch / random_batch, so re-running earlier cells after this one
# will see the new values.
BATCH_SIZE = 8
# Draw BATCH_SIZE distinct row indices from the pool using the global NumPy RNG.
idxs_random_batch = np.random.choice(idxs_samples,size=BATCH_SIZE,replace=False)
random_batch = X_pool_linear[idxs_random_batch]
print(f"random_batch.shape={random_batch.shape}")
# v2 and v3 print their coefficient matrices while they run, so that output
# appears before the corresponding summary line.
print(f"get_batch_correlation_v1(random_batch)={get_batch_correlation_v1(random_batch)}\n")
print(f"get_batch_correlation_v2(random_batch)={get_batch_correlation_v2(random_batch)}\n")
print(f"get_batch_correlation_v3(random_batch)={get_batch_correlation_v3(random_batch)}")

random_batch.shape=(8, 9)
get_batch_correlation_v1(random_batch)=0.7940611861727136

spearmean_coefs=
[[1.         1.         0.55230609 0.91214188 0.74477639 0.98319328
  0.93277311 0.92887842]
 [1.         1.         0.55230609 0.91214188 0.74477639 0.98319328
  0.93277311 0.92887842]
 [0.55230609 0.55230609 1.         0.38333333 0.3        0.51883299
  0.46862335 0.4       ]
 [0.91214188 0.91214188 0.38333333 1.         0.93333333 0.87866878
  0.97908807 0.98333333]
 [0.74477639 0.74477639 0.3        0.93333333 1.         0.69456675
  0.89540533 0.9       ]
 [0.98319328 0.98319328 0.51883299 0.87866878 0.69456675 1.
  0.89915966 0.89540533]
 [0.93277311 0.93277311 0.46862335 0.97908807 0.89540533 0.89915966
  1.         0.99582462]
 [0.92887842 0.92887842 0.4        0.98333333 0.9        0.89540533
  0.99582462 1.        ]]
spearmean_coefs[iu1]=
[1.         0.55230609 0.91214188 0.74477639 0.98319328 0.93277311
 0.92887842 0.55230609 0.91214188 0.74477639 0.98319328 0.93277311
 0.92