In [1]:
import pandas as pd
import numpy as np
import glob
from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
from sklearn.model_selection import KFold

from joblib import Parallel, delayed


In [2]:
def predict(X_train, X_val, model):

    nr_train_embeddings, dim = X_train.shape
    
    # normalize the embeddings in case they weren't already
    X_train = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0)
    X_val = (X_val - X_val.mean(axis=0)) / X_val.std(axis=0)
    

        
    scores = []
        
    for i in range(dim):
        
        y_i_train = X_train[:,i]
        X_i_train = np.delete(X_train, i, 1)
    
        
        y_i_val = X_val[:,i]
        X_i_val = np.delete(X_val, i, 1)
        
        
        if model == 'linear':
            clf = SGDRegressor()
        elif model == 'xgb':
            clf = xgb.XGBRegressor() # tree_method='gpu_hist', gpu_id=0        
        elif model == 'mlp':
            clf = MLPRegressor()    
            
        clf.fit(X_i_train, y_i_train)
    
        scores.append(mean_squared_error(y_i_val, clf.predict(X_i_val)))
        
    return np.mean(scores), np.std(scores)
        


In [7]:
# find all folders in the 'correlation_analysis' folder
# for every folder we extract the path, the model name, the wandb run id and the number and end up creating a dataframe
all_folders = glob.glob('correlation_analysis/*')

all_data = pd.DataFrame(columns=['path', 'model_name', 'dataset_name', 'data_split', 'wandb_run_id'])

for folder in all_folders:
    path = Path(folder) #/ Path('data_standardized.csv')
    model_name = path.name 
    model_name = model_name.split('-')[0]
    dataset_name = path.name.split('-')[1].split('_')[0]
    data_split = path.name.split('_')[-1]
    wandb_run_id = path.name.split('_')[-2]
    all_data.loc[len(all_data)] = [path, model_name, dataset_name, data_split, wandb_run_id]
    

all_data

Unnamed: 0,path,model_name,dataset_name,data_split,wandb_run_id
0,correlation_analysis/vicreg-imagenet100_pxqwg3...,vicreg,imagenet100,train,pxqwg3ex
1,correlation_analysis/vicreg-imagenet100_pxqwg3...,vicreg,imagenet100,val,pxqwg3ex
2,correlation_analysis/vicreg-cifar100_000hudwm_...,vicreg,cifar100,train,000hudwm
3,correlation_analysis/vicreg-cifar100_000hudwm_val,vicreg,cifar100,val,000hudwm
4,correlation_analysis/vicreg-cifar100_31yfnww4_...,vicreg,cifar100,train,31yfnww4
...,...,...,...,...,...
71,correlation_analysis/simclr-cifar100_3kh01qs8_val,simclr,cifar100,val,3kh01qs8
72,correlation_analysis/simclr-cifar10_y6674tsl_t...,simclr,cifar10,train,y6674tsl
73,correlation_analysis/simclr-cifar10_y6674tsl_val,simclr,cifar10,val,y6674tsl
74,correlation_analysis/simclr-cifar100_5dgztcwi_...,simclr,cifar100,train,5dgztcwi


In [8]:
# for every entry in the dataframe we load the dataframe and calculate the correlation sum
for index, row in all_data.iterrows():
    final_path = row['path']/Path('data_standardized.csv')
    print(final_path)
    X = pd.read_csv(final_path)
    print(X.shape)
    X = X.to_numpy()

    kf = KFold(n_splits=2)
    print(row['model_name'], row['dataset_name'])
    for i, (train, test) in enumerate(kf.split(X)):
        for model in ['linear', 'xgb','mlp']:
            mean, std = predict(X[train], X[test], model=model)
            all_data.loc[index, f'mean_{model}_{i}'] = mean
            all_data.loc[index, f'std_{model}_{i}'] = std


all_data.to_csv("all_results.csv")

correlation_analysis/vicreg-imagenet100_pxqwg3ex_train/data_standardized.csv
(126591, 512)
vicreg imagenet100


KeyboardInterrupt: 

In [9]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import KFold

def worker_function(args):
    index, row = args
    final_path = row['path']/Path('data_standardized.csv')
    X = pd.read_csv(final_path)
    X = X.to_numpy()

    kf = KFold(n_splits=2)
    results = {}
    for i, (train, test) in enumerate(kf.split(X)):
        for model in ['linear', 'xgb', 'mlp']:
            mean, std = predict(X[train], X[test], model=model)  # Assuming 'predict' function is defined elsewhere
            results[f'mean_{model}_{i}'] = mean
            results[f'std_{model}_{i}'] = std

    return index, results


# Parallelize using Pool
results = Parallel(n_jobs=-1)(delayed(worker_function)(row) for _, row in all_data.iterrows())

# Update the dataframe with the results
for index, result in results:
    for key, value in result.items():
        all_data.loc[index, key] = value

all_data.to_csv("all_results.csv")


KeyboardInterrupt: 

In [None]:
all_data