# Experiments

## Colab setup

In [1]:
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

1.11.0+cu113
[K     |████████████████████████████████| 7.9 MB 3.0 MB/s 
[K     |████████████████████████████████| 3.5 MB 2.4 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


In [2]:
!pip install ogb
!pip install grandiso

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ogb
  Downloading ogb-1.3.3-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 3.4 MB/s 
Collecting outdated>=0.2.0
  Downloading outdated-0.2.1-py3-none-any.whl (7.5 kB)
Collecting littleutils
  Downloading littleutils-0.2.2.tar.gz (6.6 kB)
Building wheels for collected packages: littleutils
  Building wheel for littleutils (setup.py) ... [?25l[?25hdone
  Created wheel for littleutils: filename=littleutils-0.2.2-py3-none-any.whl size=7048 sha256=7fdbbbb6a66f0a35c546555d3775437e416c645aa6dbf943fe440910ae34800e
  Stored in directory: /root/.cache/pip/wheels/d6/64/cd/32819b511a488e4993f2fab909a95330289c3f4e0f6ef4676d
Successfully built littleutils
Installing collected packages: littleutils, outdated, ogb
Successfully installed littleutils-0.2.2 ogb-1.3.3 outdated-0.2.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/pub

Loading drive:

In [182]:
from google.colab import drive
drive.mount('/content/drive')
# Mount the current directory
%cd /content/drive/My\ Drive//CS159_project/Graph_homomorphism/graph_homomorphism

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/CS159_project/Graph_homomorphism/graph_homomorphism


### imports

In [168]:
import torch
from torch_geometric.data import Data
from torch_geometric.transforms import BaseTransform
from torch_geometric.datasets import TUDataset, ZINC
from ogb.graphproppred import PygGraphPropPredDataset
import torch_geometric.utils as uts
from torch_geometric.utils import remove_self_loops, to_undirected

import numpy as np
import matplotlib.pyplot as plt
import itertools
from tqdm import tqdm

In [183]:
# Graph utilities
import networkx as nx
import lib.graph_encoding.encoding as encoding 


In [184]:
from importlib import reload 

encoding = reload(encoding)


In [116]:
# sklearn imports
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

## Helper functions

In [191]:
# set up embedding 

def add_testgraphs(encoded_data, limit_vertex = None,
            n_trees = 4, limit_trees = 10000,
            n_cycles = 4,limit_cycles = 10000, 
            n_cliques = 3, limit_cliques = 100):
  
    encoded_data.clear_all_testgraphs()
    encoded_data.add_single_vertex(limit = limit_vertex)
    encoded_data.add_trees(stop = n_trees, limit = limit_trees)
    encoded_data.add_cycles(stop = n_cycles, limit = limit_cycles)
    encoded_data.add_cliques(stop = n_cliques, limit = limit_cliques)
  

  

helper functions for model fitting:

In [35]:
# calculate fit and plot scores

def calculate_single_split_score(clf, X, y, cv_num, 
                                 scoring='accuracy', test_size=0.25):
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                      test_size=test_size, random_state=42)
    clf.fit(X_train, y_train)

    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)

    return {'train_score' : train_score,'test_score': test_score}
    

def calculate_cv_scores(clf, X, y, cv_num, scoring='accuracy'):
  cv = cv_num
  scores = cross_val_score(clf, X, y, cv=cv_num, scoring = scoring)

  return scores

def plot_cv_scores(scores, clf_name, cv_num):
  width = 0.35
  labels = [f'G{n}' for n in range(1,cv_num+1)]
  fig = plt.figure()
  ax = fig.add_subplot()
  ax.bar(labels, scores, width)
  ax.set_ylabel('Scores')
  ax.set_title('Cross validation scores for '+clf_name)
  plt.axhline(y = scores.mean(), c = 'black', linewidth = 0.7, 
            label = f'Err = {scores.mean():.2f}' + u"\u00B1" + f'{scores.std():.2f}')
  ax.legend()

  plt.show()
  print(f'Validation error = {scores.mean():.2f}' + u"\u00B1" + f'{scores.std():.2f}')

## Experiments: graph classification tasks

### First experiment: MUTAG:

In [92]:
# load the data:

dataset = TUDataset(root='data/TUDataset', name='MUTAG')

print()
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('=============================================================')


Dataset: MUTAG(188):
Number of graphs: 188
Number of features: 7
Number of classes: 2

Data(edge_index=[2, 38], x=[17, 7], edge_attr=[38, 4], y=[1])


In [100]:
# pre-processing MUTAG
Encoded_Dataset = [encoding.grandEmbedding(data) for data in tqdm(dataset)]

100%|██████████| 188/188 [00:00<00:00, 19927.95it/s]


In [101]:
f = [add_testgraphs(data) for data in Encoded_Dataset]

In [102]:
f = [add_testgraphs(data) for data in Encoded_Dataset]
# get representation MUTAG
# labels
y = np.array([data.pyg_graph().y.detach().numpy() for data in Encoded_Dataset])
# vectors
#X = np.array([data.ghc_encoder(format = 'numpy')  for data in tqdm(Encoded_Dataset)])
%timeit Encoded_Dataset[0].tensor_v_encoder(format = 'Torch')

10 loops, best of 5: 31 ms per loop


In [None]:
y = y.reshape(188)

### Experiment: "ogbg-molhiv" 

In [None]:
# setup the provided node encoder
from ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder
emb_dim = 10
atom_encoder = AtomEncoder(emb_dim)
bond_encoder = BondEncoder(emb_dim)

class atom_transform(BaseTransform):
  def __call__(self, data):
    newdata = data.clone()
    newdata.x = atom_encoder(data.x)
    return newdata

transform = atom_transform()

In [None]:
# load the data 
dataset = PygGraphPropPredDataset(name = "ogbg-molhiv", root = 'dataset/', transform = transform)

print()
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[7]  # Get the first graph object.

print()
print(data)
print('=============================================================')


Dataset: PygGraphPropPredDataset(41127):
Number of graphs: 41127
Number of features: 10
Number of classes: 2

Data(edge_index=[2, 38], edge_attr=[38, 3], x=[18, 10], y=[1, 1], num_nodes=18)


In [None]:
f'{dataset}'

'PygGraphPropPredDataset(41127)'

In [None]:
encoded_dataset = [encoding.grandEmbedding(data) for data in tqdm(dataset)]

In [None]:
def single_graph_data(i, encoder, file_name,
                      n_cliques, n_cycles, n_trees):
  add_graphs = lambda x: add_testgraphs(encoded_data = x,                                           n_trees= n_trees, limit_trees= 10000,                                              n_cycles=n_cycles, limit_cycles = 10000,
                                      n_cliques= n_cliques, limit_cliques=100)
  add_to_Dataset = [add_graphs(data) for data in tqdm(encoded_dataset)]
  # labels
  y = np.array([data.pyg_graph().y[0,0].detach().numpy() for data in tqdm(encoded_dataset)])
  #vectors
  X = np.array([encoder(data)  for data in tqdm(encoded_dataset)])
  nums = np.array([n_cliques, n_cycles, n_trees])
  np.save(file_name +f'{i}_X.npy', X )
  np.save(file_name + f'{i}_y.npy', y )
  np.save(file_name +f'{i}_nums.npy', nums)


def gather_graph_data(encoder, file_name, cliques_limit = 5, 
                      cycles_limit = 6, trees_limit = 6):
  i = 0
  for n_cliques in range(4,cliques_limit):
    for n_cycles in range(3,cycles_limit):
        for n_trees in range(2, trees_limit):

          add_graphs = lambda x: add_testgraphs(encoded_data = x, 
                                                n_trees= n_trees, limit_trees= 10000,
                                                n_cycles=n_cycles, limit_cycles = 10000,
                                                n_cliques= n_cliques, limit_cliques=100)
          add_to_Dataset = [add_graphs(data) for data in tqdm(encoded_dataset)]
          # labels
          y = np.array([data.pyg_graph().y[0,0].detach().numpy() for data in tqdm(encoded_dataset)])
          #vectors
          X = np.array([encoder(data)  for data in tqdm(encoded_dataset)])
          nums = np.array([n_cliques, n_cycles, n_trees])
          np.save(file_name +f'{i}_X.npy', X )
          np.save(file_name + f'{i}_y.npy', y )
          np.save(file_name +f'{i}_nums.npy', nums)
          i+=1   


In [None]:
# GHC:
file_name = 'Experiments/ogbg-molhiv/GHC_encoded_data/experiment_'
encdoer = lambda x: x.ghc_encoder(format = 'numpy')
gather_graph_data(encdoer, file_name)

In [None]:
# GHC with augmentation:
file_name = 'Experiments/ogbg-molhiv/ghc_aug/experiment_'
ghc = lambda x: x.ghc_encoder(format = 'numpy')
num_enc = lambda x: x.num_encoder(format = 'numpy')
encoder = lambda x: np.concatenate((ghc(x),num_enc(x)), axis = 0)
gather_graph_data(encdoer, file_name)

In [None]:
# Lagrangian with augmentation:
file_name = 'Experiments/ogbg-molhiv/lagrangian_aug/experiment_'
raw_encoder = lambda x: x.lagrangian_encoder(format = 'numpy')
num_enc = lambda x: x.num_encoder(format = 'numpy')
encoder = lambda x: np.concatenate((raw_encoder(x),num_enc(x)), axis = 0)
single_graph_data(800, encoder, file_name,
                  n_cliques = 4, n_cycles = 5, n_trees = 2)
#gather_graph_data(encdoer, file_name)

100%|██████████| 41127/41127 [00:02<00:00, 16755.95it/s]
100%|██████████| 41127/41127 [00:00<00:00, 46463.91it/s]
100%|██████████| 41127/41127 [06:54<00:00, 99.15it/s]


#### Evaluation

In [None]:
def set_up_experiment(X_train, y_train, X_valid, y_valid):
    C_array = [10**3, 10**2, 10** 1, 10**0, 10**-1, 10**-2, 10**-3]

    max_score = 0
    C_max = 0
    for C in C_array:
        clf = make_pipeline(StandardScaler(), SVC(kernel='rbf', C = 1, probability = True, random_state=42))
        # fit model
        clf.fit(X_train, y_train)
        # Calculate predictions 
        y_valid_pred = clf.predict_proba(X_valid)

        y_pred_valid = y_valid_pred[:,1].reshape(4113,1)
        y_true_valid = y_valid.reshape(4113,1)

        input_valid_dict = {'y_true': y_true_valid, 'y_pred': y_pred_valid}
        valid_score = evaluator.eval(input_valid_dict)['rocauc']

        if valid_score > max_score:
            max_score = valid_score
            C_max = C
        return {'max_score': max_score, 'C_max' : C_max}

In [None]:
from ogb.graphproppred import Evaluator

evaluator = Evaluator(name = 'ogbg-molhiv')
print(evaluator.expected_input_format) 
print(evaluator.expected_output_format)  

==== Expected input format of Evaluator for ogbg-molhiv
{'y_true': y_true, 'y_pred': y_pred}
- y_true: numpy ndarray or torch tensor of shape (num_graph, num_task)
- y_pred: numpy ndarray or torch tensor of shape (num_graph, num_task)
where y_pred stores score values (for computing AUC score),
num_task is 1, and each row corresponds to one graph.
nan values in y_true are ignored during evaluation.

==== Expected output format of Evaluator for ogbg-molhiv
{'rocauc': rocauc}
- rocauc (float): ROC-AUC score averaged across 1 task(s)



In [None]:
#usual GHC
X_list = [np.load(f'Experiments/ogbg-molhiv/GHC_encoded_data/experiment_{i}_X.npy') for i in range(12)]
y_list = [np.load(f'Experiments/ogbg-molhiv/GHC_encoded_data/experiment_{i}_y.npy') for i in range(12)]
nums_list = [np.load(f'Experiments/ogbg-molhiv/GHC_encoded_data/experiment_{i}_nums.npy') for i in range(12)]


In [None]:
# GHC augmented
X_list = [np.load(f'Experiments/ogbg-molhiv/ghc_aug/experiment_{i}_X.npy') for i in range(12)]
y_list = [np.load(f'Experiments/ogbg-molhiv/ghc_aug/experiment_{i}_y.npy') for i in range(12)]
nums_list = [np.load(f'Experiments/ogbg-molhiv/ghc_aug/experiment_{i}_nums.npy') for i in range(12)]


In [None]:
X_list[0].shape, len(y_list), len(nums_list)

((41127, 11), 12, 12)

In [None]:
# Using the given train-test split

split_idx = dataset.get_idx_split()

train_idx = split_idx["train"]
valid_idx = split_idx["valid"]
test_idx  = split_idx["test"]

In [None]:
list(map(lambda x: x.shape, X_list))

[(41127, 11),
 (41127, 22),
 (41127, 33),
 (41127, 55),
 (41127, 22),
 (41127, 33),
 (41127, 44),
 (41127, 66),
 (41127, 33),
 (41127, 44),
 (41127, 55),
 (41127, 77)]

In [None]:
nums_list[11]

array([4, 5, 5])

In [None]:
def calculate_score_SVM(i):
  X = X_list[i]
  y = y_list[i]

  X_train , y_train = X[train_idx], y[train_idx]
  X_valid , y_valid = X[valid_idx], y[valid_idx]
  X_test , y_test = X[test_idx], y[test_idx]

  result_dict = set_up_experiment(X_train, y_train, X_valid, y_valid)

  best_val_score = result_dict['max_score']
  C_max = result_dict['C_max']

  clf = make_pipeline(StandardScaler(), SVC(kernel='rbf', C = C_max, probability = True, random_state=42))
  # fit model
  clf.fit(X_train, y_train)

  # calculate test auroc
  y_test_pred = clf.predict_proba(X_test)
  y_pred_test = y_test_pred[:,1].reshape(4113,1)
  y_true_test = y_test.reshape(4113,1)

  input_test_dict = {'y_true': y_true_test, 'y_pred': y_pred_test}
  test_score = evaluator.eval(input_test_dict)['rocauc']

  # calculate train auroc
  y_train_pred = clf.predict_proba(X_train)
  y_pred_train = y_train_pred[:,1].reshape(32901,1)
  y_true_train = y_train.reshape(32901,1)
  
  input_train_dict = {'y_true': y_true_train, 'y_pred': y_pred_train}
  train_score = evaluator.eval(input_train_dict)['rocauc']

  return [train_score, test_score, best_val_score, C_max]


In [None]:
for i in tqdm(range(12)): 
  svm_data = calculate_score_SVM(i)
  svm_data_array = np.array(svm_data)
  np.save(f'Experiments/ogbg-molhiv/svm_data_{i}.npy', svm_data_array)

100%|██████████| 12/12 [3:50:44<00:00, 1153.69s/it]


In [None]:
# load single experiment Lag
X = np.load(f'Experiments/ogbg-molhiv/lagrangian_aug/experiment_800_X.npy')
y = np.load(f'Experiments/ogbg-molhiv/lagrangian_aug/experiment_800_y.npy')
nums_list = np.load(f'Experiments/ogbg-molhiv/lagrangian_aug/experiment_800_nums.npy')

In [None]:
X.shape, y.shape, nums_list

((41127, 33), (41127,), array([4, 5, 2]))

#### Random forest eval:

In [None]:
# try PCA before evaluation
from sklearn.decomposition import PCA, IncrementalPCA

One random forest measuremnt:

> Indented block



In [None]:
def molhiv_calculate_score_single_forest(X, y):
  X_train , y_train = X[train_idx], y[train_idx]
  X_valid , y_valid = X[valid_idx], y[valid_idx]
  X_test , y_test = X[test_idx], y[test_idx]

  # preprocess pca
  if i > 1:
    pca = PCA(n_components = 20)
  else:
    pca = PCA()
  pca.fit(X_train, y_train)

  X_train_new = pca.transform(X_train)
  X_valid_new = pca.transform(X_valid)
  X_test_new = pca.transform(X_test)

  clf = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42))
  # fit model
  clf.fit(X_train_new, y_train)
  
  # calculate test auroc
  y_test_pred = clf.predict_proba(X_test_new)
  y_pred_test = y_test_pred[:,1].reshape(4113,1)
  y_true_test = y_test.reshape(4113,1)

  input_test_dict = {'y_true': y_true_test, 'y_pred': y_pred_test}
  test_score = evaluator.eval(input_test_dict)['rocauc']

  # calculate valid auroc
  y_valid_pred = clf.predict_proba(X_valid_new)
  y_pred_valid = y_valid_pred[:,1].reshape(4113,1)
  y_true_valid = y_valid.reshape(4113,1)

  input_valid_dict = {'y_true': y_true_valid, 'y_pred': y_pred_valid}
  valid_score = evaluator.eval(input_valid_dict)['rocauc']

  # calculate train auroc
  y_train_pred = clf.predict_proba(X_train_new)
  y_pred_train = y_train_pred[:,1].reshape(32901,1)
  y_true_train = y_train.reshape(32901,1)
  
  input_train_dict = {'y_true': y_true_train, 'y_pred': y_pred_train}
  train_score = evaluator.eval(input_train_dict)['rocauc']

  return [train_score, valid_score, test_score]

In [None]:
def molhiv_calculate_score_random_forest(i):
  X = X_list[i]
  y = y_list[i]

  return molhiv_calculate_score_single_forest(X,y)

In [None]:
# run random forest
for i in tqdm(range(12)): 
  random_forest_data = molhiv_calculate_score_random_forest(i)
  random_forest_data_array = np.array(random_forest_data)
  np.save(f'Experiments/ogbg-molhiv/random_forest_scores_{i}.npy', random_forest_data_array)

100%|██████████| 12/12 [04:23<00:00, 21.93s/it]


### Experiment: NCI-1

In [151]:
class nci1_transform(BaseTransform):
   def __call__(self, data):
     new_data = data.clone()
     w = torch.rand(37)
     new_data.x = torch.unsqueeze(torch.tensordot(data.x, w,  dims=([1], [0])),1)
     return new_data
     
transform = nci1_transform()


In [185]:
# load the data:

dataset = TUDataset(root='data/TUDataset', name='NCI1')#, transform=transform)

print()
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('=============================================================')


Dataset: NCI1(4110):
Number of graphs: 4110
Number of features: 37
Number of classes: 2

Data(edge_index=[2, 42], x=[21, 37], y=[1])


In [192]:
encoded_dataset = [encoding.grandEmbedding(data) for data in tqdm(dataset)]

  0%|          | 4/4110 [07:28<128:00:10, 112.23s/it]
100%|██████████| 4110/4110 [00:00<00:00, 13226.70it/s]


In [193]:
encoded_dataset[0].pyg_graph().x[0]

tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.])

In [194]:
f = [add_testgraphs(data) for data in encoded_dataset]

In [200]:
[x.num_nodes() for x in encoded_dataset[0].testgraphs]

[3, 3, 1, 2]

In [201]:
# get representation MUTAG
# labels
y = np.array([data.pyg_graph().y.detach().numpy() for data in encoded_dataset])
# vectors
#X = [data.tensor_v_encoder()  for data in tqdm(encoded_dataset)]
[i.size() for i in encoded_dataset[0].tensor_v_encoder(format = 'Torch')]

[torch.Size([37]), torch.Size([9139]), torch.Size([37]), torch.Size([703])]

In [None]:
def TU_graph_data(i, encoder, file_name,
                      n_cliques, n_cycles, n_trees):
  add_graphs = lambda x: add_testgraphs(encoded_data = x,                                           n_trees= n_trees, limit_trees= 10000,                                              n_cycles=n_cycles, limit_cycles = 10000,
                                      n_cliques= n_cliques, limit_cliques=100)
  add_to_Dataset = [add_graphs(data) for data in tqdm(encoded_dataset)]
  # labels
  y = np.array([data.pyg_graph().y.detach().numpy() for data in tqdm(encoded_dataset)])
  #vectors
  X = np.array([encoder(data)  for data in tqdm(encoded_dataset)])
  nums = np.array([n_cliques, n_cycles, n_trees])
  np.save(file_name +f'{i}_X.npy', X )
  np.save(file_name + f'{i}_y.npy', y )
  np.save(file_name +f'{i}_nums.npy', nums)

In [None]:
# ghc gather data
file_name = 'Experiments/NCI1/ghc/experiment_'
pure_encoder = lambda x: x.ghc_encoder(format = 'numpy')
#num_enc = lambda x: x.num_encoder(format = 'numpy')
#encoder = lambda x: np.concatenate((pure_encoder(x),num_enc(x)), axis = 0)
encoder = pure_encoder

i = 0
for n_cliques in range(4,6):
  for n_cycles in range(3,8):
      for n_trees in range(2, 10):
        TU_graph_data(i, encoder, file_name,
                 n_cliques = n_cliques, n_cycles = n_cycles, n_trees = n_trees)
        i+=1

In [None]:
# Lagrangian gather data:
file_name = 'Experiments/NCI1/lagrangian_aug/experiment_'
pure_encoder = lambda x: x.lagrangian_encoder(format = 'numpy')
num_enc = lambda x: x.num_encoder(format = 'numpy')
encoder = lambda x: np.concatenate((pure_encoder(x),num_enc(x)), axis = 0)

i = 0
for n_cliques in range(4,6):
  for n_cycles in range(3,6):
      for n_trees in range(2, 6):
        TU_graph_data(i, encoder, file_name,
                 n_cliques = n_cliques, n_cycles = n_cycles, n_trees = n_trees)
        i+=1


100%|██████████| 4110/4110 [00:00<00:00, 69584.52it/s]

100%|██████████| 4110/4110 [00:00<00:00, 117385.90it/s]

  0%|          | 0/4110 [00:00<?, ?it/s][A
  2%|▏         | 64/4110 [00:00<00:06, 639.02it/s][A
  3%|▎         | 128/4110 [00:00<00:07, 516.88it/s][A
  4%|▍         | 182/4110 [00:00<00:07, 525.87it/s][A
  6%|▌         | 237/4110 [00:00<00:07, 528.89it/s][A
  7%|▋         | 291/4110 [00:00<00:07, 516.74it/s][A
  8%|▊         | 346/4110 [00:00<00:07, 524.76it/s][A
 10%|▉         | 401/4110 [00:00<00:06, 530.16it/s][A
 11%|█         | 455/4110 [00:00<00:07, 517.92it/s][A
 12%|█▏        | 512/4110 [00:00<00:06, 532.88it/s][A
 14%|█▍        | 568/4110 [00:01<00:06, 540.88it/s][A
 15%|█▌        | 623/4110 [00:01<00:06, 511.52it/s][A
 16%|█▋        | 675/4110 [00:01<00:06, 505.41it/s][A
 18%|█▊        | 732/4110 [00:01<00:06, 521.16it/s][A
 19%|█▉        | 785/4110 [00:01<00:06, 521.30it/s][A
 20%|██        | 838/4110 [00:01<00:06, 507.17it/s][A
 22%|██▏       | 8

In [None]:
# load single experiment Lag
def load_data_nci1(i, file_name):
  X = np.load(file_name+f'{i}_X.npy')
  y = np.load(file_name+f'{i}_y.npy').reshape(4110,)
  nums = np.load(file_name+f'{i}_nums.npy')
  return {'X': X, 'y': y, 'nums': nums}

In [None]:
# Lagrangian load data:
num_of_exps = 9
file_name = 'Experiments/NCI1/lagrangian_aug/experiment_'
X_list_lag = [load_data_nci1(i, file_name)['X'] for i in range(num_of_exps)]
y_list_lag = [load_data_nci1(i, file_name)['y'] for i in range(num_of_exps)]
nums_list_lag = [load_data_nci1(i, file_name)['nums'] for i in range(num_of_exps)]

In [None]:
X_list_lag[1].shape, y_list_lag[0].shape, nums_list_lag[0].shape

In [None]:
# ghc load data:
num_of_exps = 5
file_name = 'Experiments/NCI1/ghc/experiment_'
X_list_ghc = [load_data_nci1(i, file_name)['X'] for i in range(num_of_exps)]
y_list_ghc = [load_data_nci1(i, file_name)['y'] for i in range(num_of_exps)]
nums_list_ghc = [load_data_nci1(i, file_name)['nums'] for i in range(num_of_exps)]

In [None]:
# lagrangian:
X_list = X_list_lag
y_list = y_list_lag
nums_list = nums_list_lag

In [None]:
# ghc:
X_list = X_list_ghc
y_list = y_list_ghc
nums_list = nums_list_ghc

In [None]:
# 
components = [100 if i>1 else 30 for i in range(num_of_exps)]
base_clf = RandomForestClassifier(random_state=42)
#clf_list = [make_pipeline(StandardScaler(), PCA(n_components = n),
 #               base_clf) for n in components]
clf_list = [make_pipeline(StandardScaler(), base_clf) for n in components]

cv_scores = [calculate_cv_scores(clf_list[i], X_list[i], y_list[i], 10)
              for i in range(num_of_exps)]

In [None]:
experiment_name = 'lagrangian_aug/cv_scores_RF_no_PCA.npy'
scors_arr = np.zeros((num_of_exps, 2))
for i, scores in enumerate(cv_scores):
  mean = scores.mean()
  std = scores.std()
  scors_arr[i] = np.array([mean, std])
  print(f'Err = {mean:.2f}' + u"\u00B1" + f'{std:.2f} \n')

np.save('Experiments/NCI1/'+experiment_name, scors_arr)

Err = 0.65±0.03 

Err = 0.69±0.04 

Err = 0.72±0.04 

Err = 0.74±0.04 

Err = 0.74±0.04 

Err = 0.75±0.04 



In [None]:
expr = 3
clf = clf_list[expr]
X = X_list[expr]
y = y_list[expr]
calculate_single_split_score(clf, X, y, cv_num = 10)

{'test_score': 0.8054474708171206, 'train_score': 0.9967553536664504}

In [None]:
#pca = PCA(n_components = 100)
#pca.fit(X_train, y_train)

#X_train_new = pca.transform(X_train)
#X_test_new = pca.transform(X_test)

#clf = make_pipeline(StandardScaler(), SVC(random_state=42, C = 100))
#clf = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42))
# fit model
#clf.fit(X_train_new, y_train)

#train_score = clf.score(X_train_new, y_train)
#test_score = clf.score(X_test_new, y_test)