In [1]:
from pathlib import Path
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from NegativeClassOptimization import ml

In [2]:
def number_2_partitions(n):
    """All possible partitions of a number n into two values.
    This is used to allocate the number of neurons between two layers of a neural network."""

    partitions = [(n-i, i) for i in range(2, n//2+1)]
    return partitions

In [3]:
all_neurons_combination = []
for total_neurons in [5,10,20]:
    all_neurons_combination += number_2_partitions(total_neurons)
all_neurons_combination += [(b,a) for a,b in all_neurons_combination]

In [4]:
np.random.seed(0)
all_neurons_combination = np.array(all_neurons_combination)
np.random.shuffle(all_neurons_combination)
neurons_combination = all_neurons_combination[:14]


In [5]:
neurons_combination

array([[ 7,  3],
       [ 5, 15],
       [ 2,  3],
       [ 4,  6],
       [18,  2],
       [12,  8],
       [ 7, 13],
       [10, 10],
       [ 3, 17],
       [ 8, 12],
       [ 3,  7],
       [ 8,  2],
       [13,  7],
       [10, 10]])

In [6]:
ml.DNN(num_hidden_units=neurons_combination[0], input_dim=67*100, activation_function="relu")

DNN(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_1): Linear(in_features=6700, out_features=7, bias=True)
  (activation): ReLU()
  (linear_2): Linear(in_features=7, out_features=3, bias=True)
  (linear_3): Linear(in_features=3, out_features=1, bias=True)
  (final): Sigmoid()
)

In [19]:
#maybe I will split the function into several but later:) 
def run_optimization(target, task, split=0, epochs=10, batch_size=8, momentum=0.9):
        path_to_target = Path('./data/processed') / target
        path_vs_task_split = path_to_target / task / f'split_{split}'
        path_to_train = path_vs_task_split / 'train.pkl'
        df = pd.read_pickle(path_to_train)
        X = np.array(df["X"].tolist())
        y = np.array(df["Y_binary"].tolist())

        kfl = KFold(n_splits=3, shuffle=True, random_state=42)
        param_search = []

        for neurons, activation, learning_rate in param_combinations:
                model = ml.DNN(num_hidden_units=neurons, input_dim=input_dim, activation_function=activation)
                fold_metrics = []
                for train_index, val_index in kfl.split(X):  # k-fold loop, we will average over it
                        X_train = X[train_index]
                        y_train = y[train_index]
                        X_val = X[val_index]
                        y_val = y[val_index]

                        X_tensor_train = torch.tensor(X_train, dtype=torch.float32)
                        y_tensor_train = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
                        X_tensor_val = torch.tensor(X_val, dtype=torch.float32) =
                        y_tensor_val = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)

                        dataset_train = TensorDataset(X_tensor_train, y_tensor_train)
                        dataset_val = TensorDataset(X_tensor_val, y_tensor_val)

                        train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True) #we need to be able to change batch size
                        val_loader = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)

                        train_output = ml.train_for_ndb1(
                                epochs,
                                learning_rate,
                                train_loader=train_loader,
                                test_loader=val_loader,
                                model=model,
                                optimizer_type=optimizer_type,
                                momentum=momentum,
                                swa=True,
                                open_loader=None,
                        ) #we need to be able to change epochs

                        swa_model, model, online_metrics = train_output

                        acc = online_metrics[-1]["test_metrics"]["acc_closed"]
                        fold_metrics.append(acc)

                avg_metric = sum(fold_metrics) / len(fold_metrics)
                param_search.append((neurons, activation, learning_rate, avg_metric))

        return param_search


In [8]:
targets_balanced_list = ['Q96GD4', 'P49841', 'Q13627', 'P06239', 'Q13464', 'P11309']
tasks = ['vs_Weak', 'vs_all', 'vs_Non-binder']

param_combinations = list(itertools.product(neurons_combination, ["relu"], [0.001])) #it doesn't make sence not to give it to the function
input_dim=67*100
optimizer_type = "Adam"

ag_task_wide_dnn_params = []
for target in tqdm(targets_balanced_list, desc='Target'):
    for task in tqdm(tasks, desc='Task'):
        param_search = run_optimization(target, task, split=0, epochs=10, batch_size=8, momentum=0.9)
        df = pd.DataFrame(param_search, columns=['neurons', 'activation', 'learning_rate', 'accuracy'])
        df['task'] = task
        df['target'] = target
        ag_task_wide_dnn_params.append(df)
df_dnn_params = pd.concat(ag_task_wide_dnn_params)

Target:   0%|          | 0/6 [00:00<?, ?it/s]
Task:   0%|          | 0/3 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [16]:
df_dnn_params['neurons'] = df_dnn_params.neurons.astype(str)

In [17]:
df_dnn_params.groupby(['task', 'neurons']).mean().sort_values("accuracy", ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,learning_rate,accuracy
task,neurons,Unnamed: 2_level_1,Unnamed: 3_level_1
vs_Non-binder,[ 7 13],0.001,0.855271
vs_Non-binder,[ 8 12],0.001,0.853353
vs_Non-binder,[ 3 17],0.001,0.85104
vs_Non-binder,[ 5 15],0.001,0.85004
vs_Non-binder,[10 10],0.001,0.848275
vs_Non-binder,[4 6],0.001,0.84667
vs_Non-binder,[12 8],0.001,0.84648
vs_Non-binder,[13 7],0.001,0.84473
vs_Non-binder,[3 7],0.001,0.838771
vs_Weak,[7 3],0.001,0.819353


In [None]:
#which plots to plot for tomorrow
#how I choose numbe rof neurons. boxplot by 