<a href="https://colab.research.google.com/github/dkgithub/wiehl24/blob/main/skorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Working with PyTorch can become involved. There are many tool that try to avoid writimg out all the litlle details.
Most commonly used is lighning. Here, we use skorch. It provides a Keras like interface that interacts smoothly with sklearn.

In [49]:
#!rm -rf helpers # if an enforce reinstall is necessary
![ ! -d helpers ] && git clone --recursive https://github.com/dkgithub/erum_ml_school_helpers helpers
!pip install wget



In [50]:
!pip install wget torchinfo skorch livelossplot



In [51]:
# load the helpers package and other software
import helpers as hlp
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import torch
import torchinfo
import skorch as sk
from livelossplot import PlotLosses

In [53]:
#check for accelerators
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('torch',torch.__version__)
print('device type is',device)
if device == 'cuda' :print(torch.cuda.get_device_name())
from os import environ
if "COLAB_TPU_ADDR" in environ and environ["COLAB_TPU_ADDR"]:
  print("A TPU is connected.")


torch 2.1.0+cu121
device type is cpu


In [54]:
# first, we define a preprocessing function that (e.g.) takes the
# constiuents and returns another representation of them

#def preprocess_constituents(constituents):
#    return constituents[:, :120].reshape((-1, 480))

def preprocess_constituents(constituents):
  # sum all constituents to get jet 4-momenta
  c_sum=constituents.sum(axis=1)
  metric=np.array([1.,-1.,-1.,-1.]) #g_mu_nu
  # calculating invariants wrt. to jet
  c_inv=(constituents*metric*c_sum[:,None,:]).sum(axis=2)
  return c_inv



In [55]:
def getData(name=None,nFiles=None):
  if name not in ['train','valid','test']:
    print(f'Need a proper data split name')
    return
  if name == 'train' and nFiles == None: nFiles = 2
  else: nFiles = 1
  c_vectors, _, labels = hlp.data.load(name, stop_file=nFiles)
  # run the preprocessing
  c_vectors = preprocess_constituents(c_vectors)
  # create torch tensors from numpy arrays, map to float32,
  c_tensor      = torch.from_numpy(c_vectors).float()
  label_tensor  = torch.from_numpy(labels   ).float()
  print(f'Data {name} - length \t{len(c_tensor)} \tshape {c_tensor.shape}' )
  return c_tensor,label_tensor


In [None]:
# here, we define a function to construct the datasets
def makeDataset(name=None,nFiles=None):
  if name not in ['train','valid','test']:
    print(f'Need a proper data split name')
    return
  if name == 'train' and nFiles == None: nFiles = 2
  else: nFiles = 1
  c_vectors, _, labels = hlp.data.load(name, stop_file=nFiles)
  # run the preprocessing
  c_vectors = preprocess_constituents(c_vectors)
  # create torch tensors from numpy arrays, map to float32,
  # and move to GPU if available - device must be defined
  c_tensor      = torch.from_numpy(c_vectors).float().to(device)
  label_tensor  = torch.from_numpy(labels   ).float().to(device)
  # Then, we create a dataset from our tensors
  print(f'dataset {name} \tlength',len(label_tensor),'\tshape',c_tensor.shape)
  return torch.utils.data.TensorDataset(c_tensor,label_tensor)

In [56]:
data_train, label_train = getData('train')
data_valid, label_valid = getData('valid')
data_test,  label_test  = getData('test')


Data train - length 	100000 	shape torch.Size([100000, 200])
Data valid - length 	50000 	shape torch.Size([50000, 200])
Data test - length 	50000 	shape torch.Size([50000, 200])


In [59]:
# we construct a network
from torch import nn
class myModel(nn.Module):
  def __init__(self,in_size=200,mid_size=200,n_layers=5):
    super().__init__()
    self.in_size  = in_size
    self.mid_size = mid_size
    self.n_layers = n_layers
    self.inLay    = nn.Linear(in_size,mid_size)
    self.linears  = nn.ModuleList([nn.Linear(mid_size, mid_size) for i in range(n_layers)])
    self.bnorms   = nn.ModuleList([nn.BatchNorm1d(mid_size) for i in range(n_layers)])
    self.outLay   = nn.Linear(mid_size, 1)

  def forward(self, x):
    x = self.inLay(x)
    x = torch.relu(x)
    # ModuleList can act as an iterable, or be indexed using ints
    for i,lay in enumerate(self.linears):
      x = lay(x)
      self.bnorms[i](x)
      x = torch.relu(x)
    x = self.outLay(x)
    x = torch.sigmoid(x)
    return x

# we initiate the model
model=myModel()

Skorch works with callbacks. Callbacks are called at certain points in the processing loop. Especially: epoch start,epoch end, batch start and batch end. Most common callbacks, e.g. scoring are predefined.

In [44]:
from skorch.callbacks import EpochScoring,EpochTimer
auc = EpochScoring(scoring='roc_auc',  lower_is_better=False)
acc = EpochScoring(scoring='accuracy', lower_is_better=False)

Skorch likes to create train and valid split internally but we already have our data splitted. There is a helper function for this *situation*

In [None]:
from skorch.helper import predefined_split

net = NeuralNet(
    ...,
    train_split=predefined_split(valid_ds)
)
net.fit(train_ds)

Next, we define our classifier

In [46]:
from skorch import NeuralNetClassifier
net = NeuralNetClassifier(
    model,
    criterion=nn.BCELoss,
    lr=0.01,
    train_split=predefined_split(valid_ds)
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
    max_epochs=10,
    callbacks=[LivePlot],
    callbacks=[acc,auc],
    device=device
)

In [47]:
net.fit(data_train, label_train)

  epoch    accuracy    roc_auc    train_loss    valid_acc    valid_loss     dur
-------  ----------  ---------  ------------  -----------  ------------  ------
      1      [36m0.8828[0m     [32m0.9149[0m        [35m0.5071[0m       [31m0.8828[0m        [94m0.4289[0m  5.9792
      2      [36m0.8853[0m     [32m0.9220[0m        [35m0.4129[0m       [31m0.8853[0m        [94m0.3834[0m  5.5010
      3      0.8842     [32m0.9230[0m        [35m0.3771[0m       0.8842        [94m0.3627[0m  5.4049
      4      [36m0.8915[0m     [32m0.9284[0m        [35m0.3576[0m       [31m0.8915[0m        [94m0.3399[0m  6.3148
      5      0.8898     [32m0.9298[0m        [35m0.3421[0m       0.8898        [94m0.3298[0m  5.7106
      6      0.8898     [32m0.9302[0m        [35m0.3318[0m       0.8898        [94m0.3228[0m  6.0488
      7      [36m0.8998[0m     [32m0.9322[0m        [35m0.3224[0m       [31m0.8998[0m        [94m0.3101[0m  5.1378
      8      0.8981

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=myModel(
    (inLay): Linear(in_features=200, out_features=200, bias=True)
    (linears): ModuleList(
      (0-4): 5 x Linear(in_features=200, out_features=200, bias=True)
    )
    (bnorms): ModuleList(
      (0-4): 5 x BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (outLay): Linear(in_features=200, out_features=1, bias=True)
  ),
)

In [33]:
data_train.shape,label_train.shape

(torch.Size([100000, 200]), torch.Size([100000]))