In [1]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utilities import *
import torch
import torch.nn as nn
import torch.optim as optim
import awkward as ak

In [13]:
class KernelSVM(nn.Module):
    def __init__(self, input_size=9801, output_size=1):
        super(KernelSVM, self).__init__()
        # The input_size here matches the flattened kernel size
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x1, x2):
        # Ensure x1 and x2 have the correct shape for an outer product
        x1 = x1.view(-1)   # Shape (99,)
        x2 = x2.view(-1)   # Shape (99,)

        # Compute the outer product (quadratic kernel) and square it
        kernel = torch.ger(x1, x2) ** 2  # Shape (99, 99)

        # Flatten the kernel to match the linear layer input size
        kernel_flat = kernel.view(-1)  # Shape (9801,)
        
        # Pass the flattened kernel through the linear layer
        return self.linear(kernel_flat)



In [11]:
DFs = import_data_files(["l1calo_hist_ZMUMU_extended.root","l1calo_hist_EGZ_extended.root"])

In [4]:
def split_data(data, labels, train_ratio=0.8):
    """
    Splits the data and labels into training and testing sets.

    Args:
        data (torch.Tensor): The input data tensor.
        labels (torch.Tensor): The corresponding labels tensor.
        train_ratio (float, optional): The ratio of data to use for training. Defaults to 0.8.

    Returns:
        tuple: A tuple of four tensors: (train_data, train_labels, test_data, test_labels)
    """

    # Determine the number of training samples
    num_train_samples = int(train_ratio * len(data))

    # Split the data and labels into training and testing sets
    train_data, test_data = torch.utils.data.random_split(data, [num_train_samples, len(data) - num_train_samples])
    train_labels, test_labels = torch.utils.data.random_split(labels, [num_train_samples, len(labels) - num_train_samples])
    print("Data type of training data:", train_data.dataset.dtype)
    print("Data type of training labels:", train_labels.dataset.dtype)
    print("Data type of testing data:", test_data.dataset.dtype)
    print("Data type of testing labels:", test_labels.dataset.dtype)
    return train_data, train_labels, test_data, test_labels

In [5]:
def hinge_loss(output, target):
    loss = torch.clamp(1 - output.t() * target, min=0)
    return torch.mean(loss)

In [6]:
def prepare_data(DFs):
    print("data loaded from files")
    print("converting data to numpy array...")
    accepted_numpy = ak.to_numpy(DFs[0]['SuperCell_ET'])
    rejected_numpy = ak.to_numpy(DFs[1]['SuperCell_ET'])
    print(accepted_numpy)
    print("converting data to torch tensor format...")
    accepted_tensor  = torch.tensor(accepted_numpy)
    rejected_tensor  = torch.tensor(rejected_numpy)
    print("generating labels...")
    accepted_labels = torch.zeros(len(accepted_tensor), dtype=torch.long)
    rejected_labels = torch.ones(len(rejected_tensor), dtype=torch.long)

    print("concatenating accepted and rejected torch tensor datasets...")
    # Concatenate samples and labels
    data = torch.cat([accepted_tensor, rejected_tensor], dim=0)
    labels = torch.cat([accepted_labels, rejected_labels], dim=0)

    print("shuffling data...")
    # Shuffle data and labels together
    indices = torch.randperm(len(data))
    data = data[indices]
    labels = labels[indices]

    # need to split and reorder this so that the labels match
    
    print("splitting data...")
    train_data, train_labels, test_data, test_labels = split_data(data, labels, train_ratio=0.8)

    training_info = {"data":train_data,"labels":train_labels}
    testing_info = {"data":test_data,"labels":test_labels}

    return training_info, testing_info

training_info, testing_info = prepare_data(DFs)

data loaded from files
converting data to numpy array...
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.52499998 0.55000001 ... 0.         0.         0.        ]]
converting data to torch tensor format...
generating labels...
concatenating accepted and rejected torch tensor datasets...
shuffling data...
splitting data...
Data type of training data: torch.float64
Data type of training labels: torch.int64
Data type of testing data: torch.float64
Data type of testing labels: torch.int64


In [15]:
def train_model(training_data):
    # Hyperparameters
    input_size = 9801
    output_size = 1  # Binary classification
    learning_rate = 0.01
    num_epochs = 10

    # Sample data (replace with your actual data)
    X = training_data["data"].dataset.float()  # Ensuring float dtype for model compatibility
    y = training_data["labels"].dataset.float()

    # Create the model
    model = KernelSVM(input_size=input_size, output_size=output_size)

    # Define the optimizer
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(num_epochs):
        for i in range(len(X)):
            # Forward pass
            output = model(X[i].unsqueeze(0), X[i].unsqueeze(0))  # Ensuring each sample is passed with compatible shapes

            # Calculate loss
            loss = hinge_loss(output, y[i])

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
    
    torch.save(model.state_dict(), 'svm_model.pth')
    return model


In [16]:
model_data = train_model(training_info)

Epoch [1/10], Loss: 1.0000
Epoch [2/10], Loss: 1.0000
Epoch [3/10], Loss: 1.0000
Epoch [4/10], Loss: 1.0000
Epoch [5/10], Loss: 1.0000
Epoch [6/10], Loss: 1.0000
Epoch [7/10], Loss: 1.0000
Epoch [8/10], Loss: 1.0000
Epoch [9/10], Loss: 1.0000
Epoch [10/10], Loss: 1.0000
